From 1dec7d42689f6aaaac7642f4d5a5b7a9ed12c109 Mon Sep 17 00:00:00 2001 From: "annika.fust@mpi-bn.mpg.de" Date: Mon, 18 Mar 2019 08:51:00 +0100 Subject: [PATCH] version update --- uropa/annotation.py | 693 +++++++++++++------------------------- uropa/config.py | 209 ------------ uropa/overlaps.py | 461 -------------------------- uropa/uropa.py | 751 ++++++++++++++++++++---------------------- uropa/utils.py | 243 ++++++++++++++ utils/uropa_summary.R | 145 ++++---- 6 files changed, 917 insertions(+), 1585 deletions(-) delete mode 100644 uropa/config.py delete mode 100644 uropa/overlaps.py create mode 100644 uropa/utils.py diff --git a/uropa/annotation.py b/uropa/annotation.py index 01c3532..aa9b4fb 100644 --- a/uropa/annotation.py +++ b/uropa/annotation.py @@ -1,463 +1,234 @@ #!/usr/bin/env python3 -# coding: utf-8 -"""Contains code for annotation process.""" -import sys -import operator -import numpy as np + +"""Contains code for the annotation process""" import pysam -from . import overlaps as ovls - - -def annotation_process(input_args, peak_file, log=None): - """Contains main logic for the annotation process.""" - [outdir, gtf_index, attrib_k, queries, max_distance, - priority, gtf_has_chr] = input_args - - try: - anno = pysam.TabixFile(gtf_index) - except IOError: - log.error("Unable to open tabix index {}.".format(gtf_index)) - except ValueError: - log.error("Tabix index file is missing ({})".format(gtf_index)) - - try: - prefix_pk = peak_file.split("_peak_")[1] - except IndexError: - # In case peaks file is small and not splitted - prefix_pk = "" - - with open(peak_file, 'r') as peaks: - Best_hits_tab = dict() - All_hits_tab = dict() - min_dist = dict() - - from collections import OrderedDict - for j in range(len(queries)): - min_dist[j] = dict() - Best_hits_tab[j] = OrderedDict() - All_hits_tab[j] = OrderedDict() - - counter = 0 - for p in peaks: - counter += 1 - - peak = ovls.parse_peak(p, extend=max_distance) - if peak is None: - if log is not None: - log.warning("Skipping invalid peak '{}'".format(p)) - continue - #peak['id'] = peak['name'] if peak[ - # 'name'] is not None else peak['id'] - # Re-initialise table records for each peak - for q in enumerate(queries): - - All_hits_tab[q[0]][peak['id']] = "" - Best_hits_tab[q[0]][peak['id']] = "" - - q[1]["distance"] = [abs(int(d)) for d in q[1]["distance"]] - if len(q[1]["distance"]) == 2: - min_dist[q[0]][peak['id']] = [q[1]["distance"], list()] - else: - min_dist[q[0]][peak['id']] = [q[1]["distance"][0], list()] - - # Run TABIX - chrom_db = ovls.peak_has_chr(peak['chr'], gtf_has_chr) - tabix_query = chrom_db + ":" + \ - str(peak['estart']) + '-' + str(peak['eend']) - - try: - hits = anno.fetch(tabix_query, multiple_iterators=True) - except ValueError: - hits = list() - if log is not None: - log.warning("Region {} could not be found in GTF annotation for peak {}. Tabix query was: {}:{}-{}".format( - chrom_db, peak['id'], chrom_db, str(peak['estart']), str(peak['eend']))) - - has_hits = list() - # keep array of all valid queries (T,F) for all hits in the peak. 
- valid_q_per_peak = list() - dict_vqp = dict() - - # Collection and analysis of all hits for the peak - for hit in hits: # All overlapping features - hit = hit.split("\t") # from "anno.fetch" hit is a string - h = {'feature': hit[2], 'strand': hit[6]} - hit_center = np.mean([int(hit[3]), int(hit[4])]) - - dist_from_peak = min( - abs(int(peak['estart']) - int(hit[3])), - abs(int(peak['start']) - int(hit[3])), - abs(int(peak['eend']) - int(hit[3])), - abs(int(peak['end']) - int(hit[3])), - abs(int(peak['eend']) - int(hit[4])), - abs(int(peak['end']) - int(hit[4])), - abs(int(peak['estart']) - int(hit[4])), - abs(int(peak['start']) - int(hit[4])), - abs(peak['center'] - hit_center), - abs(peak['center'] - int(hit[3])), - abs(peak['center'] - int(hit[4]))) - # Verify that hit is valid at least for one of the distances of - # queries - valid_dist = any([dist_from_peak <= int( - max(q["distance"])) for q in queries]) - - # Keep hit if internals are required in any query - i_want_internals = any([q["internals"] in [ - ['T'], ['True'], ['TRUE'], ['Yes'], ['YES'], ['Y'], ['yes'], ['F'], ['False'], ['FALSE'], ['No'], ['NO'], ['N'], ['no']] for q in queries]) - - if (i_want_internals) and not valid_dist: - # Keep if gene inside the peak even if Dist > config.D - feat_in_peak = (int(peak['start']) < int( - hit[3]) and int(hit[4]) < int(peak['end'])) - peak_in_feat = (int(hit[3]) < int( - peak['start']) and int(peak['end']) < int(hit[4])) - - if feat_in_peak or peak_in_feat: - valid_dist = True - - # Find hits with valid values for the queries (Search all - # queries ,Not only PRIORITY) - v_fsa = [ovls.valid_fsa(h, hit, q, peak['strand']) for q in queries] - # Pair the Valid query values(fsb) with valid_distance and - # valid strand for each query - vsd = [[x, valid_dist] for x in v_fsa] - valid_queries = [i for i, v in enumerate(vsd) if all(v)] - hitj = "\t".join(hit) - - # Create dictionary of hit with its valid query per peak - if valid_queries: - has_hits.append(True) - valid_q_per_peak.append(valid_queries) - dict_vqp.update({hitj: valid_queries}) - - if not valid_queries: - has_hits.append(False) - - # ----- All hits parsed. Check only hits with valid-queries now ----- # - # log.debug("\n---All hits parsed for the peak. 
Totally, the Peak has {} hits from which {} are Valid ---".format( len(has_hits), has_hits.count(True) )) - # 9 cols= feat, f.start, f.end, f.strand, dist, min_pos, genom_loc, - # feat-to-peak-ovl , peak-to-feat-ovl (no Q) - nas_len = len(attrib_k) + 9 - NAsList_q = list(np.repeat("NA", nas_len)) - - # Search hits with only Prior or Secondary Queries - for hitj, vq in list(dict_vqp.items()): - hit = hitj.split("\t") - hit_len = abs(int(hit[4]) - int(hit[3])) - strand = hit[6] - for j in vq: - # Check DIRECTION before Closest_Distance, if given - if queries[j]["direction"] != ['any_direction']: - correct_dir = ovls.define_direction(queries[j]["direction"], int(hit[3]), int( - hit[4]), strand, peak['length'], int(peak['start']), int(peak['end']), peak['center']) - #logger.debug("\nFor the hit {}-{}, direction is correct : {}".format(hit[3],hit[4] ,correct_dir)) - - if correct_dir: - # IF direction of hit is the desired, then filter - # for Distance - min_pos, Dhit = ovls.distance_to_peak_center(peak['center'], int( - hit[3]), int(hit[4]), strand, queries[j]["feature.anchor"]) - #logger.debug("\nClosest Distance from the peak center : {} ".format(Dhit)) - - if not correct_dir and Best_hits_tab[j][peak['id']] == "" and All_hits_tab[j][peak['id']] == "": - # hit with Incorrect Dir: move to new peak, fill in NAs if peak is empty - #log.debug("\nThe hit doesn't have the correct direction -> NAs in All & Best hits table -> Continue to New hit.") - All_hits_tab[j][peak['id']] = "\t".join(np.hstack([peak['name'], peak['chr'], peak[ - 'start'], peak['center'], peak['end'], NAsList_q, str(j) + "\n"])) - Best_hits_tab[j][peak['id']] = "\t".join(np.hstack([peak['name'], peak['chr'], peak[ - 'start'], peak['center'], peak['end'], NAsList_q, str(j) + "\n"])) - continue # Move to next valid query and then next hit - - # When direction NOT correct but tables not empty. - elif not correct_dir and Best_hits_tab[j][peak['id']] != "" and All_hits_tab[j][peak['id']] != "": - continue - - elif queries[j]['direction'] == ['any_direction']: - min_pos, Dhit = ovls.distance_to_peak_center(peak['center'], int( - hit[3]), int(hit[4]), strand, queries[j]["feature.anchor"]) - #log.debug("\nClosest Distance(Dhit) when No Direction defined :{} ".format(Dhit)) - - # > Check OVERLAPS of peak-hit, define genomic_location - genomic_location = "not.specified" - genomic_location, ovl_pk, ovl_feat = ovls.overlap_peak_feature(genomic_location, int( - peak['start']), peak['center'], int(peak['end']), peak['length'], hit_len, int(hit[3]), int(hit[4]), strand) - - # > After Direction is checked move on to check Distance hit <--> peak - same_gene = all(x in min_dist[j][peak['id']][1] for x in [hit[2], hit[3], hit[ - 4], hit[6]]) # next hit shouldn't have same {feat,start,end} - internals_location = list() - if "Inside" in genomic_location: - # > Check internals direction for defining Bi-direct. 
distance window - internals_location = [ovls.find_internals_dir(a, peak['center'], int(hit[3]), int(hit[4]), strand) for a in queries[j]["feature.anchor"]] - - if len(queries[j]["distance"]) == 2: - d_is_best = ovls.get_distance_by_dir([min_dist[j][peak['id']][0][0], min_dist[ - j][peak['id']][0][1]], genomic_location, internals_location, Dhit, queries[j]["internals"]) - if len(queries[j]["distance"]) == 1: - d_is_best = ovls.get_distance_by_dir([min_dist[j][peak['id']][0], min_dist[j][ - peak['id']][0]], genomic_location, internals_location, Dhit, queries[j]["internals"]) - #log.debug('Called with {} (q {}): {}, {}, {}, {}'.format(peak['id'], j, min_dist[j][peak['id']][0], genomic_location, internals_location, Dhit)) - #log.debug('Returned {} {}'.format(peak['id'], d_is_best)) - - # print "Best distance found :{}".format(d_is_best) - if d_is_best and not same_gene: # Dhit < Dbest - best_hit_dist = min_dist[j][peak['id']][0] - current_overlap = ovls.get_besthit(j, len(queries[j]["distance"]), peak['id'], hit, attrib_k, Dhit) - #log.debug("best_hit_dist is {}".format(best_hit_dist)) - #log.debug("current_overlap dist is {}".format(current_overlap[0])) - - if current_overlap[0] <= best_hit_dist: - #log.debug("I just confirmed that {} is smaller than {}, or internals T".format(current_overlap[0], best_hit_dist)) - min_dist[j][peak['id']] = current_overlap - Best_res = ovls.create_table(peak['name'], peak['chr'], peak['start'], peak['end'], str(peak['center']), min_dist[j][peak['id']], attrib_k, min_pos, genomic_location, ovl_pk, ovl_feat, j) - Best_hits_tab[j][peak['id']] = Best_res - - # Dhit > Dbest - # only if empty fill in with NAs - elif not d_is_best and not same_gene and Best_hits_tab[j][peak['id']] == "": - #log.debug("\nDistance of Hit from Peak Center = {}, LARGER THAN current Min Distance = {}\n".format(Dhit, min_dist[j][peak['id']][0])) - Best_hits_tab[j][peak['id']] = "\t".join(np.hstack([peak['name'], peak['chr'], peak[ - 'start'], str(peak['center']), peak['end'], NAsList_q, str(j) + "\n"])) - #log.debug("min_dist is {}".format(min_dist[j][peak['id']])) - - # EVERY HIT getting registered in Allhits file if Dhit < - # query.distance - if len(queries[j]["distance"]) > 1: - Dhit_smaller = ovls.get_distance_by_dir([queries[j]["distance"][0], queries[j][ - "distance"][1]], genomic_location, internals_location, Dhit, queries[j]["internals"]) - else: - Dhit_smaller = ovls.get_distance_by_dir([queries[j]["distance"][0], queries[j][ - "distance"][0]], genomic_location, internals_location, Dhit, queries[j]["internals"]) - - if Dhit_smaller and not same_gene: - #log.debug("Distance of hit SMALLER than Config Distance -> Hit written to All_Hits.") - All_hits_tab[j][peak['id']] = ovls.write_hit_to_All(All_hits_tab, peak['id'], attrib_k, Dhit, hit, peak['name'], peak['chr'], peak['start'], - peak['end'], str(peak['center']), min_pos, genomic_location, ovl_pk, ovl_feat, j) - - # Hit not fitting in any of the distance values - if not Dhit_smaller and queries[j]["internals"] != ["True"] and (All_hits_tab[j][peak['id']] == ''): - #log.debug("\nDhit= {} is LARGER than config Distance:{}.".format(Dhit, queries[j]["distance"])) - # Write over empty or NA the new NA, for new hit,so - # each hit is parsed. 
- All_hits_tab[j][peak['id']] = "\t".join(np.hstack([peak['name'], peak['chr'], peak[ - 'start'], str(peak['center']), peak['end'], NAsList_q, str(j) + "\n"])) - - # Hits with further Distance -> Check for Internals - if not Dhit_smaller and queries[j]["internals"] == ["True"] and not same_gene: - if "Inside" in genomic_location: - #log.debug("\n-> Hit is 'Internal' with distance LARGER than config.distance.It will be recorded to the All_hits table.") - All_hits_tab[j][peak['id']] = ovls.write_hit_to_All(All_hits_tab, peak['id'], attrib_k, Dhit, hit, peak['name'], peak['chr'], peak['start'], peak['end'], str(peak['center']), - min_pos, genomic_location, ovl_pk, ovl_feat, j) - - if not Dhit_smaller and All_hits_tab[j][peak['id']] != '': - #log.debug("Hit has larger distance than allowed ") - continue - - # > Add to Best Hits the Best Internal features(when internals=True, D>config."distance"), - # after having filled-in the All Hits for the peak. - for hitj, vq in list(dict_vqp.items()): - for j in vq: - if Best_hits_tab[j][peak['id']] != "" and All_hits_tab[j][peak['id']] != "": - if (Best_hits_tab[j][peak['id']].split("\t")[10] == "NA" and - All_hits_tab[j][peak['id']].split("\t")[10] != "NA" and - queries[j]["internals"] == ["True"]): - - hit_line = All_hits_tab[j][peak['id']].split("\n") - hit_line = [h for h in hit_line if h != ''] - internal = [h.split("\t")[11] for h in hit_line] - hit_dist = [int( - h.split("\t")[10]) for h in hit_line] - mv_pos, min_val = min( - enumerate(hit_dist), key=operator.itemgetter(1)) - - if internal[mv_pos] in ["PeakInsideFeature", "FeatureInsidePeak"] and min_val != []: - Best_hits_tab[j][peak['id']] = hit_line[ - mv_pos] + "\n" - #logger.debug("\nBest Internal hit to be recorded: {}\n".format(hit_line[mv_pos]+"\n")) - - else: # if All_hits_tab[j][peak['id']] == "" : - continue - - # When no hits are valid for one query-> All_hits[q] stays <""> -> - # Get them filled with NAs - for a in range(len(queries)): - if All_hits_tab[a][peak['id']] == "": - #log.debug("\nQuery {} didn't validate any hit ! All_hits will be filled in with NAs".format(a)) - All_hits_tab[a][peak['id']] = "\t".join(np.hstack([peak['name'], peak['chr'], peak[ - 'start'], str(peak['center']), peak['end'], NAsList_q, str(a) + "\n"])) - if Best_hits_tab[a][peak['id']] == "": - #log.debug("\nQuery {} didn't validate any hit !Best hits will be filled in with NAs".format(a)) - Best_hits_tab[a][peak['id']] = "\t".join(np.hstack([peak['name'], peak['chr'], peak[ - 'start'], str(peak['center']), peak['end'], NAsList_q, str(a) + "\n"])) - - # end.for_hit - if not has_hits or True not in has_hits: - # Write the Non Overlaps in the first dict,so it is saved only once. - # log.debug ("\nPeak has no hits ! All queries Tables will be - # filled in with NAs\n") #only to [0] - for j in range(len(queries)): - All_hits_tab[j][peak['id']] = "\t".join(np.hstack([peak['name'], peak['chr'], peak[ - 'start'], str(peak['center']), peak['end'], NAsList_q, str(j) + "\n"])) - Best_hits_tab[j][peak['id']] = "\t".join(np.hstack([peak['name'], peak['chr'], peak[ - 'start'], str(peak['center']), peak['end'], NAsList_q, str(j) + "\n"])) - - # > PRIORITY FLIPPING - # Check for each Line in the Tabs if Query 1 = NA, when Pr= True - # and replace with Secondary Query. 
- if priority and len(queries) > 1: - # empty if only 0 exists - rest_q = [j for j in range(len(queries)) if j != 0] - # ** All_hits_per_peak -> may have lots of hits, sep with \n - - isNA = list() - for i in enumerate(queries): - if All_hits_tab[i[0]][peak['id']]: - isNA.append(All_hits_tab[i[0]][ - peak['id']].split("\t")[9] == "NA") - - QnotNA = [pos for pos, val in enumerate(isNA) if val is False] - Q_NA = [pos for pos, val in enumerate(isNA) if val is True] - - # All have Hit ,and Priority query - if isNA[0] is False or not any(isNA): - for r in rest_q: - All_hits_tab[r][peak['id']] = Best_hits_tab[ - r][peak['id']] = "" - - # 1st Query is NA, there are Sec.Queries. // any(QnotNA) = [] - # when isNA=[True] for one Q - if isNA[0] is True and any(QnotNA): - # Queries with NA: map(lambda qna : - # All_hits_tab[qna][peak['id']], Q_NA) - repl_q = min(QnotNA) # The query to be used instead of qO. - ##log.debug ("Priority Query hasn't given any hit, so it will be replaced by secondary Query's Hit: {} ".format (repl_q)) - All_hits_tab[0][peak['id']] = Best_hits_tab[ - 0][peak['id']] = "" - ##log.debug ("\nHits from Secondary Q to replace priority query: \n{} ".format(All_hits_tab[repl_q][peak['id']] )) - rest_QnotNA = [qn for qn in QnotNA if qn != repl_q] - if any(rest_QnotNA): - for qr in rest_QnotNA: - All_hits_tab[qr][peak['id']] = Best_hits_tab[ - qr][peak['id']] = "" - for qna in Q_NA: - All_hits_tab[qna][peak['id']] = Best_hits_tab[ - qna][peak['id']] = "" - - if all(isNA): - #logg.debug("\nNo query has any Hit.No replacement of Priority Query possible-> NAs will be filled in the Output.") - TabInList_p = [All_hits_tab[l][ - peak['id']] for l in range(len(queries))] - ##log.debug("Hit lines for all queries are : {}".format(TabInList_p )) - # If all_hits_tab doesn't have all queries will have "" - TabInList_p = [Tab for Tab in TabInList_p if Tab != ""] - - for j in enumerate(TabInList_p): - if j[1] == "": - TabInList_p[j[0]] = "\t".join(np.hstack([peak['name'], peak['chr'], peak[ - 'start'], str(peak['center']), peak['end'], NAsList_q, str(j[0]) + "\n"])) - - All_hits_tab[0][ - peak['id']] = ovls.merge_queries(TabInList_p) - Best_hits_tab[0][ - peak['id']] = ovls.merge_queries(TabInList_p) - ##log.debug("\nQueries hits for this peak to be merged in the Q.col:{}".format(ovls.merge_queries(TabInList_p)) ) - for r in rest_q: - All_hits_tab[r][peak['id']] = Best_hits_tab[ - r][peak['id']] = "" - - All_combo = OrderedDict() # Output peaks with same order as All_hits_tab - mydict = All_hits_tab - for k in mydict[0].keys(): - All_combo[k] = [pid[k] for q, pid in list(mydict.items())] - - # Best hits # - Best_combo = OrderedDict() - mybestD = Best_hits_tab - for k in mybestD[0].keys(): - Best_combo[k] = [pid[k] for q, pid in list(mybestD.items())] - - def all_same(items): - """Returns true if all items of a list are the same.""" - return all(x == items[0] for x in items) - - # > Merge hits of queries to have one best, if queries >1 - if len(queries) > 1 and not priority: - BestBest_hits = OrderedDict() - for k in Best_combo: - # [ [] , [] , [] ]-> can be of same query, same distance - records = [s.split("\n") for s in Best_combo[k]] - # split also internally each query's string to see if it - # contains more >1 hits(when same distance, more >1 are conc. 
- # to same query) - spl_rec = [[t if t != '' else None for t in r] for r in records] - recs = [x + "\n" for s in spl_rec for x in s if x != None] - - if len(recs) > 1: # Only when more than two hits, and no NAs here - splitted_hits = [recs[h].split("\t") for h in range(len(recs))] - # s= each hit line in string - splitted_hits = [s for s in splitted_hits if s != [""]] - featOfHit = [splitted_hits[s][ - 5] for s in range(len(splitted_hits))] - startPos = [splitted_hits[s][ - 6] for s in range(len(splitted_hits))] - endPos = [splitted_hits[s][ - 7] for s in range(len(splitted_hits))] - Dist_hits = [float(splitted_hits[s][10]) if splitted_hits[s][ - 10] != "NA" else float("Inf") for s in range(len(splitted_hits))] # 9th col= Distance - # Dist_hit = [Dist_hits[0] if all_same(Dist_hits) else - # Dist_hits] #Either same dist or All NAs - - # When All "NAs" for all queries-> keep only line from 1st - # query - NA_hits = ["NA" if splitted_hits[s][ - 10] == "NA" else None for s in range(len(splitted_hits))] - # If None means ALL queries have a feature - all_have_feat = all(x is None for x in NA_hits) - - same_feat = (all_same(featOfHit) and all_same( - startPos) and all_same(endPos) and all_same(Dist_hits)) - - if len(splitted_hits) > 1: - if all_same(NA_hits): - # BestBest_hits[k] = Best_combo[k][0] #Keep only - # one of the hits wth NA - BestBest_hits[k] = ovls.merge_queries( - Best_combo[k]) - - if (all_have_feat and same_feat) or (not same_feat and all_same(Dist_hits)): - BestBest_hits[k] = ovls.merge_queries( - Best_combo[k]) - - if (all_have_feat) and not same_feat or (not all_same(NA_hits) and not all_have_feat): - # min_pos,min_v = min(enumerate(Dist_hits), - # key=operator.itemgetter(1)) ## If only one min in - # one pos - min_v = min(Dist_hits) - # retrieve the value of the list with [0],should be - # only 1 ! 
- min_pos = [min_pos for min_pos, v in enumerate(Dist_hits) if v == min_v][ - 0] - # Keep to BB the pos with Min_Dist - BestBest_hits[k] = recs[min_pos] - - elif len(splitted_hits) == 1: - BestBest_hits[k] = Best_combo[k] - - # > Write in file - allhits_file = outdir + "allhits_part_" + prefix_pk + ".txt" - - try: - ovls.write_partial_file(allhits_file, All_combo) - except IOError: - if not log is None: - log.error("Unable to open file " + allhits_file + " for writing results!") - sys.exit() - - finalhits_file = outdir + "finalhits_part_" + prefix_pk + ".txt" - if len(queries) > 1 and not priority: - besthits_file = outdir + "besthits_part_" + prefix_pk + ".txt" - ovls.write_partial_file(besthits_file, Best_combo) - ovls.write_partial_file(finalhits_file, BestBest_hits) - else: - ovls.write_partial_file(finalhits_file, Best_combo) - return +from functools import reduce +import re +import numpy as np +import logging + + +def create_anno_dict(peak, hit=None): + """ Returns a dictionary containing information on the hit from gtf """ + + #Initialize dict with NAs + keys = ["feature", "feat_strand", "feat_start", "feat_end", "feat_center", "feat_length", "feat_attributes", "distance", + "feat_anchor", "query", "query_name", "best_hit", "relative_location", "feat_ovl_peak", "peak_ovl_feat"] + anno_dict = {key:"NA" for key in keys} + + #Add peak information + anno_dict.update(peak) #fills out peak chr/start/end/id/score/strand + anno_dict["peak_center"] = int((anno_dict["peak_end"] + anno_dict["peak_start"])/2) + anno_dict["peak_length"] = anno_dict["peak_end"] - anno_dict["peak_start"] + + #If hit was given, parse this as well + if hit is not None: + + #Parse info from gtf string + pairs = re.split(";\s*", hit.attributes) #regex remove 0 to n spaces + pairs.remove("") + attribute_dict = {pair.split()[0]:pair.split()[1].replace("\"", "") for pair in pairs} # parse " of attribute values + + # Fill in with feature info + anno_dict["feature"] = hit.feature + anno_dict["feat_strand"] = hit.strand + anno_dict["feat_start"] = hit.start + anno_dict["feat_end"] = hit.end + anno_dict["feat_center"] = int((anno_dict["feat_end"] + anno_dict["feat_start"])/2) + anno_dict["feat_length"] = anno_dict["feat_end"] - anno_dict["feat_start"] + anno_dict["feat_attributes"] = attribute_dict + + #Look-up keys for annotation + anno_dict["anchor_pos"] = {"start": anno_dict["feat_start"] if anno_dict["feat_strand"] != "-" else anno_dict["feat_end"], + "center": anno_dict["feat_center"], + "end": anno_dict["feat_end"] if anno_dict["feat_strand"] != "-" else anno_dict["feat_start"]} + + #Change with each query + anno_dict["query"] = 0 #query for which the hit was valid + anno_dict["best_hit"] = 0 + + return(anno_dict) + + +def distance_to_peak_center(anno_dict, query_anchor): + """ Assigns the distance of peak center to best query anchor """ + + #Calculate distances to each possible anchor + raw_distances = [anno_dict["peak_center"] - anno_dict["anchor_pos"][anchor] for anchor in query_anchor] + abs_distances = [abs(dist) for dist in raw_distances] + min_dist_i = abs_distances.index(min(abs_distances)) + + #Set minimum distance as best anchor + anno_dict["raw_distance"] = raw_distances[min_dist_i] + anno_dict["distance"] = abs(raw_distances[min_dist_i]) + anno_dict["feat_anchor"] = query_anchor[min_dist_i] + + return(anno_dict) + + +# import "division" allows decimals +def calculate_overlap(anno_dict): + """ Calculates percentage of length covered by the peak/feature """ + + peak_range = list(range(anno_dict["peak_start"], 
anno_dict["peak_end"]))
+    feature_range = list(range(anno_dict["feat_start"], anno_dict["feat_end"]))
+    ovl_range = set(peak_range).intersection(feature_range)
+
+    ovl_pk = round(len(ovl_range) / anno_dict["peak_length"], 3)
+    ovl_feat = round(len(ovl_range) / anno_dict["feat_length"], 3)
+
+    anno_dict["feat_ovl_peak"] = ovl_feat
+    anno_dict["peak_ovl_feat"] = ovl_pk
+
+    return(anno_dict)
+
+
+def get_relative_location(anno_dict):
+    """ Sets the relative location of peak to feature """
+
+    if anno_dict["peak_start"] <= anno_dict["feat_start"] and anno_dict["peak_end"] >= anno_dict["feat_end"]:
+        location = "FeatureInsidePeak"
+
+    elif anno_dict["peak_start"] > anno_dict["feat_start"] and anno_dict["peak_end"] < anno_dict["feat_end"]:
+        location = "PeakInsideFeature"
+
+    elif anno_dict["feat_anchor"] == "start":
+        if anno_dict["feat_ovl_peak"] > 0:
+            location = "OverlapStart"
+        else:
+            location = "Upstream"
+
+    elif anno_dict["feat_anchor"] == "end":
+        if anno_dict["feat_ovl_peak"] > 0:
+            location = "OverlapEnd"
+        else:
+            location = "Downstream"
+    else:
+        location = "NA"
+
+    anno_dict["relative_location"] = location
+
+    return(anno_dict)
+
+
+def annotate_peaks(peaks, gtf_gz, gtf_index, cfg_dict, logger=None):
+    """ Peaks is a list of tuple-elements (chrom, start, end, name, ...)
+        gtf_gz and gtf_index relate to the tabix gtf file
+        cfg_dict is the loaded config containing queries
+    """
+
+    if logger is None:
+        logger = logging.getLogger('')  #local logger leading to nowhere
+
+    #Open tabix file
+    tabix_obj = pysam.TabixFile(gtf_gz, index=gtf_index)
+
+    #Information on queries
+    queries = cfg_dict["queries"]
+    n_queries = len(queries)
+    distances = sum([query["distance"] for query in queries], [])
+    max_distance = int(max(distances))
+
+    #For each peak in input peaks, collect all_valid_annotations
+    all_valid_annotations = []
+    for peak in peaks:
+        logger.debug("Annotating peak: {0}".format(peak))
+
+        #Fetch all possible hits from tabix
+        extend_start = int(max(1, peak["peak_start"] - max_distance))
+        extend_end = peak["peak_end"] + max_distance
+        tabix_query = "{0}:{1}-{2}".format(peak["gtf_chr"], extend_start, extend_end)
+        logger.debug("Tabix query: {0}".format(tabix_query))
+        try:
+            hits = tabix_obj.fetch(peak["peak_chr"], extend_start, extend_end, parser=pysam.asGTF())
+        except ValueError:
+            logger.error("Tabix query {0} failed; the region could not be fetched from the GTF annotation.".format(tabix_query))
+            hits = []
+
+        #Go through each hit from tabix and establish whether they are valid for queries
+        valid_annotations = []
+        stop_searching = False
+        while stop_searching == False:  # Loop will stop when stop_searching is true
+            for hit in hits:
+                logger.debug("Validating hit: {0}".format(hit))
+                anno_dict = create_anno_dict(peak, hit)
+
+                #Validate annotation for each peak query
+                stop_searching = False
+                query_i = -1  #first query will be index 0
+                while query_i+1 < n_queries and stop_searching == False:
+
+                    #Query information
+                    query_i += 1
+                    query = queries[query_i]  #current query to check
+                    anno_dict["query"] = query_i
+                    anno_dict["query_name"] = query["name"]
+
+                    #Calculate distances/relative location
+                    anno_dict = distance_to_peak_center(anno_dict, query["feature_anchor"])
+                    anno_dict = calculate_overlap(anno_dict)
+                    anno_dict = get_relative_location(anno_dict)
+
+                    ##### Test validity of the hit to query_i #####
+                    checks = {}
+                    checks["feature"] = anno_dict["feature"] in query["feature"]  #feature
+
+                    #Check feature anchor
+                    checks["feature_anchor"] = anno_dict["feat_anchor"] in query["feature_anchor"]
+
+                    #Peak strand relative to feature strand
+                    if query["strand"] != "ignore" and anno_dict["peak_strand"] != ".":
+                        checks["strand"] = ((anno_dict["peak_strand"] == anno_dict["feat_strand"] and query["strand"] == "same") or
+                                            (anno_dict["peak_strand"] != anno_dict["feat_strand"] and query["strand"] == "opposite"))
+
+                    #Check whether distance was valid
+                    if anno_dict["feat_strand"] == "+":
+                        checks["distance"] = anno_dict["raw_distance"] in range(-query["distance"][0], query["distance"][1])
+                    else:
+                        checks["distance"] = anno_dict["raw_distance"] in range(-query["distance"][1], query["distance"][0])
+
+                    #Check distance (Distance can still be valid if PeakInsideFeature/FeatureInsidePeak and internals flag is set)
+                    max_overlap = max(anno_dict["feat_ovl_peak"], anno_dict["peak_ovl_feat"])
+                    checks["distance"] = checks["distance"] or (query["internals"]*1.0 > 0 and max_overlap >= query["internals"]*1.0)  #if internals is set to more than 0 overlap
+
+                    #Filter on relative location
+                    checks["relative_location"] = anno_dict["relative_location"] in query["relative_location"]
+
+                    #Filter on attribute if any was set
+                    if query["filter_attribute"] != None:
+                        checks["attribute"] = anno_dict["feat_attributes"].get(query["filter_attribute"], None) in query["attribute_values"]
+
+                    ##### All checks are done -> establish if hit is a valid annotation #####
+                    logger.debug("Query {0} | Checks: {1} | Annotation: {2}".format(query_i+1, checks, {key:anno_dict[key] for key in anno_dict if key != "feat_attributes"}))
+
+                    valid = sum(checks.values()) == len(checks.values())  #all checks must be valid
+                    if valid:
+                        valid_annotations.append(anno_dict.copy())
+
+                    #Stop searching for queries fitting to hits if a hit was found and priority is true
+                    stop_searching = (valid and cfg_dict["priority"])
+
+            #All tabix hits were checked
+            stop_searching = True
+
+        #After all hits have been checked; make final checks and set best flag
+        if len(valid_annotations) > 0:
+
+            #If priority == True, find the highest ranked annotations for this peak
+            if cfg_dict["priority"] == True:
+                highest_priority_query = min([anno_dict["query"] for anno_dict in valid_annotations])
+                logger.debug("Highest priority hit for {0} was {1}".format(peak, highest_priority_query))
+                valid_annotations = [anno_dict for anno_dict in valid_annotations if anno_dict["query"] == highest_priority_query]
+
+            distances = [anno_dict["distance"] for anno_dict in valid_annotations]
+            valid_annotations[distances.index(min(distances))]["best_hit"] = 1
+
+        else:
+            valid_annotations = [create_anno_dict(peak)]  #empty hit
+            valid_annotations[0]["best_hit"] = 1  #the empty hit is the best hit
+
+        #Add result to all the valid annotations
+        all_valid_annotations.extend(valid_annotations)
+
+    tabix_obj.close()
+
+    return(all_valid_annotations)
diff --git a/uropa/config.py b/uropa/config.py
deleted file mode 100644
index 0a86213..0000000
--- a/uropa/config.py
+++ /dev/null
@@ -1,209 +0,0 @@
-#!/usr/bin/env python3
-# coding: utf-8
-"""Contains functions for UROPA configuration."""
-import json
-import ast
-import os
-import subprocess
-from textwrap import dedent
-import numpy as np
-
-
-def howtoconfig():
-    """Defines the epilog that is given when help is requested."""
-    epilog = dedent("""\
-        UROPA is a peak annotation tool facilitating the analysis of next-generation sequencing methods for
-        chromatin biology, like ChIPseq or ATACseq. There are already different peak annotation tools, like
-        HOMER or ChIPpeakAnno, but the advantage of UROPA is, that it can easily be fitted to your requirements.
- UROPA was developed as an open source analysis pipeline for peaks generated from any peak caller. - - Please cite upon usage: - Kondili M, Fust A, Preussner J, Kuenne C, Braun T and Looso M. UROPA: A tool for Universal RObust Peak Annotation. - Scientific Reports 7 (2017), doi: 10.1038/s41598-017-02464-y - - All parameters and paths to input or output files should be reported in a JSON configuration file. - The configuration file should at least contain paths for bed and GTF files: - - { - "queries": [], - "bed": "/path/to/bed/file.bed", - "gtf": "/path/to/annotation/file.gtf" - } - - Different query types can be defined using the queries key: - - { - "queries": [ - {...}, - {...}], - "bed": "/path/to/bed/file.bed", - "gtf": "/path/to/annotation/file.gtf" - } - - Optionally, the priority key can be used to fine tune UROPAs behaviour: - - { - "queries": [ - {...}, - {...}], - "bed": "/path/to/bed/file.bed", - "gtf": "/path/to/annotation/file.gtf", - "priority": "True" - } - - Please visit http://uropa-manual.readthedocs.io/config.html for detailed information on configuration. - """) - return epilog - - -def parse_json(infile): - """Read a json file""" - assert isinstance(infile, str), 'Argument {0} of wrong type ({1}), should be {2}!'.format( - 'column', type(infile), 'str') - - with open(infile, 'r', encoding='utf-8') as f: - return ast.literal_eval(json.dumps(json.load(f))) - - -def column_from_file(file, column, log=None): - """Extracts a given column from a file, and returns unified values as list.""" - assert isinstance(file, str), 'Argument {0} of wrong type ({1}), should be {2}!'.format( - 'column', type(file), 'str') - assert isinstance(column, int), 'Argument {0} of wrong type ({1}), should be {2}!'.format( - 'column', type(column), 'int') - - cmd = 'cut -f' + str(column) + ' ' + str(file) + \ - ' | sort | uniq | grep -v "^#"' - - try: - vals = str(subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True), encoding='utf-8') - except subprocess.CalledProcessError: - if log is not None: - log.warning("File {} might be empty or has not enough columns".format(file)) - return([]) - return([v for v in vals.split('\n') if v != ""]) - - -def parse_parameters(config, log=None): - """Fills the configuration with default values. Writes a warning to logs, if unknown keys are detected.""" - defaults = {"priority": "False", "bed": "no_peaks.bed", - "gtf": "no_annotation.gtf"} # , "bigwig": "none.bw" - keys = list(defaults.keys()) - values = [config[x] if x in config else defaults[x] for x in keys] - - if log is not None: - unknown = [k for k in list(config.keys()) if k not in keys and k != - "queries"] - if any(unknown): - log.warning( - "Unknown keys detected in configuration: {}".format(unknown)) - parameters = dict(list(zip(keys, values))) - return parameters - - -def parse_queries(config, gtf_feat, log=None): - """Adds in defaults where query parameters are missing.""" - defaults = {"feature": gtf_feat, "strand": "ignore", "show.attributes": "None", "filter.attribute": "None", - "attribute.value": "None", "distance": 100000, "feature.anchor": ["start", "center", "end"], - "direction": "any_direction", "internals": "False", "priority": "False"} - keys = list(defaults.keys()) - - try: - query_list = config["queries"] - except KeyError: - query_list = [defaults] - if not log is None: - log.warning("No 'queries' key given, so the default values will be used: {}".format(query_list)) - - if len(query_list) == 0: - if not log is None: - log.info('Empty queries given ("queries":[]). 
Will use defaults.') - query_list = [defaults] - - if not isinstance(query_list, list): - query_list = [query_list] - - def give_val(q): - return [q[x] if x in q and q[x] != "" else defaults[x] for x in keys] - - def make_list(l): - return [[v] if not isinstance(v, list) else v for v in l] - - vals = [give_val(l) for l in query_list] - values = [make_list(l) for l in vals] - - if not log is None: - unknown = [k for k in list(config.keys()) if k not in list(set(['gtf', 'bed', 'queries']).union(keys))] - if any(unknown): - log.warning("Unknown keys detected in configuration: {}".format(unknown)) - - queries = [dict(list(zip(keys, x))) for x in values] - return queries - - -def remove_invalid_queries(queries, log=None): - """Removes queries that have multiple attribute.filter or filter.value values and more than two distance constraints.""" - # Validate distance constraints - has_valid_distance = [len(q["distance"]) < 3 for q in queries] - if not all(has_valid_distance) and log is not None: - log.warning("Queries with invalid distances present! Affected queries: {}".format( - [i for i, x in enumerate(has_valid_distance) if not x])) - - # Validate query attributes - has_valid_attributes = [False if (q["filter.attribute"] != ['None'] and q["attribute.value"] == [ - 'None']) or (q["filter.attribute"] == ['None'] and q["attribute.value"] != ['None']) else True for q in queries] - - if not all(has_valid_attributes) and log is not None: - log.warning("Queries with invalid filter.attribute and attribute.value pairings present! Affected queries: {}".format( - [i for i, x in enumerate(has_valid_attributes) if not x])) - - # Validate strand attribute - has_valid_strand = [True if q["strand"] in [["ignore"], ["same"], ["opposite"]] else False for q in queries] - - if not all(has_valid_strand) and log is not None: - log.warning("Queries with invalid strand values present! Affected queries: {}".format( - [i for i, x in enumerate(has_valid_strand) if not x])) - - # Validate query attribute lengths - has_valid_attribute_lengths = [False if len( - q["filter.attribute"]) > 1 or len(q["attribute.value"]) > 1 else True for q in queries] - if not all(has_valid_attribute_lengths) and log is not None: - log.warning("Queries with more than one value for either filter.attribute or attribute.value present! Affected queries: {}".format( - [i for i, x in enumerate(has_valid_attribute_lengths) if not x])) - - invalid_queries = list(np.unique([i for i, x in enumerate(has_valid_attributes) if not x] + [i for i, x in enumerate( - has_valid_attribute_lengths) if not x] + [i for i, x in enumerate(has_valid_distance) if not x] + [i for i, x in enumerate(has_valid_strand) if not x])) - return([queries[i] for i in range(0, len(queries)) if not i in invalid_queries]) - - -def parse_first_gtf_line(gtf): - """Removes comment lines, reads first line to check for prefix chr. 
Returns if chr was found and the number of columns.""" - f = open(gtf, "r", encoding='utf-8') - is_comment = True - while is_comment: - line = f.readline() - is_comment = line.startswith("#") - - has_chr = line.startswith('chr') - num_cols = len(line.split("\t")) - return(has_chr, num_cols) - - -def cut_gtf_perFeat(gtf, features, prefix): - """Removes lines with features not in features from a gtf file.""" - gtf_per_feat = prefix + os.path.basename(gtf).split(".gtf")[0] + "_cut_per_feat.gtf" - feat2cut = np.unique(features) - - f = open(gtf, "r", encoding='utf-8') - is_comment = True - while is_comment: - line = f.readline() - is_comment = line.startswith("#") - - lines = f.readlines() - gtf_query_feat = [li for li in lines if li.split("\t")[2] in feat2cut] - - # List of lines containing only selected features from query - with open(gtf_per_feat, "w") as cutgtf: - cutgtf.write("".join(gtf_query_feat)) - return gtf_per_feat diff --git a/uropa/overlaps.py b/uropa/overlaps.py deleted file mode 100644 index 3f65d63..0000000 --- a/uropa/overlaps.py +++ /dev/null @@ -1,461 +0,0 @@ -#!/usr/bin/env python3 -# coding: utf-8 -"""Contains functions for UROPA overlap evaluation.""" - -import os -import re -import sys -import subprocess -import logging -import shlex -import shutil -import glob -import numpy as np - - -def peak_has_chr(chrom, gtf_has_chr): - """Checks if the peak chromosome name against the reference chromosome name.""" - p_wo_chr = "chr" not in chrom - gtf_no_chr = not gtf_has_chr - - if p_wo_chr and gtf_has_chr: # Peak 'chr' should have chr if gtf has - chrom_db = "chr" + chrom - - elif not p_wo_chr and not gtf_has_chr: # peak has chr, gtf doesn't-> remove chr from peak - chrom_db = chrom.strip("chr") - - elif (gtf_no_chr and p_wo_chr) or (gtf_has_chr and not p_wo_chr): # none has chr -> keep p chrom - chrom_db = chrom - - return chrom_db - - -def check4_chr(file): - """Checks if a file has lines starting with 'chr'.""" - if os.path.exists(file): - with open(file, "r") as f: - flines = f.readlines() - - comm_lines = len([li for li in flines if li.startswith("#")]) - first_line = flines[comm_lines] - file_has_chr = first_line.startswith("chr") # >> True / False - return file_has_chr - - -def tabix_index(annot_gtf): - """ Prepares gtf for query: Performs sorting, zipping and indexing of given gtf""" - out_sorted = annot_gtf + ".sorted" - out_zipped = out_sorted + ".gz" - - with open(out_zipped, 'w'): - os.system('grep -v ^# ' + annot_gtf + - ' | sort -k1,1 -k4,4n > ' + annot_gtf + '.sorted') - os.system('bgzip -c -f ' + annot_gtf + - '.sorted ' + ' > ' + out_zipped) - - run_tabix = 'tabix -f ' + out_zipped - idx = subprocess.Popen(shlex.split(run_tabix), stdout=subprocess.PIPE) - idx.wait() - - -def valid_fsa(h, hit, q, pstrand): - """Returns if a hit h is valid for query q in their basic common keys.""" - - # not just the first feature should be hit but any! 
- #vf = (h["feature"] == q["feature"][0]) - vf = (h["feature"] in q["feature"]) - v_str = valid_strand(h["strand"], pstrand, q["strand"][0]) - va = valid_attribute(q["filter.attribute"][0], q["attribute.value"][0], hit) - return all([vf, v_str, va]) - - -def valid_strand(hit_str, p_str, q_str): - """Returns boolean showing match of peak.strand to feature.strand according to query requirements """ - if p_str != ".": - if q_str == "ignore": - return True if (hit_str != p_str) or (hit_str == p_str) else False - if q_str == "same": - return True if (hit_str == p_str) else False - if q_str == "opposite": - return True if (hit_str == "+" and p_str == "-") or (hit_str == "-" and p_str == "+") else False - # OR : return (True if (hit_str != p_str) ,i.e when one is "." and - # other is "+,-" - - elif p_str == "." or hit_str == ".": - return True - - -def valid_attribute(attr_filter_key, attr_filter_val, hit): - """Validates the hit according to a filter attribute.""" - if (attr_filter_key != "None") and (attr_filter_val != "None"): - try: - # If key for filtering is not correct or doesn't exist-> error - # should be ignored - hit_attrib_val = re.split( - "; " + attr_filter_key + " ", hit[8])[1].split(';')[0].strip('"\'').rstrip('\"') - except IndexError: # if key doesn't exist re.split will give error - hit_attrib_val = "NA" - - # If biotype of hit == attr_value from query-> continue annotation - return attr_filter_val == hit_attrib_val - else: - return True # no filtering given->ignore - - -def file_len(fname): - """Returns Number of lines of a file""" - with open(fname) as f: - i = -1 - for i, l in enumerate(f): - pass - return i + 1 - - -def parse_peak(peakstr, extend=0, delim='\t'): - """ Parses a string into peak dictionaries """ - fields = peakstr.replace('\n', '').replace('\r', '').split(delim) - num_fields = len(fields) - - if num_fields < 3: - return None - - p = dict() - if num_fields >= 3: - p['chr'] = fields[0] - p['start'] = fields[1] - p['end'] = fields[2] - - if num_fields >= 4: - if fields[3] != '.': - p['name'] = fields[3] - - if num_fields >= 5: - p['score'] = fields[4] - - if num_fields >= 6: - p['strand'] = fields[5] - - defaults = {'id': p['chr'] + ":" + p['start'] + "-" + p['end'], 'name': p['chr'] + ":" + p['start'] + "-" + p['end'], 'score': 0, 'strand': ".", - 'center': int(np.around(np.mean([int(p['start']), int(p['end'])]))), 'estart': max(1, int(p['start']) - extend), 'eend': int(p['end']) + extend, 'length': abs(int(p['end']) - int(p['start']))} - if 'name' in list(p.keys()): - p['id'] = p['chr'] + ":" + p['start'] + "-" + p['end'] + "_" + p['name'] - - values = [p[k] if k in p else defaults[k] for k in list(defaults.keys())] - peak = dict(list(zip(list(p.keys()) + list(defaults.keys()), list(p.values()) + values))) - return peak - - -def define_direction(dir_key, fstart, fend, strand, p_len, pstart, pend, p_center): - """Verifies if direction of peak is the same as required by query""" - - peak_dir = find_peak_dir(fstart, fend, strand, pstart, p_center, pend) -# Inverse in case of negative strand - feat_start = fstart if strand == "+" else fend - -# How much internally to peak can a feature be, to be labeled as upstr/downstr. 
- ten_percent = p_len * 0.1 - - if abs(p_center - feat_start) >= ten_percent: - if peak_dir == "upstream" and dir_key == ["upstream"]: - return True - return peak_dir == "downstream" and dir_key == ["downstream"] - - if abs(p_center - feat_start) < ten_percent: - return False - - -def distance_to_peak_center(p_center, hit3, hit4, strand, feat_pos): - """Returns the position of feature from which the distance is minimum to the peak and the measured distance to peak center""" - import operator - - feat_start = hit3 if strand == "+" else hit4 - feat_end = hit4 if strand == "+" else hit3 - - hit_center = np.mean([feat_start, feat_end]) - hit_pos = {"start": feat_start, "center": hit_center, "end": feat_end} - d_pos = [abs(hit_pos[i] - p_center) for i in feat_pos] - pos, dmin = min(enumerate(d_pos), key=operator.itemgetter(1)) - - min_pos = feat_pos[pos] - return min_pos, dmin - - -def find_peak_dir(fstart, fend, strand, pstart, p_center, pend): - """ Returns the location of the queried peak relative to the feature direction """ - feat_start = fstart if strand == "+" else fend - feat_end = fend if strand == "+" else fstart - - # Add one base to single-base features to give length for direction - # detection. - if feat_start == feat_end and strand == "+": - feat_end = feat_start + 1 - elif feat_start == feat_end and strand == "-": - feat_start = feat_end + 1 - - up_sides = abs(p_center - feat_start) < abs(p_center - - feat_end) # left and right - down_sides = abs(p_center - feat_start) > abs(p_center - feat_end) - - upstream = ((abs(pstart - feat_end) > abs(pstart - feat_start)) - and (abs(pend - feat_end) > abs(pend - feat_start))) - downstream = ((abs(pstart - feat_end) < abs(pstart - feat_start)) - and (abs(pend - feat_end) < abs(pend - feat_start))) - - if upstream and up_sides: - return "upstream" - elif downstream and down_sides: - return "downstream" - else: - return "not.specified" - - -def find_internals_dir(anchor, p_center, feat_start, feat_end, strand): - """ Determine the direction of peak when feature-inside-peak or peak-inside-feature """ - # +strand: When internal feature/peak is Left of p.center, dir->downstream, otherwise dir-> upstream, - # -strand :inverse of above - feat_center = np.mean([feat_start, feat_end]) - feat_pos = {"start": feat_start, "center": feat_center, "end": feat_end} - if strand == "+": - return "downstream" if feat_pos[anchor] < p_center else "upstream" - - if strand == "-": - return "upstream" if feat_pos[anchor] < p_center else "downstream" - - -def get_distance_by_dir(inputD, genom_loc, intern_loc, Dhit, internals_allowed=["False"]): - """Limits the hit by a distance window depending on peak's position relative-to-feature """ - - [D_upstr, D_downstr] = inputD - - if genom_loc == "upstream" or genom_loc == "overlapStart": - return Dhit <= D_upstr - if genom_loc == "downstream" or genom_loc == "overlapEnd": - return Dhit <= D_downstr - - # Rescue if internal peak - if internals_allowed == ["True"]: - return(any(intern_loc)) - else: - best_dist = [Dhit <= D_upstr if l == "upstream" else Dhit <= D_downstr for l in intern_loc] - return(any(best_dist)) - - -def detect_internals(pstart, pend, hit_start, hit_end): - """[Local] Check for internal features (for 'genomic_location' column, independent of users key option) """ - - feat_in_peak = (pstart < hit_start and hit_end < pend) - peak_in_feat = (hit_start < pstart and pend < hit_end) - - if feat_in_peak: - return "FeatureInsidePeak" - elif peak_in_feat: - return "PeakInsideFeature" - else: - return 
"not.specified" - - -def round_up(val): - """[Local] Rounds a value on second decimal """ - import math - val = math.ceil(val * 100) / 100 - return val - - -# import "division" allows decimals -def calculate_overlap(pstart, pend, peakL, featL, hit_start, hit_end): - """[Local] Returns value of overlap btw {0,1} representing percentage of length covered by the peak and that covered by the feature """ - p_range = list(range(pstart, pend)) - feat_range = list(range(hit_start, hit_end)) - pset = set(p_range) - - ovl_range = pset.intersection(feat_range) - peakL = 1 if peakL == 0 else peakL - featL = 1 if featL == 0 else featL - - ovl_pk = round_up(len(ovl_range) / peakL) - ovl_pk = 1 if ovl_pk > 1 else ovl_pk - ovl_feat = round_up(len(ovl_range) / featL) - ovl_feat = 1 if ovl_feat > 1 else ovl_feat - - return ovl_pk, ovl_feat, ovl_range - - -def define_genom_loc(current_loc, pstart, p_center, pend, hit_start, hit_end, hit_strand, ovl_range): - """ [Local] Returns location label to be given to the annotated peak, if upstream/downstream or overlapping one edge of feature.""" - all_pos = ["start", "end"] - closest_pos, dmin = distance_to_peak_center( - p_center, hit_start, hit_end, hit_strand, all_pos) - if current_loc == "not.specified": # Not internal - if closest_pos == "end" and any(ovl_range): - return "overlapEnd" - elif closest_pos == "start" and any(ovl_range): - return "overlapStart" - elif not any(ovl_range): - # Check about direction :"upstream", "downstream" - current_loc = find_peak_dir( - hit_start, hit_end, hit_strand, pstart, p_center, pend) - return current_loc - return current_loc - - -def overlap_peak_feature(genom_loc, pstart, p_center, pend, peakL, featL, hit_start, hit_end, hit_strand): - """Gives the collective information of the location and overlap of a peak to a feature """ - # A/ Find internal features/peaks - # hit_start, hit_end : as given in gtf/no inversion for '-'strand. 
- genom_loc = detect_internals(pstart, pend, hit_start, hit_end) - # B/ Calculate overlap Ratio peak/feat and feat/peak - ovl_pk, ovl_feat, ovl_range = calculate_overlap( - pstart, pend, peakL, featL, hit_start, hit_end) - - # C/ Define genomic_location with or w/o overlap (input is internals - # genom_loc for comparison and update ) - genom_loc = define_genom_loc( - genom_loc, pstart, p_center, pend, hit_start, hit_end, hit_strand, ovl_range) - return genom_loc, ovl_pk, ovl_feat - - -def get_hit_attribute(hit, attribute): - """Splits attributes from hit lines""" - pos_match = [i if re.match( - attribute, hit_a) else None for i, hit_a in enumerate(hit[8].split("; "))] - pos_match = [x for x in pos_match if x is not None] - if len(pos_match) > 0: - attr_val = [hit[8].split("; ")[m].split(" ")[ - 1].strip('"\'').rstrip('\";') for m in pos_match] - val = [av for av in attr_val if av is not None][0] - val = val.replace("\t", "").replace("\r", "").replace("\n", "") - return val - else: - return "not.found" - - -def get_besthit(q, len_q_dist, p_nm, hit, attrib_keys, Dhit): - """Defines the best hit for a query.""" - Dhit = [Dhit, Dhit] if len_q_dist == 2 else Dhit - - ret = None - if attrib_keys != ["None"]: - #attr_val = map(lambda k: re.split(k+" ", hit[8])[1].split(';')[0].strip('"\'').rstrip('\"'), attrib_keys) - # list of all values if multiple keys - attr_val = [get_hit_attribute(hit, a) for a in attrib_keys] - ret = [Dhit, [hit[2], hit[3], hit[4], hit[6], attr_val]] - - elif attrib_keys == ["None"]: - ret = [Dhit, [hit[2], hit[3], hit[4], hit[6]]] - - return ret - - -def create_table(peak_id, chrom, pstart, pend, p_center, min_dist_hit, attrib_keys, min_pos, genom_loc, ovl_pf, ovl_fp, i): - """Saves info of the hit in a tabular form to be written in the output table. 
""" - if attrib_keys != ["None"]: - # extract min_dist_content - [dist, [feat, fstart, fend, strand, attrib_val]] = min_dist_hit - # attrib_val.strip("\r").strip("\t").strip("\n") - dist = max(dist) if isinstance(dist, list) else dist - dist = '%d' % round(dist, 1) - best_res = "\t".join(np.hstack([peak_id, chrom, pstart, p_center, pend, feat, fstart, - fend, strand, min_pos, dist, genom_loc, str(ovl_pf), str(ovl_fp), attrib_val, str(i)])) - return best_res + "\n" - - elif attrib_keys == ["None"]: - [dist, [feat, fstart, fend, strand]] = min_dist_hit - dist = max(dist) if isinstance(dist, list) else dist - dist = '%d' % round(dist, 1) - best_res = "\t".join([peak_id, chrom, pstart, p_center, pend, feat, fstart, - fend, strand, min_pos, dist, genom_loc, str(ovl_pf), str(ovl_fp), str(i)]) - return best_res + "\n" - - -def write_hit_to_All(All_hits_tab, p_name, attrib_k, Dhit, hit, peak_id, chrom, pstart, pend, p_center, min_pos, genomic_location, ovl_pk, ovl_feat, j): - """Writes an output table.""" - if attrib_k != ["None"]: - attr_val = [get_hit_attribute(hit, a) for a in attrib_k] - hit2add = [Dhit, [hit[2], hit[3], hit[4], hit[6], attr_val]] - the_res = create_table(peak_id, chrom, pstart, pend, p_center, - hit2add, attrib_k, min_pos, genomic_location, ovl_pk, ovl_feat, j) - elif attrib_k == ["None"]: - hit2add = [Dhit, [hit[2], hit[3], hit[4], hit[6]]] - the_res = create_table(peak_id, chrom, pstart, pend, p_center, - hit2add, attrib_k, min_pos, genomic_location, ovl_pk, ovl_feat, j) - - if All_hits_tab[j][p_name] == '' or All_hits_tab[j][p_name].split("\t")[9] == "NA": - All_hits_tab[j][p_name] = the_res - - elif All_hits_tab[j][p_name] != '' or All_hits_tab[j][p_name].split("\t")[9] != "NA": - All_hits_tab[j][p_name] = All_hits_tab[j][p_name].strip("\n") - All_hits_tab[j][p_name] = '\n'.join((All_hits_tab[j][p_name], the_res)) - - return All_hits_tab[j][p_name] - - -def concat_comments(queries, priority, annot_gtf, peaks_bed): - """Puts together comments for file header.""" - comments = ["#UROPA-Universal RObust Peak Annotator"] - for q in enumerate(queries): - comments.append("#Query No {} :".format(q[0])) - comments.append("#feature: {}, strand: {}, feature.anchor: {}, distance : {} , direction : {}, internals :{},\n#show.attribute(s): {}, filter.attribute: {},attribute.value:{}".format( - q[1]['feature'], q[1]['strand'], q[1]['feature.anchor'], q[ - 1]['distance'], q[1]['direction'], q[1]['internals'], - q[1]['show.attributes'], q[1]['filter.attribute'], q[1]['attribute.value'])) - comments.append("#priority:{}".format(priority)) - comments.append("#GTF: {}".format(annot_gtf)) - comments.append("#BED: {}".format(peaks_bed)) - comments.append("#Columns.terminology:") - comments.append("#feature, feature_start, feature_end, feature_strand: The information of the genomic feature that annotates the peak, as extracted by the gtf file.") - comments.append( - "#distance:The distance measured as following: abs(peak.center-feature.anchor).If no feature.anchor given,then the minimum of 3 distances from each feature.anchor{start,center,end} to peak.center is chosen.") - comments.append("#feat_anchor : The position of the genomic feature chosen for annotation that had the minimum distance to the peak.center.If feature.anchor given in config this will be shown also here.") - comments.append( - "#genomic_location: The position of the peak relative to the annotated feature direction.") - comments.append("#feat_ovl_peak : When peak and feature overlap(i.e genomic_location = overlapStart), 
Ratio(overlapping region / peak.length) shows percentage of peak covered by the feature.(i.e 1.0 = 100% of peak covered, peak is internal.") - comments.append("#peak_ovl_feat : When peak and feature overlap(i.e genomic_location = overlapStart), Ratio(overlapping region / feature.length) shows percentage of feature covered by the peak.(i.e 1.0 = 100% of feature covered, feature is internal.)") - comments.append("#Attributes that have been given in the key 'show.atttributes' will be shown here and their values extracted by the gtf will be displayed for each feature.If 'filter.attribute' contains same attribute, this column helps confirm the filtering.") - comments.append("#query:The query that validates with its given parameters the feature to be assigned to the peak.If only one query given, column will always display '0',the first query.") - comments.append("#---------------------------------------------------------------------------------------------------------------------------------------#") - - return '\n'.join(comments) - -def write_partial_file(outfile, Table): - """Writes a partial file.""" - with open(outfile, "w") as of: - for k in Table: - of.write("".join(Table[k])) - - -def merge_queries(Best_combo_k): - """Merges queries.""" - merged_q = [Best_combo_k[l].split( - "\t")[-1].strip("\n") for l in range(len(Best_combo_k))] - # Read if combo_k is "" and give it the q.numb, so I have it ready for - # merging when one query ="" - merged_q[-1] = merged_q[-1] + "\n" - joined_q = ",".join(merged_q) - Best_combo_k_0 = "\t".join(Best_combo_k[0].split("\t")[:-1]) - Best_merg_q = Best_combo_k_0 + "\t" + joined_q - return Best_merg_q - -def finalize_file(ffile, partials, header, comments=None, log=None): - """Writes a output table of UROPA.""" - try: - with open(ffile, "w") as f: - if not comments is None: - f.write(comments+'\n') - f.write(header+'\n') - for p in partials: - with open(p) as pf: - shutil.copyfileobj(pf, f) - except IOError: - if not log is None: - log.error("Unable to write to file {}".format(ffile)) - -def cleanup(directory, log=None): - """Removes temporary files from a UROPA directory.""" - temp_files = glob.glob(directory + '*_part_*') - for f in temp_files: - try: - os.remove(f) - except OSError: - if not log is None: - log.error("Could not remove temporary file {}".format(f)) - continue diff --git a/uropa/uropa.py b/uropa/uropa.py index cb04059..15a96da 100644 --- a/uropa/uropa.py +++ b/uropa/uropa.py @@ -3,7 +3,7 @@ """ uropa.py: UROPA - Universal RObust Peak Annotator -@authors: Annika Fust, Jens Preussner, Maria Kondili and Mario Looso +@authors: Annika Fust, Jens Preussner, Maria Kondili, Mette Bentsen and Mario Looso @license: MIT @maintainer: Mario Looso @email: mario.looso@mpi-bn.mpg.de @@ -13,431 +13,406 @@ import os import sys import json -import glob import argparse -import logging import datetime -import shutil -import subprocess as sp +import time +import subprocess +import gzip + +import logging import multiprocessing as mp -from itertools import chain -from functools import partial -from functools import reduce +import pysam +import pandas as pd -import numpy as np +#Import internal functions +from uropa.utils import * +from uropa.annotation import * -from . import config as cfg -from . import overlaps as ovls -from . 
import annotation as ant +def restricted_float(f, f_min, f_max): + f = float(f) + if f < f_min or f > f_max: + raise argparse.ArgumentTypeError("{0} not in range [0.0, 1.0]".format(f)) + return f -#if __name__ == "__main__": def main(): + + ############################################################################################################ + #################################################### INPUT ################################################# + ############################################################################################################ + + cmd = " ".join(sys.argv) + + #----------------------------------------------------------------------------------------------------------# + # Parse command-line arguments + #----------------------------------------------------------------------------------------------------------# + parser = argparse.ArgumentParser( prog="uropa", description='UROPA - Universal RObust Peak Annotator.', - epilog=cfg.howtoconfig(), - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument( - "-i", - "--input", - help="filename of configuration file", - required=True, - action="store", - metavar="config.json") - parser.add_argument( - "-p", - "--prefix", - dest="prefix", - help="prefix for result file names (defaults to basename of input file)", - required=False, - action="store", - metavar="prefix") - parser.add_argument( - "-r", - "--reformat", - help="create an additional compact and line-reduced table as result file", - required=False, - action="store_true") - parser.add_argument( - "-s", - "--summary", - help="filename of additional visualisation of results in graphical format", - required=False, - action="store_true") - parser.add_argument( - "-t", - "--threads", - help="multiprocessed run: n = number of threads to run annotation process", - type=int, - required=False, - action="store", - metavar="n", - default=1) - parser.add_argument( - "--add-comments", - help="add comment lines to output files", - required=False, - action="store_true") - parser.add_argument( - "-l", - "--log", - help="log file name for messages and warnings", - action="store", - metavar="uropa.log") - parser.add_argument( - "-d", - "--debug", - help="print verbose messages (for debugging) to stdout and log", - required=False, - action="store_true") - parser.add_argument( - "-v", - "--version", - help="prints the version and exits", - action="version", - version="%(prog)s 2.0.4") + epilog=howtoconfig(), + formatter_class=lambda prog: argparse.RawDescriptionHelpFormatter(prog, max_help_position=35, width=90)) + + #Configuation arguments for one query + one_query = parser.add_argument_group("Arguments for one query") + one_query.add_argument("-b", "--bed", metavar="", help="Filename of .bed-file to annotate", action="store") + one_query.add_argument("-g", "--gtf", metavar="", help="Filename of .gtf-file with features", action="store") + one_query.add_argument("--feature", help="Feature for annotation", metavar="", nargs="*", default=[]) + one_query.add_argument("--feature_anchor", help="Feature anchor to annotate to", metavar="", nargs="*", default=["start", "center", "end"]) + one_query.add_argument("--distance", help="Maximum permitted distance from feature (1 or 2 arguments)", metavar="", nargs="*", type=int, default=[1000,10000]) + one_query.add_argument("--strand", metavar="", help="Desired strand of annotated feature relative to peak", nargs="*", choices=['ignore', 'same', 'opposite'], default='ignore') + 
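+    #Accepted relative locations of the peak with respect to the annotated feature; by default all locations are allowed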
one_query.add_argument("--relative_location", metavar="", help="Peak locaion relative to feature location", nargs="*", choices=["PeakInsideFeature", "FeatureInsidePeak", "Upstream", "Downstream", "OverlapStart", "OverlapEnd"], default=["PeakInsideFeature", "FeatureInsidePeak", "Upstream", "Downstream", "OverlapStart", "OverlapEnd"]) + one_query.add_argument("--internals", metavar="", help="Set minimum overlap fraction for internal feature annotations. 0 equates to internals=False and 1 equates to internals=True. Default is False.", type=lambda x: restricted_float(x, 0, 1), default=False) + one_query.add_argument("--filter_attribute",metavar="", help="Filter on 9th column of GTF", default=None) + one_query.add_argument("--attribute_values", help="Value(s) of attribute corresponding to --filter_attribute", nargs="*", metavar="", default=[]) + one_query.add_argument("--show_attributes", help="A list of attributes to show in output", metavar="", nargs="*", default=[]) + + #Or configuration arguments for multiple queries (overwrites) + multi_query = parser.add_argument_group("Multi-query configuration file") + multi_query.add_argument("-i", "--input", help="Filename of configuration file (keys in this file overwrite command-line arguments about query)", action="store", metavar="config.json") + + #Other arguments + additional = parser.add_argument_group("Additional arguments") + additional.add_argument("-p", "--prefix", metavar="", help="Prefix for result file names (defaults to basename of .bed-file)") + additional.add_argument("-o", "--outdir", metavar="", help="Output directory for output files (default: current dir)", default=".") + #additional.add_argument("-r","--reformat", help="create an additional compact and line-reduced table as result file", action="store_true") + additional.add_argument("-s","--summary", help="Filename of additional visualisation of results in graphical format", action="store_true") + additional.add_argument("-t","--threads", help="Multiprocessed run: n = number of threads to run annotation process", type=int, action="store",metavar="n",default=1) + #additional.add_argument("--add-comments",help="add comment lines to output files", action="store_true") + additional.add_argument("-l","--log", help="Log file name for messages and warnings (default: log is written to stdout)", action="store", metavar="uropa.log") + additional.add_argument("-d","--debug",help="Print verbose messages (for debugging)", action="store_true") + additional.add_argument("-v","--version",help="Prints the version and exits", action="version",version="%(prog)s 3.0.0") args = parser.parse_args() - config = args.input - - # Configure logging - logger = logging.getLogger(__name__) - logger.setLevel(logging.DEBUG) + #Write help if no input was given + if len(sys.argv[1:]) == 0: + parser.print_help() + sys.exit() - streamHandle = logging.StreamHandler() - loggerFormat = logging.Formatter('[%(levelname)s] - %(message)s') - streamHandle.setFormatter(loggerFormat) + #Check valid input to deal with the split between command-line query and multi-query config file + if args.input == None: + if args.bed == None: + sys.exit("ERROR: --bed is needed for annotation without --input") + if args.gtf == None: + sys.exit("ERROR: --gtf is needed for annotation without --input") - if args.debug: - streamHandle.setLevel(logging.DEBUG) - else: - streamHandle.setLevel(logging.WARNING) - logger.addHandler(streamHandle) - if args.log is not None: - logpath = os.path.dirname(args.log) - if not os.path.exists(logpath) and 
logpath != '': - try: - os.makedirs(logpath) - except OSError: - logger.error("Could not create directory for log file {}".format(logpath)) - try: - fileHandle = logging.FileHandler(args.log, mode='w') - fileHandle.setLevel(logging.DEBUG) - fileHandle.setFormatter(loggerFormat) - logger.addHandler(fileHandle) - except IOError: - logger.error("Could not create log file {}".format(args.log)) - - if args.prefix is not None: - prefixpath = os.path.dirname(args.prefix) - if not os.path.exists(prefixpath): - if not prefixpath == '': - try: - os.makedirs(prefixpath) - except IOError: - logger.error("Could not create directory {} for output".format(prefixpath)) - else: - prefixpath = '.' - outdir = prefixpath + '/' + os.path.basename(args.prefix) + '_' - else: - outdir = "./" + os.path.splitext(os.path.basename(args.input))[0] + '_' + #----------------------------------------------------------------------------------------------------------# + # Configure logger + #----------------------------------------------------------------------------------------------------------# - logger.debug("Directory for output files is {}".format(outdir)) - logger.info("Start time: %s", datetime.datetime.now().strftime("%d.%m.%Y %H:%M")) + logger = logging.getLogger(__name__) + logger_format = logging.Formatter('%(asctime)s [%(levelname)s] - %(message)s', "%Y-%m-%d %H:%M:%S") + logger_level = logging.DEBUG if args.debug else logging.INFO - try: - cfg_dict = cfg.parse_json(config) - except IOError: - logger.error("File %s does not exists or is not readable.", config) - sys.exit() - except ValueError as e: - logger.error("File %s contains malformed JSON. %s", config, e) - sys.exit() + #Log vs. stream logger + if args.log is not None: - parameters = cfg.parse_parameters(cfg_dict, logger) - priority = parameters["priority"] - annot_gtf = parameters["gtf"] - peaks_bed = parameters["bed"] + #Check if logfile can be created + try: + log = logging.FileHandler(args.log, "w") + log.setLevel(logger_level) + log.setFormatter(logger_format) + logger.addHandler(log) + except: + sys.exit("ERROR: Could not create logfile {0}. Please check that the given path exists.".format(args.log)) - try: - gtf_has_chr, gtf_cols = cfg.parse_first_gtf_line(annot_gtf) - except IOError: - logger.error("File %s does not exist or is not readable.", annot_gtf) - sys.exit() - - # > Check GTF format - if gtf_cols == 9: - logger.info("GTF file format validated. 
Extracting all features from GTF now...") else: - logger.warning("File %s is not a proper GTF file!", annot_gtf) + #Stdout stream + stream = logging.StreamHandler(sys.stdout) + stream.setLevel(logger_level) + stream.setFormatter(logger_format) + logger.addHandler(stream) - gtf_feat = cfg.column_from_file(annot_gtf, 3, logger) + logger.setLevel(logger_level) - if len(gtf_feat) < 1: - logger.error("No features found in file {} for annotation.".format(annot_gtf)) - sys.exit() - - if not os.path.exists(peaks_bed): - logger.error("File %s does not exists or is not readable.", peaks_bed) - sys.exit() - # Sanity check for other parameters - accepted_val = ['T', 'True', 'TRUE', 'Yes', 'YES', 'Y', 'yes', 'F', 'False', 'FALSE', 'No', 'NO', 'N', 'no'] - if priority not in accepted_val: - logger.error("Priority value should be one of the values from %s", accepted_val) + ############################################################################################################ + ############################################ VALIDATION OF INPUT ########################################### + ############################################################################################################ + + logger.info("Started UROPA") + logger.info("Working directory: {0}".format(os.getcwd())) + logger.info("Command-line call: {0}".format(cmd)) + temp_files = [] + + # Validate output folder + outdir = args.outdir + if not os.path.exists(outdir): + try: + logger.debug("Creating directory {}".format(outdir)) + os.makedirs(outdir) + except Exception as e: + logger.error(e) + logger.error("Could not create directory {} for output".format(outdir)) + + #----------------------------------------------------------------------------------------------------------# + # Establish queries from command-line and --input + #----------------------------------------------------------------------------------------------------------# + + logger.info("Reading configuration from commandline/input config") + + #First, fill in parameters from commandline + default_query = {"feature":args.feature, + "feature_anchor":args.feature_anchor, + "distance": [args.distance[0], args.distance[0]] if len(args.distance) == 1 else args.distance, + "strand": args.strand, + "relative_location": args.relative_location, + "internals": args.internals, + "filter_attribute": args.filter_attribute, + "attribute_values": args.attribute_values, + } + + #create cfg_dict like it would have been parsed from config .json + cfg_dict = {"queries": [default_query], + "show_attributes": args.show_attributes, + "priority": False, + "gtf": args.gtf, + "bed": args.bed + } + + logger.debug("Config from command-line arguments: {0}".format(cfg_dict)) + + #Next, overwrite with config arguments if given, otherwise the arguments fall back to commandline default + config = args.input + if config != None: + try: + json_cfg_dict = parse_json(config) + logger.debug("Config from json: {0}".format(json_cfg_dict)) + for key in json_cfg_dict: + cfg_dict[key] = json_cfg_dict[key] #config values always win over commandline + except IOError: + logger.error("File %s does not exists or is not readable.", config) + sys.exit() + except ValueError as e: + logger.error("File %s contains malformed JSON. 
%s", config, e) + sys.exit() + + #Fill each key with defaults if they were not set + for query in cfg_dict["queries"]: + for key in default_query: + query[key] = query.get(key, default_query[key]) + + #Format keys in cfg_dict + cfg_dict = format_config(cfg_dict) + logger.debug("Formatted config: {0}".format(cfg_dict)) + + #Check for general input in queries, such as no show_attributes + if len(cfg_dict["show_attributes"]) == 0: + logger.warning("No show_attributes given - no attributes for annotations are displayed in output.") + + #catch duplicates in query names + query_names = [query["name"] for query in cfg_dict["queries"]] + if len(query_names) != len(set(query_names)): + logger.warning("Duplicates in query names: {0}".format(query_names)) + + #----------------------------------------------------------------------------------------------------------# + # Validate gtf / bed input + #----------------------------------------------------------------------------------------------------------# + + #Check if bed & gtf files exists + check_file_access(cfg_dict["bed"], logger) + check_file_access(cfg_dict["gtf"], logger) + + #Check prefix + if args.prefix == None: + args.prefix = os.path.splitext(os.path.basename(cfg_dict["bed"]))[0] + + output_prefix = os.path.join(args.outdir, args.prefix) + logger.debug("Output_prefix set to: {0}".format(output_prefix)) + + #Check whether output files can be written + output_files = [os.path.join(output_prefix + suffix) for suffix in ["_allhits.txt", "_allhits.bed", "_finalhits.txt", "_finalhits.bed"]] + for f in output_files: + if os.path.exists(f) and not os.access(f, os.W_OK): + logger.error("Output file {0} is not writable.".format(f)) + sys.exit() + + + ############################################################################################################ + ################################################## PREPARATION ############################################# + ############################################################################################################ + + #----------------------------------------------------------------------------------------------------------# + # Prepare bed for internal region-structure + #----------------------------------------------------------------------------------------------------------# + + logger.info("Reading .bed-file to annotate") + + #Check bed format and parse to internal structure + check_bed_format(cfg_dict["bed"], logger) + gtf_has_chr = check_chr(cfg_dict["gtf"]) #True/False + + peaks = parse_bedfile(cfg_dict["bed"], gtf_has_chr) + logger.debug("Read {0} peaks from {1}".format(len(peaks), cfg_dict["bed"])) + + #Establish order of peaks + internal_peak_ids = [peak["internal_peak_id"] for peak in peaks] + + #Split bed into chunks + n_chunks = 100 + peak_chunks = [peaks[i::n_chunks] for i in range(n_chunks)] + + + #----------------------------------------------------------------------------------------------------------# + # Prepare GTF and extract chosen features to a subset gtf-file if needed + #----------------------------------------------------------------------------------------------------------# + + logger.info("Preparing .gtf for fast access") + + #Check all possible features in gtf + logger.debug("Finding all possible features in gtf") + gtf_feat_count = {} + with open(cfg_dict["gtf"]) as f: + for line in f: + columns = line.rstrip().split("\t") + feature = columns[2] + + if feature not in gtf_feat_count: + gtf_feat_count[feature] = 0 + gtf_feat_count[feature] += 1 + gtf_feat = 
list(gtf_feat_count.keys()) + logger.debug("Features in gtf: {0}".format(gtf_feat_count)) + + #Fill in empty feature keys with all possible + for query in cfg_dict["queries"]: + if len(query.get("feature", [])) == 0: + query["feature"] = gtf_feat + + #Count all features given in config + query_feat = [] + for query in cfg_dict["queries"]: + query_feat.extend(query["feature"]) + query_feat = list(set(query_feat)) + + #Check if any features were given that are not possible to subset on + not_in_gtf = list(set(query_feat) - set(gtf_feat)) + if len(not_in_gtf) > 0: + logger.error("Query feature(s) {0} not found in gtf".format(not_in_gtf)) sys.exit() - # All parameters of Queries, filled-in with default values where necessary. - all_queries = cfg.parse_queries(cfg_dict, gtf_feat, logger) - logger.info("Number of all queries for parametric peak annotation: %s", len(all_queries)) - queries = cfg.remove_invalid_queries(all_queries, logger) - logger.info("Number of valid queries for parametric peak annotation: %s", len(queries)) - - # Validate query features - query_features = list(chain.from_iterable([q["feature"] for q in queries])) - feat_not_valid = [f for f in query_features if f not in gtf_feat] - feat_valid = [f for f in query_features if f not in feat_not_valid] - - if feat_not_valid: - logger.warning("Invalid features present (found in a query but not in the GTF): %s", ','.join(feat_not_valid)) - - # > Cut gtf according to features of config, and index the cut.gtf for Faster Annotation - # ! Attention : In the UCSC transformed gtf files,the feature is put by default "tfbs" - if len(gtf_feat) > 1: - gtf_cut_file = cfg.cut_gtf_perFeat(annot_gtf, feat_valid, outdir) - mygtf = gtf_cut_file - + #Subset gtf if needed + logger.debug("Subsetting gtf if needed") + gtf_specific = list(set(gtf_feat) - set(query_feat)) #features specific for gtf but which are not taken into account in queries + if len(gtf_specific) > 0: + sub_gtf = output_prefix + "_feature_subset.gtf" + logger.debug("Subsetting {0} -> {1} with features {2}".format(cfg_dict["gtf"], sub_gtf, query_feat)) + subset_gtf(cfg_dict["gtf"], gtf_feat, sub_gtf) + anno_gtf = sub_gtf + temp_files.append(sub_gtf) else: - mygtf = annot_gtf - - # Create a new JSON for SUMMARY, with all keys completed : - summ_dict = dict() - summ_dict["queries"] = queries - summ_dict["priority"] = priority - summ_dict["gtf"] = annot_gtf - summ_dict["bed"] = peaks_bed - - with open(outdir+"summary_config.json", "w") as fj: - json.dump(summ_dict, fj, indent=4) + anno_gtf = cfg_dict["gtf"] - distances = reduce(list.__add__, [q["distance"] for q in queries]) - max_distance = int(max(distances)) + #Compress and index using gzip/tabix + logger.debug("Tabix compress") - logger.info("Distance for peak enlargment: %s", max_distance) + anno_gtf_gz = output_prefix + ".gtf.gz" + anno_gtf_index = anno_gtf_gz + ".tbi" + pysam.tabix_compress(anno_gtf, anno_gtf_gz, force=True) + anno_gtf_gz = pysam.tabix_index(anno_gtf_gz, index=anno_gtf_index, seq_col=0, start_col=3, end_col=4, keep_original=True, force=True) + temp_files.extend([anno_gtf_gz, anno_gtf_index]) - gtf_index = mygtf + '.sorted.gz' + #Write out formatted config file + json_string = config_string(cfg_dict) + f = open(output_prefix + ".json", "w") + f.write(json_string) + f.close() - if os.path.exists(gtf_index): - logger.warning( - "GTF-Index already existed before indexing. 
Will overwrite old index.") - try: - ovls.tabix_index(mygtf) # Sort, zip, index the gtf - except IOError: - logger.error("Unable to create index for file %s. Make sure tabix, bgzip and sort are available and UROPA has rights to (over)write files.", mygtf) - sys.exit() + ############################################################################################################ + ################################################## Annotation ############################################## + ############################################################################################################ - # Validate priority parameter - pr = True if priority in ['T', 'True', 'TRUE', 'Yes', 'YES', 'Y', 'yes'] else False - - # Create list of Attributes from all queries together,to be shown in final - # Table-columns for All queries - query_attributes_none = list([q["show.attributes"] for q in queries]) - query_attributes = [a for a in np.unique(list(chain.from_iterable(query_attributes_none))) if a is not None and a != 'None'] - - logger.info("Additional attributes included in output: %s", ','.join(query_attributes)) - - # query included, when there is no hit from tabix - nas_len = len(query_attributes) + 10 - NAsList = list(np.repeat("NA", nas_len)) - NAsList[nas_len - 1] = "- " - - header_base = [ - "peak_id", - "peak_chr", - "peak_start", - "peak_center", - "peak_end", - "feature", - "feat_start", - "feat_end", - "feat_strand", - "feat_anchor", - "distance", - "genomic_location", - "feat_ovl_peak", - "peak_ovl_feat"] - - header = "\t".join(header_base + query_attributes + ["query"]) - - # - # Preparation of multiprocessing - # - spl_dir = outdir + "split_peaks/" - if args.threads > 1: - logger.info("Multiprocessing: Peak file will be split in %s smaller files.", args.threads) - if not os.path.exists(spl_dir): - os.makedirs(spl_dir) - chunks = args.threads - 1 - #cmd = ['split', - # '-n l/' + str(args.threads), - # peaks_bed, - # spl_dir + 'spl_peak_'] - - cmd = ["split", "-l $(expr `wc", peaks_bed, - "| cut -d ' ' -f3` /", str(chunks), ")", peaks_bed, spl_dir + 'spl_peak_' - ] - - try: - #sp.check_call(cmd, stderr=open(os.devnull, 'w')) - os.system(str(" ".join(cmd))) - except sp.CalledProcessError: - args.threads = 1 - logger.warning( - "Unable to split peak input into smaller files. Falling back to one thread.") - logger.info("Check if split command is installed in version 8.22 or higher.") - except OSError as e: - args.threads = 1 - logger.warning( - "Split command not available. 
Falling back to one thread.") - -# -# Processing peaks -# - input_args = [outdir, gtf_index, query_attributes, queries, max_distance, pr, gtf_has_chr] - - # > Write output according to Thread option + logger.info("Started annotation") + n_chunks = len(peak_chunks) if args.threads > 1: + pool = mp.Pool(args.threads) - partial_func = partial(ant.annotation_process, input_args) - pool.map(partial_func, glob.glob(spl_dir + "spl_peak_*")) - pool.close() + task_list = [pool.apply_async(annotate_peaks, args=(chunk, anno_gtf_gz, anno_gtf_index, cfg_dict, )) for chunk in peak_chunks] + pool.close() #done sending jobs to pool + + #Wait for tasks to finish + count = -1 + finished = sum([task.ready() for task in task_list]) + while finished < n_chunks: + finished = sum([task.ready() for task in task_list]) + if count != finished: + logger.info("Progress: {0:.0f}%".format(finished/float(n_chunks)*100)) + count = finished + else: + time.sleep(0.5) pool.join() + + #Get results from processes + results = [task.get() for task in task_list] + else: - ant.annotation_process(input_args, peaks_bed, logger) - # Files created after annot.process: + results = [] + for i, chunk in enumerate(peak_chunks): + results.append(annotate_peaks(chunk, anno_gtf_gz, anno_gtf_index, cfg_dict, logger)) + logger.info("Progress: {0:.0f}%".format((i+1)/float(n_chunks)*100)) - allhits_partials = glob.glob(outdir + "allhits_part_*") - finalhits_partials = glob.glob(outdir + "finalhits_part_*") + #Join results from threads to one dictionary + all_annotations = sum(results, []) + - logger.info("Writing output files to {}".format(outdir)) + ############################################################################################################ + ################################################ POSTPROCESSING ############################################ + ############################################################################################################ - if args.add_comments: - comments = ovls.concat_comments(queries, str(pr), annot_gtf, peaks_bed) - else: - comments = None + logger.info("Processing annotated peaks") - # - # Merging output files - # - allhits_outfile = outdir + "allhits.txt" + #Add attribute columns + #The keys are different internally vs. the output columns + attribute_columns = cfg_dict["show_attributes"] + main = ["peak_chr", "peak_start", "peak_end", "peak_id", "feature", "feat_start", "feat_end", "feat_strand", "feat_anchor", "distance", "relative_location", "feat_ovl_peak", "peak_ovl_feat"] + header_internal = main + ["show_" + col for col in attribute_columns] + ["query_name"] + header_output = main + attribute_columns + ["name"] - if len(queries) > 1 and not pr: - besthits_outfile = outdir + "besthits.txt" - merged_outfile = outdir + "finalhits.txt" + for annotation in all_annotations: + attributes_dict = annotation["feat_attributes"] + if type(attributes_dict) == dict: + annotation.update({"show_" + key: attributes_dict[key] for key in attributes_dict}) - besthits_partials = glob.glob(outdir + "besthits_part_*") - ovls.finalize_file(merged_outfile, finalhits_partials, header, comments, log=logger) - ovls.finalize_file(besthits_outfile, besthits_partials, header, comments, log=logger) - else: - besthits_outfile = outdir + "finalhits.txt" - ovls.finalize_file(besthits_outfile, finalhits_partials, header, comments, log=logger) - - logger.debug("Filenames for output files are: {}, {}". 
format(allhits_outfile, besthits_outfile)) - - ovls.finalize_file(allhits_outfile, allhits_partials, header, comments, log=logger) - - - # finalhits in bed format - besthits_outfile_bed = outdir + "finalhits.bed" - # colnames: peak_chr peak_start peak_end peak_id peak_strand peak_score feature feat_start feat_end feat_strand feat_anchor distance genomic_location - os.system("awk 'BEGIN { OFS = \"\t\" } ; { print $2,$3,$5,$1,\".\",\".\",$6,$7,$8,$9,$10,$11,$12 }' " + outdir + "finalhits.txt" + " | sed -e 1d | sed -e 's/\t\t/\t/g' > " + besthits_outfile_bed + ".tmp") - # append shown attributes - attributes = os.popen("head -1 "+ outdir + "finalhits.txt" + " | awk '{print NF}'").read() - attributes = int(attributes) - 1 - os.system("cut -f15-" + str(attributes) + " " + outdir + "finalhits.txt" + " | sed -e 1d | paste -d'\t' "+ besthits_outfile_bed + ".tmp - > " + besthits_outfile_bed) - os.system("sort -o " + besthits_outfile_bed + " -k1,1 -k2,2n " + besthits_outfile_bed) - os.system("rm "+ besthits_outfile_bed + ".tmp") - # - # Reformat output - # - if args.reformat and len(queries) > 1 and not pr: - logger.info("Reformatting output...") - R_reform_Best = [ - 'uropa_reformat_output.R', - '-i', - besthits_outfile, - '-k', - 'peak_id', - '-c', - '1:5', - '-d', - ',', - '-t', - str(args.threads)] - try: - # creates output of Best and gives name "Reformatted_" - logger.debug('Reformat output call is {}'.format(R_reform_Best)) - pr_R = sp.check_output(R_reform_Best) - except sp.CalledProcessError: - logger.warning("Reformatted output could not be created with call %s", ' '.join(R_reform_Best)) - except OSError: - logger.warning("Rscript command not available for summary output.") - - # - # Visual summary - # - if args.summary: - logger.info("Creating the Summary graphs of the results...") - summary_script = "uropa_summary.R" - summary_output = outdir + "summary.pdf" - - if len(queries) > 1 and not pr and os.path.exists(merged_outfile): - call = [ - summary_script, - "-f", - merged_outfile, - "-c", - outdir + "summary_config.json", - "-o", - summary_output, - "-b", - besthits_outfile] - else: - call = [ - summary_script, - "-f", - besthits_outfile, - "-c", - outdir + "summary_config.json", - "-o", - summary_output] - try: - logger.debug('Summary output call is {}'.format(call)) - sum_pr = sp.check_output(call) - except sp.CalledProcessError: - logger.warning("Visualized summary output could not be created from %s.", ' '.join(call)) - except OSError: - logger.warning("Rscript command not available for summary output.") - - # - # Clean up of temporary files - # - outputs_ready = os.path.exists( - allhits_outfile) and os.path.exists(besthits_outfile) - - if outputs_ready: - #logger.debug("Attempting to clean {}".format(outdir)) - ovls.cleanup(outdir, logger) - - if os.path.exists(spl_dir): - shutil.rmtree(spl_dir) - if os.path.exists(outdir+"summary_config.json"): - os.remove(outdir+"summary_config.json") - os.remove(gtf_index) # .gz - os.remove(gtf_index + ".tbi") - if len(gtf_feat) >= 1: - os.remove(gtf_cut_file) - os.remove(gtf_cut_file + ".sorted") - - logger.info("End time: %s", datetime.datetime.now().strftime("%d.%m.%Y %H:%M")) + ##### Write output files ##### + all_hits = pd.DataFrame(all_annotations) + + #Sort on peak_ids + all_hits["original_order"] = [internal_peak_ids.index(peak_id) for peak_id in all_hits["internal_peak_id"]] #map position + all_hits.sort_values(by=["original_order", "feat_start", "feat_end"]) + + #All hits + logger.debug("Writing _allhits.txt") + 
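+    #_allhits.txt carries the renamed output header; _allhits.bed holds the same columns without a header line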
all_hits.to_csv(os.path.join(output_prefix + "_allhits.txt"), na_rep="NA", sep="\t", index=False, columns=header_internal, header=header_output) + all_hits.to_csv(os.path.join(output_prefix + "_allhits.bed"), na_rep="NA", sep="\t", index=False, columns=header_internal, header=False) + + #Best hits + best_hits = all_hits.loc[all_hits["best_hit"] == 1] + best_hits.to_csv(os.path.join(output_prefix + "_finalhits.txt"), na_rep="NA", sep="\t", index=False, columns=header_internal, header=header_output) + best_hits.to_csv(os.path.join(output_prefix + "_finalhits.bed"), na_rep="NA", sep="\t", index=False, columns=header_internal, header=False) + + ##### Visual summary ##### + logger.info("Creating the Summary graphs of the results...") + summary_script = "uropa_summary.R" + summary_output = output_prefix + "_summary.pdf" + + #cmd is the command-line call str + call = [summary_script, "-f", os.path.join(output_prefix + "_finalhits.txt"), "-c", output_prefix + ".json", "-o", summary_output, "-b", os.path.join(output_prefix + "_allhits.txt"), "-a \'", cmd, "\'"] + call_str = ' '.join(call) + + try: + logger.debug('Summary output call is {}'.format(call_str)) + sum_pr = subprocess.check_output(call_str, shell=True) + except subprocess.CalledProcessError: + logger.warning("Visualized summary output could not be created from: %s", call_str) + except OSError: + logger.warning("Rscript command not available for summary output.") + + ##### Cleanup ##### + logger.info("Cleaning up temporary files.") + if args.debug == False: + for f in temp_files: + try: + os.remove(f) + except: + logger.warning("Could not remove temporary file {0}".format(f)) + + logger.info("UROPA run ended successfully!") diff --git a/uropa/utils.py b/uropa/utils.py new file mode 100644 index 0000000..46421d0 --- /dev/null +++ b/uropa/utils.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +"""Contains helper functions for UROPA """ + +import json +from textwrap import dedent +import ast +import numpy as np +import os +import sys +import re + +#Validation of config +def howtoconfig(): + """Defines the epilog that is given when help is requested.""" + epilog = dedent("""\ + UROPA is a peak annotation tool facilitating the analysis of next-generation sequencing methods such + as ChIPseq and ATACseq. The advantage of UROPA is that it can accommodate advanced structures of annotation + requirements. UROPA is developed as an open source analysis pipeline for peaks generated from standard peak callers. + + Please cite upon usage: + Kondili M, Fust A, Preussner J, Kuenne C, Braun T and Looso M. UROPA: A tool for Universal RObust Peak Annotation. + Scientific Reports 7 (2017), doi: 10.1038/s41598-017-02464-y + + Please visit http://uropa-manual.readthedocs.io/config.html for detailed information on configuration. 
+ + """) + """ + More information on how to set up the config-file is found at: + https://uropa-manual.readthedocs.io/config.html + + + The configuration file should at least contain paths for bed and GTF files: + + { + "queries": [], + "bed": "/path/to/bed/file.bed", + "gtf": "/path/to/annotation/file.gtf" + } + + Different query types can be defined using the queries key: + + { + "queries": [ + {...}, + {...}], + "bed": "/path/to/bed/file.bed", + "gtf": "/path/to/annotation/file.gtf" + } + + Optionally, the priority key can be used to fine tune UROPAs behaviour: + + { + "queries": [ + {...}, + {...}], + "bed": "/path/to/bed/file.bed", + "gtf": "/path/to/annotation/file.gtf", + "priority": "True" + } + + Please visit http://uropa-manual.readthedocs.io/config.html for detailed information on configuration. + """ + return epilog + + +def parse_json(infile): + """ Read a json file to a dict """ + with open(infile, 'r', encoding='utf-8') as f: + return ast.literal_eval(json.dumps(json.load(f))) + + +def format_config(cfg_dict): + """ Format input config to catch any errors """ + + convert_values = {value: True for value in [True, 'T', 'True', 'TRUE', 'Yes', 'YES', 'Y', 'yes']} + convert_values.update({value: False for value in [False, 'F', 'False', 'FALSE', 'No', 'NO', 'N', 'no']}) + query_prefix = "query_" + + #Set upper-level keys + cfg_dict["priority"] = convert_values[cfg_dict.get("priority", False)] + + for i, query in enumerate(cfg_dict["queries"]): + + #Feature + if type(query.get("feature", [])) != list: + query["feature"] = [query["feature"]] + + if type(query.get("feature_anchor", [])) != list: + query["feature_anchor"] = [query["feature_anchor"]] + + if type(query.get("attribute_values", [])) != list: + query["attribute_values"] = [query["attribute_values"]] + + #Internals + internalval = query.get("internals", False) + query["internals"] = float(convert_values.get(internalval, internalval)) #if internals is a float, the value will not be converted + + if "distance" in query: + if type(query["distance"]) == list: + query["distance"] = [int(query["distance"][0]), int(query["distance"][1])] + else: + query["distance"] = [int(query["distance"]), int(query["distance"])] #same distance in both positions + + #Name the query if it was not already named + if "name" not in query: + query["name"] = query_prefix + str(i + 1) + + return(cfg_dict) + +def config_string(cfg_dict): + """ Pretty-print cfg_dict with one-line queries """ + + upper_level = ["queries", "show_attributes", "priority", "gtf", "bed"] + query_level = ["feature", "feature_anchor", "distance", "strand", "relative_location", "filter_attribute", "attribute_values", "internals", "name"] + + upper_lines = [] + for upper_key in upper_level: + upper_str = "\"{0}\":".format(upper_key) + + if upper_key == "queries": + + queries = [] + for query in cfg_dict[upper_key]: + + one_query = [] + for query_key in query_level: + query_string = "" + + if query_key in query: + value = None + if type(query[query_key]) == list: + if len(query[query_key]) > 0: + value = "[" + ",".join(["\"{0}\"".format(value) for value in query[query_key]]) + "]" + else: + if query[query_key] != None: + value = "\"{0}\"".format(query[query_key]) + + if value is not None: + query_string += "\"{0}\": {1}".format(query_key, value) + one_query.append(query_string) + + querystr = " {" + ", ".join(one_query) + "}" + queries.append(querystr) + + upper_str += "[\n" + ",\n".join(queries) + "\n]" + + elif upper_key == "show_attributes": + upper_str += "[" + 
",".join(["\"{0}\"".format(value) for value in cfg_dict[upper_key]]) + "]" + + else: + upper_str += "\"{0}\"".format(cfg_dict[upper_key]) + + upper_lines.append(upper_str ) + + fstr = "{\n" + ",\n".join(upper_lines) + "\n}\n" + + return(fstr) + +def check_file_access(fil, logger): + """ Check if file exists and can be read """ + + if os.path.isfile(fil): + if not os.access(fil, os.R_OK): + logger.error("File %s could not be read.", fil) + sys.exit() + else: + logger.error("File %s does not exist.", fil) + sys.exit() + +def check_bed_format(bedfile, logger): + """ Checks whether bedfile has the correct format """ + + with open(bedfile) as f: + for i, line in enumerate(f): + if not re.match("(\S)+\s+([0-9]+)\s+([0-9]+)(\s+\S+)?(\s+[0-9,.]+)?(\s+[.\-+])?", line): + logger.error("Line {0} in {1} is not proper bed format: {2}".format(i+1, bedfile, line)) + sys.exit() + + #todo: also check the number of columns in file + +def parse_bedfile(bedfile, gtf_has_chr): + + peaks = [] + with open(bedfile) as f: + for i, line in enumerate(f): + columns = line.rstrip().split() + + #bed6-columns + chrom, start, end = columns[0], int(columns[1]), int(columns[2]) + name = columns[3] if len(columns) > 3 else "peak_{0}".format(i+1) + score = columns[4] if len(columns) > 4 else "." + strand = columns[5] if len(columns) > 5 else "." + + #Additional columns + if len(columns) > 6: + additional = columns[6:] + additional_header = ["custom_" + str(val) for val in range(1,len(additional)+1)] + else: + additional_header = [] + additional = [] + + #Key for the matching gtf chromosome + if gtf_has_chr == True: + gtf_chr = "chr" + chrom if "chr" not in chrom else chrom + + peak_dict = {"gtf_chr": gtf_chr, "peak_chr":chrom, "peak_start":start, "peak_end":end, "peak_id":name, "peak_score":score, "strand":strand, "internal_peak_id": "peak_{0}".format(i+1)} + peak_dict.update(dict(zip(additional_header, additional))) + peaks.append(peak_dict) + + return(peaks) + +def check_chr(file, lines=100): + """Checks if a file has lines starting with 'chr'.""" + + chrom = False + count = 0 + with open(file, "r") as f: + for line in f: + if not line.startswith("#"): + + if line.startswith("chr"): + chrom = True + break + + count += 1 + if count == lines: + break + + return(chrom) + +def subset_gtf(gtf, features, sub_gtf): + """Removes lines with features not in features from a gtf file.""" + feat2cut = np.unique(features) + + with open(gtf, "r") as f: + lines = f.readlines() + gtf_query_feat = [line for line in lines if line.startswith("#") or line.split("\t")[2] in feat2cut] + + # List of lines containing only selected features from query + with open(sub_gtf, "w") as f: + f.write("".join(gtf_query_feat)) + + diff --git a/utils/uropa_summary.R b/utils/uropa_summary.R index 800b469..701807c 100755 --- a/utils/uropa_summary.R +++ b/utils/uropa_summary.R @@ -21,7 +21,8 @@ options = matrix(c( 'finalhits', 'f', 1, 'character', 'file containing the final hits from UROPA.', 'config', 'c', 1, 'character', 'file containing the json formatted configuration from the UROPA run.', 'output', 'o', 2, 'character', 'file name of output file [summary.pdf].', - 'besthits', 'b', 2, 'character', 'file containing the best hits from UROPA.', + 'allhits', 'b', 2, 'character', 'file containing all hits from UROPA.', + 'call', 'a', 2, 'character', 'original command line call.', 'help', 'h', 0, 'logical','Provides command line help.' 
), byrow=TRUE, ncol=5) opt = getopt(options) @@ -45,6 +46,7 @@ if (is.null(opt$output)) { num.features <- 0 features <- c() + # reformat every row of the config.query file to string for cover page .print.query <- function(row){ r <- paste(unname(as.vector(unlist(row))), collapse=" ") @@ -68,12 +70,12 @@ features <- c() feat <- features[f] df.feature <- subset(df.uropa, df.uropa$feature==feat) - unique.loci <- sort(unique(df.feature$genomic_location)) + unique.loci <- sort(unique(df.feature$relative_location)) occurence.loci <- c() for(j in 1:length(unique.loci)){ loci <- as.character(unique.loci[j]) - occurence <- as.numeric(length(grep(loci, df.feature$genomic_location))) + occurence <- as.numeric(length(grep(loci, df.feature$relative_location))) occurence.loci <- c(occurence.loci, occurence) } df.pie.full <- data.frame(location=unique.loci, value=occurence.loci) @@ -138,7 +140,7 @@ features <- c() df.uropa.final[,"distance"] <- as.numeric(df.uropa.final[,"distance"]) df.uropa.final <- df.uropa.final[complete.cases(df.uropa.final),] if(nrow(df.uropa.final)==0){ - mtext("UROPA summary", side=3, line=0,outer=FALSE, cex=2) + mtext("UROPA summary", side=3, line=-3,outer=FALSE, cex=2) mtext("No valid peak annotations with specified query/queries, summary unfeasible!", line=-5) invisible(dev.off()) stop("No valid peak annotations with specified query/queries, summary unfeasible!") @@ -147,14 +149,16 @@ features <- c() # get infos from config for overview page config <- fromJSON(conf) config.query <- as.data.frame(config$queries) - config.query$feature <- substring(config.query$feature, 2) - config.query$feature <- gsub(pattern='\\(|\\)|\\"',"",config.query$feature) + config.query$feature <- sapply(config.query$feature, paste, collapse=",") config.query$feature <- gsub(pattern=',',"\n",config.query$feature) - config.query$query <- 0:(nrow(config.query)-1) + config.query$relative_location <- sapply(config.query$relative_location, paste, collapse=", ") + config.query$distance <- sapply(config.query$distance, paste, collapse=",") + config.query$relative_location <- gsub(pattern='PeakInsideFeature, FeatureInsidePeak, Upstream, Downstream, OverlapStart, OverlapEnd',"any",config.query$relative_location) + config.query$relative_location <- gsub(pattern=', ',"\n",config.query$relative_location) + config.query[is.null(config.query)] <- NA config.cols <- colnames(config.query) priority <- config$priority - - + show_attributes <- as.character(paste(config$show_attributes, collapse=", ")) # specify y limit for plots y.lim <- round(median(df.uropa.final[,"distance"]) + (max(df.uropa.final[,"distance"])/15)) if(y.lim > max(df.uropa.final[,"distance"]) || y.lim > 10000){ @@ -167,9 +171,9 @@ features <- c() } # expand multiple valid annotations to one row each - df.uropa.final <- separate_rows(df.uropa.final, query) # queries of uropa annotation run - num.queries <- max(config.query$query) - queries <- sprintf("%02d", config.query$query) + df.uropa.final <- separate_rows(df.uropa.final, name) # queries of uropa annotation run + num.queries <- length(config.query$name) + #queries <- sprintf("%02d", config.query$name) features <<- as.character(unique(df.uropa.final$feature)) num.features <<- length(features) @@ -177,32 +181,37 @@ features <- c() priority <- paste0(priority, "\nNote: No pairwise comparisons and Chow Ruskey plot (no overlaps)") } # add query to query data frame - config.query$query <- paste0("query",sprintf("%02d",0:(nrow(config.query)-1))) - config.query <- config.query[,c("query", "feature", 
"distance", "feature.anchor", "internals", "direction", - "filter.attribute", "attribute.value", "show.attributes")] - # replaye "start,center,end" position by "any_pos" - config.query$feature.anchor <- sapply(config.query$feature.anchor, function(x) if(length(x)==3){return("any_pos")}else{return(x)}) - mtext("UROPA summary", side=3, line=0,outer=FALSE, cex=2) - mtext(paste0("There were ", num.peaks, " peaks in the input bed file,\nUROPA annotated ", anno.peaks, " peaks\n"), - side=3, line=-3,outer=FALSE, cex=.7) + #config.query$name <- paste0("query",sprintf("%02d",0:(nrow(config.query)-1))) + #config.query <- config.query[,c("name", "feature", "distance", "feature_anchor", "internals", "direction", + # "filter_attribute", "attribute_value")] + # replaye "start,center,end" position by "any" + config.query$feature_anchor <- sapply(config.query$feature_anchor, function(x) if(length(x)==3){return("any")}else{return(x)}) + + mtext("UROPA summary", side=3, line=3.3,outer=FALSE, cex=1,col="red") + + if(!is.null(opt$call)){ + mtext(paste0("UROPA command line call:\n", paste(strwrap(opt$call, width= 1.9 * getOption("width")), collapse="\n")), + side=3, line=1.5,cex=.5) + } + + mtext(paste0("There were ", num.peaks, " peaks in the input bed file, UROPA annotated ", anno.peaks, " peaks\n"), + side=3, line=0,outer=FALSE, cex=.7) if(num.queries != nrow(config.query)){ - mtext(paste("Only queries", paste(queries, collapse=","), "are represented in the FinalHits", sep=" "), - side=3, line=-5,outer=FALSE, cex=.7) + mtext(paste("Only queries", paste(queries, collapse=","), "are represented in the finalhits", sep=" "), + side=3, line=-0.5,outer=FALSE, cex=.7) } - - mytheme <- ttheme_default(core = list(fg_params=list(cex = 0.5)),colhead = list(fg_params=list(cex = 0.5)), rowhead = list(fg_params=list(cex = 0.5))) config.query <- data.frame(lapply(config.query, as.character), stringsAsFactors=FALSE) g <- tableGrob(format(config.query), theme=mytheme,rows=NULL) grid.draw(g) - - mtext(paste0("priority: ", priority), cex=.7,side=1, line=-2) - input <- paste("Input:",unlist(config$bed),collapse=" ") - anno <- paste("Anno:",unlist(config$gtf),collapse=" ") + mtext(paste0("priority: ", priority), cex=.5,side=1, line=2) + mtext(paste0("show_attributes: ", show_attributes), cex=.5,side=1, line=2.6) + input <- paste("Input peak file:",unlist(config$bed),collapse=" ") + anno <- paste("Annotation file:",unlist(config$gtf),collapse=" ") input.anno <- paste(input,anno,sep="\n") - mtext(input.anno,cex=.6, side=1, line=0) + mtext(input.anno,cex=.7, side=1, line=4) # plot 1 # description @@ -211,7 +220,7 @@ features <- c() "Additional Info:\nThis is independent of the number of queries,\nall features present in the finalhits are displayed.", "\n\n\nNote on output files:\n", "\nallhits: All candidate features resulting from any query\n(1 peak : x queries : y annotations)", - "\n\nbesthits: All candidate features resulting from any query\n(1 peak : x queries : 1 annotation)", + "\n\nallhits: All candidate features resulting from any query\n(1 peak : x queries : 1 annotation)", "\n\nfinalhits: Only the one best feature among all queries\n(1 peak : 1 query : 1 annotation)") grid.newpage() mtext(plot1, cex=1, adj=0, padj=1) @@ -246,16 +255,17 @@ features <- c() mtext(plot3, cex=1, adj=0, padj=1) .plot.feature.distribution(df.uropa=df.uropa.final,header="Feature distribution across finalhits") } + return(df.uropa.final) } # arg 1 = finalhits # arg 2 = summery config # arg 3 = output pdf -# arg 4 = besthits +# arg 4 = allhits ## 
basic summary -- only one query definded -if (is.null(opt$besthits)) { +if (is.null(opt$allhits)) { df.uropa.final <- .basic.summary(opt$finalhits, opt$config, opt$output) invisible(dev.off()) } else @@ -265,29 +275,28 @@ if (is.null(opt$besthits)) { ## increased summary -- there is more than one query definded df.uropa.final <- .basic.summary(opt$finalhits, opt$config, opt$output) - df.uropa.best.per.query <- read.table(opt$besthits, header=TRUE, sep="\t",stringsAsFactors = FALSE) + df.uropa.allhits <- read.table(opt$allhits, header=TRUE, sep="\t",stringsAsFactors = FALSE) num.peaks <- length(unique(df.uropa.final$peak_id)) - df.uropa.best.per.query[,"distance"] <- as.numeric(df.uropa.best.per.query[,"distance"]) - df.uropa.best.per.query <- df.uropa.best.per.query[complete.cases(df.uropa.best.per.query),] - features <<- as.character(unique(df.uropa.best.per.query$feature)) + df.uropa.allhits[,"distance"] <- as.numeric(df.uropa.allhits[,"distance"]) + df.uropa.allhits <- df.uropa.allhits[complete.cases(df.uropa.allhits),] + features <<- as.character(unique(df.uropa.allhits$feature)) num.features <<- length(features) - queries <- sort.int(as.numeric(unique(df.uropa.best.per.query$query))) - queries <- sprintf("%02d", queries) + queries <- unique(df.uropa.allhits$name) + #queries <- sprintf("%02d", queries) num.queries <- length(queries) - # plot 4 # description - plot4 <- paste0("4. Distances of annotated peaks seperated for features and queries in besthits:", + plot4 <- paste0("4. Distances of annotated peaks seperated for features and queries in allhits:", "\n\nThe distribution of the distances per feature per query", - "\nis displayed in histograms based on the besthits.", + "\nis displayed in histograms based on the allhits.", "\n\nAdditional Info:\nThis is dependent on the number of queries;", "\nall features present in any query are displayed.") grid.newpage() mtext(plot4, cex=1, adj=0, padj=1) # plot # get max distance to calculate binwidth - dist <- df.uropa.best.per.query[,"distance"] + dist <- df.uropa.allhits[,"distance"] median.uropa.best <- median(dist) max.uropa.best <- max(dist) considered.distance <- median.uropa.best + round(max.uropa.best/15) @@ -299,42 +308,43 @@ if (is.null(opt$besthits)) { } } - df.distance.query <- subset(df.uropa.best.per.query[,c("feature","distance","query")], - (df.uropa.best.per.query[,"distance"] < considered.distance)) + + df.distance.query <- subset(df.uropa.allhits[,c("feature","distance","name")], + (df.uropa.allhits[,"distance"] < considered.distance)) max.distance.query <- round(max(as.numeric(df.distance.query[,"distance"]))) bin.width <- round(max.distance.query/20) - dpq <- qplot(df.distance.query[,2],data =df.distance.query, facets=query~feature, + dpq <- qplot(df.distance.query[,2],data =df.distance.query, facets=name~feature, geom="histogram", binwidth=bin.width, xlab = "Distance to feature", ylab = "Total count") - print(dpq + ggtitle("Distance of query vs. feature across besthits Hits")) + print(dpq + ggtitle("Distance of query vs. feature across allhits")) # plot 5 # description - plot5 <- paste0("5. Relative locations of annotated peaks in besthits:", - "\n\nThe following pie chart(s) illustrate the relativ location of\nthe peaks in relation the respective annotated feature as represented in the besthits.\n\n", - "Additional Info:\nThis is dependent on the number of queries;\nall features present in the besthits are displayed.") + plot5 <- paste0("5. 
Relative locations of annotated peaks in allhits:", + "\n\nThe following pie chart(s) illustrate the relativ location of\nthe peaks in relation the respective annotated feature as represented in the allhits.\n\n", + "Additional Info:\nThis is dependent on the number of queries;\nall features present in the allhits are displayed.") grid.newpage() mtext(plot5, cex=1, adj=0, padj=1) # plot - .plot.genomic.location.per.feature(df.uropa.best.per.query, "besthits Hits") + .plot.genomic.location.per.feature(df.uropa.allhits, "allhits") # plot 6 if(num.features > 1){ - plot6 <- paste0("6. Allocation of available featurs in besthits:", + plot6 <- paste0("6. Allocation of available featurs in allhits:", "\n\nBar plot displaying the occurrence of the different features if there is more", - "\nthan one feature assigned for peak annotation based on the besthits.", + "\nthan one feature assigned for peak annotation based on the allhits.", "\nThe best annotation found in each query is used for this plot", - "\n\nAdditional Info:\nThis is independent of the number of queries,\nall features present in the besthits are displayed.", + "\n\nAdditional Info:\nThis is independent of the number of queries,\nall features present in the allhits are displayed.", "\n\nIf only one feature is present,", "\nthis plot will be skipped.") grid.newpage() mtext(plot6, cex=1, adj=0, padj=1) - .plot.feature.distribution(df.uropa=df.uropa.best.per.query,header="Feature distribution across besthits Hits") + .plot.feature.distribution(df.uropa=df.uropa.allhits,header="Feature distribution across allhits") } # plot 7 # description plot7 <- paste0("7: Pairwise comparisons of query annotations:", - "\n\nThe following venn diagrams display peak based pairwise comparisons\namong all queries based on the besthits.", + "\n\nThe following venn diagrams display peak based pairwise comparisons\namong all queries based on the allhits.", "\n\nAdditional Info:\nIf only one query is present,", "\nthis plot will be skipped.") grid.newpage() @@ -342,19 +352,20 @@ if (is.null(opt$besthits)) { # get annotated peaks unique for each query peaks.per.query <- list() for(q in 1:num.queries){ - peaks.per.query[[q]] <- unique(df.uropa.best.per.query[as.numeric(df.uropa.best.per.query$query)==as.numeric(queries[q]),"peak_id"]) - names(peaks.per.query)[[q]] <- paste0("query",queries[q]) + peaks.per.query[[q]] <- unique(df.uropa.allhits[df.uropa.allhits$name==queries[q],"peak_id"]) + names(peaks.per.query)[[q]] <- queries[q] #paste0("query",q) } # number of pairwise compares: (n-1)n/2 with n = # queries # iterater for pairwise compare tmp.num.query <- 1 - all.peaks <- unique(df.uropa.best.per.query$peak_id) + all.peaks <- unique(df.uropa.allhits$peak_id) for(q in 1:(num.queries-1)){ - initial.query <- peaks.per.query[[paste0("query",queries[q])]] + #initial.query <- peaks.per.query[[paste0("query",queries[q])]] + initial.query <- peaks.per.query[[queries[q]]] if(tmp.num.query < num.queries){ for(c in num.queries:tmp.num.query){ if(queries[q] != queries[c]){ - compare.query <- peaks.per.query[[paste0("query",queries[c])]] + compare.query <- peaks.per.query[[queries[c]]] venn.object <- venn(list(initial.query, compare.query)) pushViewport(viewport(x=.5, y=0.5, width=.72, height=.72)) grid.rect(gp=gpar(fill="white",lty = "blank"),width = unit(1, "npc"), height = unit(1, "npc")) @@ -368,12 +379,12 @@ if (is.null(opt$besthits)) { dist <- c(0.02, -0.02, -0.02) } venn.plot <- draw.triple.venn(num.peaks, anno.initial.query, anno.compare.query, anno.initial.query, 
matches, - anno.compare.query, matches, category =c("all",paste0("query",queries[q]),paste0("query",queries[c])), + anno.compare.query, matches, category =c("all",queries[q],queries[c]), lwd = 2, lty ="solid", col = c("black", "red","blue"), fill = c("white","red","blue"), cex=1, cat.pos = c(0,0,0), cat.dist = dist, reverse = FALSE,cat.cex =1, cat.default.pos= "outer", alpha = c(0,.4,.4), euler.d=TRUE, scaled=TRUE) grid.draw(venn.plot) - mtext(paste0("Peak based pairwise compare of ", paste0("query",queries[q]), " and ", paste0("query",queries[c])), side=3, line=0,outer=FALSE, cex=1) + mtext(paste0("Peak based pairwise compare of ", queries[q], " and ", queries[c]), side=3, line=0,outer=FALSE, cex=1) popViewport() } } @@ -389,7 +400,7 @@ if (is.null(opt$besthits)) { suppressPackageStartupMessages(library(Vennerable)) tryCatch({ plot8 <- paste0("8: Comparison of all specified queries:", - "\n\nThe following Chow Ruskey plot compares all queries\nbased on the besthits.", + "\n\nThe following Chow Ruskey plot compares all queries\nbased on the allhits.", "\nIt represents an area-proportional Venn diagram,\nrevealing the distribution of peaks that could be annotated\nper query and works for up to 5 queries.", "\n\nAdditional Info:\nIt is evalueated whether peaks are annotated,\nbut not whether they are annotated for the same feature") grid.newpage() @@ -440,14 +451,16 @@ if (is.null(opt$besthits)) { mtext("Chow Ruskey comparison of all peaks annotated with UROPA", side=3, line=0,outer=FALSE, cex=1) }, error = function(e){ cat("\nChowRuskey plot was invalid do upsetR plot\n") - suppressPackageStartupMessages(library(UpSetR,quietly = TRUE)) - upset(fromList(peaks.per.query), keep.order=TRUE,nintersects = NA,empty.intersections = "on",number.angles=35,mainbar.y.label = "Peak Intersections", sets.x.label = "Annotated Genes") + suppressPackageStartupMessages(library(UpSetR,quietly = TRUE)) + dt <- fromList(peaks.per.query) + upset(dt, sets=names(peaks.per.query), number.angles=15, mainbar.y.label = "Peak Intersections", sets.x.label = "Annotated Genes") #nsets=(ncol(combn(num.queries,2))+num.queries) }) } } else { suppressPackageStartupMessages(library(UpSetR,quietly = TRUE)) - upset(fromList(peaks.per.query), keep.order=TRUE,nintersects = NA,empty.intersections = "on",number.angles=35,mainbar.y.label = "Peak Intersections", sets.x.label = "Annotated Genes") + dt <- fromList(peaks.per.query) + upset(dt, sets=names(peaks.per.query), number.angles=15, mainbar.y.label = "Peak Intersections", sets.x.label = "Annotated Genes") } invisible(dev.off())
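
For reference, the helpers introduced in uropa/utils.py above can also be composed on their own. The following is a minimal sketch, not part of the patch: "my_config.json" is a hypothetical file containing the upper-level keys "queries", "show_attributes", "priority", "gtf" and "bed" described in the epilog of howtoconfig().

from uropa.utils import parse_json, format_config, config_string

cfg = parse_json("my_config.json")   # read the JSON configuration into a plain dict
cfg = format_config(cfg)             # normalise values, e.g. "distance": 1000 -> [1000, 1000], and name unnamed queries "query_1", ...
print(config_string(cfg))            # one-line-per-query representation, the same format uropa.py writes to <prefix>.json

The normalised dict produced this way is the same cfg_dict that main() passes to annotate_peaks() for each peak chunk during the annotation step.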