From c452934c1ff6a68b5621b4a98d44522d7ddc3385 Mon Sep 17 00:00:00 2001 From: anastasiia Date: Mon, 7 Jan 2019 14:42:18 +0100 Subject: [PATCH] fixing the bug with overlaps + adding a merging function if the footprints have small ammount of bp in between --- .../footprints_extraction.py | 57 +++++++++++++++++-- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/bin/1.1_footprint_extraction/footprints_extraction.py b/bin/1.1_footprint_extraction/footprints_extraction.py index e636c91..bcec148 100644 --- a/bin/1.1_footprint_extraction/footprints_extraction.py +++ b/bin/1.1_footprint_extraction/footprints_extraction.py @@ -273,10 +273,59 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe return peak_footprints, footprint_count -def check_for_overlap_and_merge(peak_footprints): +def check_and_merge(peak_footprints): + peak_footprints_new = {} + for footprint_to_check in peak_footprints.keys(): - print(footprint_to_check, peak_footprints[footprint_to_check]) - return peak_footprints + start_to_check = peak_footprints[footprint_to_check]['start'] + end_to_check = peak_footprints[footprint_to_check]['end'] + + #if not peak_footprints_new: #the dictionary is empty, save the first footprint + # peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, []) + # peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check] + #print(peak_footprints_new.keys()) + + merge_footprints_left = None + + for compared_footprint in peak_footprints.keys(): + + if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] <= 5: #5 is hardcoded !!!!!! :( + #make compared_footprint longer + print("MERGE", footprint_to_check, compared_footprint) + merge_footprints_left = False + break + elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check <= 5: + #make footprint_to_check longer + print("MERGE", footprint_to_check, compared_footprint) + merge_footprints_left = True + break + + if merge_footprints_left: #if it is true + print("merging left") + if start_to_check < peak_footprints[compared_footprint]['start'] and compared_footprint in peak_footprints_new.keys(): + print("make longer") + print("before merging" + str(peak_footprints_new[compared_footprint])) + peak_footprints_new[compared_footprint]['start'] = start_to_check + peak_footprints_new[compared_footprint]['len'] = peak_footprints[compared_footprint]['end'] - start_to_check + peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2 + print(peak_footprints_new[compared_footprint]) + + elif merge_footprints_left == False: + print("merging right") + if end_to_check > peak_footprints[compared_footprint]['end'] and compared_footprint in peak_footprints_new.keys(): + print("make longer") + print("before merging" + str(peak_footprints_new[compared_footprint])) + peak_footprints_new[compared_footprint]['end'] = end_to_check + peak_footprints_new[compared_footprint]['len'] = end_to_check - peak_footprints[compared_footprint]['end'] + peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2 + print(peak_footprints_new[compared_footprint]) + else: + print("saving " + footprint_to_check) + peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, []) + peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check] + + #sys.exit() + return peak_footprints_new #this function uses the information provided from the .bed file to look for footprints within the peaks of interest #as input the information from the original bed file, as well as bigwig file is needed @@ -307,7 +356,7 @@ def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage) peak_footprints, footprint_count = search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary[header], step, percentage) #double check for overlaps and possibly merging of footprints having up to 5 bp in between - peak_footprints = check_for_overlap_and_merge(peak_footprints) + peak_footprints = check_and_merge(peak_footprints) for footprint_name in peak_footprints.keys(): all_footprints[footprint_name] = all_footprints.get(footprint_name, {})