From aa1e9f0f761196aa1b665710a900e9f3225e546f Mon Sep 17 00:00:00 2001 From: anastasiia Date: Mon, 7 Jan 2019 12:47:11 +0100 Subject: [PATCH 1/4] fixing the bug with footprint length --- .../footprints_extraction.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/bin/1.1_footprint_extraction/footprints_extraction.py b/bin/1.1_footprint_extraction/footprints_extraction.py index fae040e..e636c91 100644 --- a/bin/1.1_footprint_extraction/footprints_extraction.py +++ b/bin/1.1_footprint_extraction/footprints_extraction.py @@ -154,6 +154,7 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom old_start = peak_footprints[existing_footprint_name]['start'] old_end = peak_footprints[existing_footprint_name]['end'] old_score = peak_footprints[existing_footprint_name]['score'] + old_length = peak_footprints[existing_footprint_name]['len'] if footprint_start >= old_start and footprint_start <= old_end: #the start of the new footprint is between the start and end of an old footprint if footprint_end > old_end: #the new footprint is not completely inside the old one @@ -163,19 +164,21 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom peak_footprints[existing_footprint_name]['end'] = footprint_end peak_footprints[existing_footprint_name]['score'] = footprint_score + peak_footprints[existing_footprint_name]['len'] = footprint_end - old_start #we can not update the max_pos as we do not have the information about scores array of the existing footprint #else: the new footprint is completely inside the old one, do nothing save_current_footprint = False break elif footprint_end >= old_start and footprint_end <= old_end: #the end of the new footprint is between the start and end of an old footprint - if footprint_start < old_start: #the new footprint is not completely inside the old one + if footprint_start < old_start: #the new footprint is not completely inside the old one #update the information about the existing footprint #find the average of both scores footprint_score = (peak_footprints[existing_footprint_name]['score'] + footprint_score) / 2 peak_footprints[existing_footprint_name]['start'] = footprint_start peak_footprints[existing_footprint_name]['score'] = footprint_score + peak_footprints[existing_footprint_name]['len'] = old_end - footprint_start #else do nothing save_current_footprint = False break @@ -270,6 +273,11 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe return peak_footprints, footprint_count +def check_for_overlap_and_merge(peak_footprints): + for footprint_to_check in peak_footprints.keys(): + print(footprint_to_check, peak_footprints[footprint_to_check]) + return peak_footprints + #this function uses the information provided from the .bed file to look for footprints within the peaks of interest #as input the information from the original bed file, as well as bigwig file is needed #the optional parameters window_length, step and percentage are needed as well to use the sliding window algorithm and work with the "background" score @@ -295,9 +303,12 @@ def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage) peak_end = int(positions[1]) scores_in_peak = np.nan_to_num(np.array(list(bw_open.values(chromosom, peak_start, peak_end)))) #save the scores to an array - + print() peak_footprints, footprint_count = search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary[header], step, percentage) + #double check for overlaps and possibly merging of footprints having up to 5 bp in between + peak_footprints = check_for_overlap_and_merge(peak_footprints) + for footprint_name in peak_footprints.keys(): all_footprints[footprint_name] = all_footprints.get(footprint_name, {}) all_footprints[footprint_name] = peak_footprints[footprint_name] From c452934c1ff6a68b5621b4a98d44522d7ddc3385 Mon Sep 17 00:00:00 2001 From: anastasiia Date: Mon, 7 Jan 2019 14:42:18 +0100 Subject: [PATCH 2/4] fixing the bug with overlaps + adding a merging function if the footprints have small ammount of bp in between --- .../footprints_extraction.py | 57 +++++++++++++++++-- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/bin/1.1_footprint_extraction/footprints_extraction.py b/bin/1.1_footprint_extraction/footprints_extraction.py index e636c91..bcec148 100644 --- a/bin/1.1_footprint_extraction/footprints_extraction.py +++ b/bin/1.1_footprint_extraction/footprints_extraction.py @@ -273,10 +273,59 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe return peak_footprints, footprint_count -def check_for_overlap_and_merge(peak_footprints): +def check_and_merge(peak_footprints): + peak_footprints_new = {} + for footprint_to_check in peak_footprints.keys(): - print(footprint_to_check, peak_footprints[footprint_to_check]) - return peak_footprints + start_to_check = peak_footprints[footprint_to_check]['start'] + end_to_check = peak_footprints[footprint_to_check]['end'] + + #if not peak_footprints_new: #the dictionary is empty, save the first footprint + # peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, []) + # peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check] + #print(peak_footprints_new.keys()) + + merge_footprints_left = None + + for compared_footprint in peak_footprints.keys(): + + if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] <= 5: #5 is hardcoded !!!!!! :( + #make compared_footprint longer + print("MERGE", footprint_to_check, compared_footprint) + merge_footprints_left = False + break + elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check <= 5: + #make footprint_to_check longer + print("MERGE", footprint_to_check, compared_footprint) + merge_footprints_left = True + break + + if merge_footprints_left: #if it is true + print("merging left") + if start_to_check < peak_footprints[compared_footprint]['start'] and compared_footprint in peak_footprints_new.keys(): + print("make longer") + print("before merging" + str(peak_footprints_new[compared_footprint])) + peak_footprints_new[compared_footprint]['start'] = start_to_check + peak_footprints_new[compared_footprint]['len'] = peak_footprints[compared_footprint]['end'] - start_to_check + peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2 + print(peak_footprints_new[compared_footprint]) + + elif merge_footprints_left == False: + print("merging right") + if end_to_check > peak_footprints[compared_footprint]['end'] and compared_footprint in peak_footprints_new.keys(): + print("make longer") + print("before merging" + str(peak_footprints_new[compared_footprint])) + peak_footprints_new[compared_footprint]['end'] = end_to_check + peak_footprints_new[compared_footprint]['len'] = end_to_check - peak_footprints[compared_footprint]['end'] + peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2 + print(peak_footprints_new[compared_footprint]) + else: + print("saving " + footprint_to_check) + peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, []) + peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check] + + #sys.exit() + return peak_footprints_new #this function uses the information provided from the .bed file to look for footprints within the peaks of interest #as input the information from the original bed file, as well as bigwig file is needed @@ -307,7 +356,7 @@ def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage) peak_footprints, footprint_count = search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary[header], step, percentage) #double check for overlaps and possibly merging of footprints having up to 5 bp in between - peak_footprints = check_for_overlap_and_merge(peak_footprints) + peak_footprints = check_and_merge(peak_footprints) for footprint_name in peak_footprints.keys(): all_footprints[footprint_name] = all_footprints.get(footprint_name, {}) From 7a5d6df58f7147b7d8aedd8cea9c2d9435db1cc2 Mon Sep 17 00:00:00 2001 From: anastasiia Date: Mon, 7 Jan 2019 15:11:08 +0100 Subject: [PATCH 3/4] allowing user to set the parameter for max bp allowed in between the footprints; also adding some comments --- .../footprints_extraction.py | 53 +++++++++---------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/bin/1.1_footprint_extraction/footprints_extraction.py b/bin/1.1_footprint_extraction/footprints_extraction.py index bcec148..3e94956 100644 --- a/bin/1.1_footprint_extraction/footprints_extraction.py +++ b/bin/1.1_footprint_extraction/footprints_extraction.py @@ -43,6 +43,7 @@ def parse_args(): parser.add_argument('--window_length', default=200, type=int, help='Please enter the length for a window, by defauld 200 bp.') parser.add_argument('--step', default=100, type=int, help='Please enter a step to move the window, by default 100 bp.') parser.add_argument('--percentage', default=0, type=int, help='Please enter a percentage to be added to background while searching for footprints, by default 0%%.') + parser.add_argument('--max_bp_between', default=6, type=int, help='Please enter the number of bp allowed to be in between two footprints, by default 6 bp.') parser.add_argument('--silent', action='store_true', help='While working with data write the information only into ./footprints_extraction.log.') args = parser.parse_args() @@ -273,65 +274,62 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe return peak_footprints, footprint_count -def check_and_merge(peak_footprints): +#this function checks if there are footprints with a small amount of bp in between, and if so, merges them together +#if at this point there are still overlaps of footprints, this function will delete them +#the input parameter are: dictionary with footprints within one peak, and the max number of bp allowed to be in between the footprints +#the output is the renewed dictionary containing only the best footprints for the output file +def check_and_merge(peak_footprints, max_bp_between): peak_footprints_new = {} for footprint_to_check in peak_footprints.keys(): start_to_check = peak_footprints[footprint_to_check]['start'] end_to_check = peak_footprints[footprint_to_check]['end'] - #if not peak_footprints_new: #the dictionary is empty, save the first footprint - # peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, []) - # peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check] - #print(peak_footprints_new.keys()) - merge_footprints_left = None for compared_footprint in peak_footprints.keys(): - if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] <= 5: #5 is hardcoded !!!!!! :( + if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] < max_bp_between: #make compared_footprint longer - print("MERGE", footprint_to_check, compared_footprint) merge_footprints_left = False break - elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check <= 5: + elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check < max_bp_between: #make footprint_to_check longer - print("MERGE", footprint_to_check, compared_footprint) merge_footprints_left = True break - if merge_footprints_left: #if it is true - print("merging left") - if start_to_check < peak_footprints[compared_footprint]['start'] and compared_footprint in peak_footprints_new.keys(): - print("make longer") - print("before merging" + str(peak_footprints_new[compared_footprint])) + if merge_footprints_left: #if the merging left is enabled + #check if this footprint can be merged with the compared_footprint + #if compared footprint is not in peak_footprint_new.keys(), the next loop will check for this footprint. There is no need for doulbe check now + if start_to_check < peak_footprints[compared_footprint]['start'] and compared_footprint in peak_footprints_new.keys(): + #update the start position peak_footprints_new[compared_footprint]['start'] = start_to_check + #update the length peak_footprints_new[compared_footprint]['len'] = peak_footprints[compared_footprint]['end'] - start_to_check + #update the score peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2 - print(peak_footprints_new[compared_footprint]) - elif merge_footprints_left == False: - print("merging right") + elif merge_footprints_left == False: #otherwise merge right + #check if the merging is possible if end_to_check > peak_footprints[compared_footprint]['end'] and compared_footprint in peak_footprints_new.keys(): - print("make longer") - print("before merging" + str(peak_footprints_new[compared_footprint])) + #update the end position peak_footprints_new[compared_footprint]['end'] = end_to_check + #update the length peak_footprints_new[compared_footprint]['len'] = end_to_check - peak_footprints[compared_footprint]['end'] + #update the score peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2 - print(peak_footprints_new[compared_footprint]) - else: - print("saving " + footprint_to_check) + + else: #save the current footprint, as it should not be merged peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, []) peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check] - #sys.exit() return peak_footprints_new #this function uses the information provided from the .bed file to look for footprints within the peaks of interest #as input the information from the original bed file, as well as bigwig file is needed #the optional parameters window_length, step and percentage are needed as well to use the sliding window algorithm and work with the "background" score #the output of this function is a dictionary contains all the found footprints ready to write to the output file -def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage): +def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage, max_bp_between): logger.info('Looking for footprints within peaks...') @@ -352,11 +350,10 @@ def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage) peak_end = int(positions[1]) scores_in_peak = np.nan_to_num(np.array(list(bw_open.values(chromosom, peak_start, peak_end)))) #save the scores to an array - print() peak_footprints, footprint_count = search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary[header], step, percentage) #double check for overlaps and possibly merging of footprints having up to 5 bp in between - peak_footprints = check_and_merge(peak_footprints) + peak_footprints = check_and_merge(peak_footprints, max_bp_between) for footprint_name in peak_footprints.keys(): all_footprints[footprint_name] = all_footprints.get(footprint_name, {}) @@ -441,7 +438,7 @@ def main(): logger.info("The script footprints_extraction.py was called using these parameters: " + str(vars(args))) bed_dictionary = make_bed_dictionary(args.bed) - all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage) + all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage, args.max_bp_between) write_to_bed_file(all_footprints, args.output_file) logger.info("the number of peaks: " + str(len(bed_dictionary))) From 0308a0c8a3657769c235b20fff6803aa8b36a38f Mon Sep 17 00:00:00 2001 From: anastasiia Date: Tue, 8 Jan 2019 12:28:29 +0100 Subject: [PATCH 4/4] =?UTF-8?q?changing=20the=20name=20of=20footprints=20f?= =?UTF-8?q?rom=20footprint=5F=20to=20f=5F=20to=20make=20sure=20the=20scrip?= =?UTF-8?q?t=20of=20Ren=C3=A9=20works=20well?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/1.1_footprint_extraction/footprints_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/1.1_footprint_extraction/footprints_extraction.py b/bin/1.1_footprint_extraction/footprints_extraction.py index 3e94956..d1b0f8f 100644 --- a/bin/1.1_footprint_extraction/footprints_extraction.py +++ b/bin/1.1_footprint_extraction/footprints_extraction.py @@ -195,7 +195,7 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom if save_current_footprint == True: #make sure to save this footprint - footprint_name = "footprint_" + str(footprint_count) + footprint_name = "f_" + str(footprint_count) peak_footprints[footprint_name] = peak_footprints.get(footprint_name, {}) peak_footprints[footprint_name] = {'chromosom': chromosom, 'start': footprint_start, 'end': footprint_end, 'score': footprint_score, 'len': len(footprint_scores), 'bonus': bonus_info_from_bed, 'max_pos': max_pos}