diff --git a/bin/1.1_footprint_extraction/footprints_extraction.py b/bin/1.1_footprint_extraction/footprints_extraction.py index 36a2342..00d90ee 100644 --- a/bin/1.1_footprint_extraction/footprints_extraction.py +++ b/bin/1.1_footprint_extraction/footprints_extraction.py @@ -282,7 +282,7 @@ def check_and_merge(peak_footprints, max_bp_between): peak_footprints_new = {} merged_footprints = {} - print(len(peak_footprints)) + #print(len(peak_footprints)) for footprint_to_check in peak_footprints.keys(): start_to_check = peak_footprints[footprint_to_check]['start'] @@ -294,113 +294,166 @@ def check_and_merge(peak_footprints, max_bp_between): if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] < max_bp_between: #make compared_footprint longer - print() - print("footprint_to_check", footprint_to_check) - print(compared_footprint, " + ", footprint_to_check) + #print() + #print("footprint_to_check", footprint_to_check) + #print(compared_footprint, " + ", footprint_to_check) merge_footprints_left = False break elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check < max_bp_between: #make footprint_to_check longer - print() - print("footprint_to_check", footprint_to_check) - print(footprint_to_check, " + ", compared_footprint) + #print() + #print("footprint_to_check", footprint_to_check) + #print(footprint_to_check, " + ", compared_footprint) merge_footprints_left = True break if merge_footprints_left: #the left merging is enabled, start and end of compared_footprint should be smaller than the start of the footprint_to_check - print("entered left merging") - print(start_to_check, end_to_check) - print(peak_footprints[compared_footprint]['start'], peak_footprints[compared_footprint]['end']) + #print("entered left merging") + #print(start_to_check, end_to_check) + #print(peak_footprints[compared_footprint]['start'], 
peak_footprints[compared_footprint]['end']) if start_to_check < peak_footprints[compared_footprint]['start']: if footprint_to_check not in peak_footprints_new.keys(): - print("left, footprint to check ", footprint_to_check, " not in peak_footprints_new") - #UPDATE + #print("left, footprint to check ", footprint_to_check, " not in peak_footprints_new") + #even if it is not in keys, it could be merged already, so check first, if there are some footprints merged with this one + #print(merged_footprints) if any(footprint_to_check in merged_footprints[x] for x in merged_footprints.keys()): #true if footprint_to_check was already merged with someone #print("footprint_to_check ", footprint_to_check, " was a part of some merging") for k, v in merged_footprints.items(): if footprint_to_check in v: main_footprint = k - print("make merging using the information from the merged_footprints and peak_footprints_new") + #print("make merging using the information from the merged_footprints and peak_footprints_new") #UPDATE + #print("update") + #print(peak_footrpints_new[main_footprint]) + peak_footrpints_new[main_footprint] = footprint_update(peak_footprints_new[main_footprint], peak_footprints[compared_footprint]['start'], peak_footprints[main_footprint]['end'], peak_footprints[compared_footprint]['score']) + #print(peak_footrpints_new[main_footprint]) + merged_array = merged_footprints[main_footprint] + merged_array.append(compared_footprint) + merged_footprints[main_footprint] = merged_array + #print(merged_footprints) + #there are no merged footprints with the footprint_to_check yet, so make a new one else: - print("make normal update, using data from peak_footprints") + #print("make normal update, using data from peak_footprints") merged_footprints[footprint_to_check] = merged_footprints.get(footprint_to_check, []) merged_footprints[footprint_to_check] = [compared_footprint] - print(merged_footprints) + #print(merged_footprints) - print("saving ", footprint_to_check) + 
#print("saving ", footprint_to_check) peak_footprints_new[footprint_to_check] = peak_footprints.get(footprint_to_check, {}) peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check] #<-- update #UPDATE - + #print("update") + #print(peak_footprints_new[footprint_to_check]) + peak_footprints_new[footprint_to_check] = footprint_update(peak_footprints_new[footprint_to_check], peak_footprints_new[footprint_to_check]['start'], peak_footprints[compared_footprint]['end'], peak_footprints[compared_footprint]['score']) + #print(peak_footprints_new[footprint_to_check]) else: #the footprint_to_check is in peak_footprints_new already + #the footprint_to_check can only be the main part of merging before, check it if footprint_to_check in merged_footprints.keys(): - print("footprint_to_check ", footprint_to_check, " was as main for merging already") - #UPDATE - elif any(footprint_to_check in merged_footprints[x] for x in merged_footprints.keys()): - print("footprint_to_check ", footprint_to_check, " was a part of some merging") - for k, v in merged_footprints.items(): - if footprint_to_check in v: - main_footprint = k + #print("footprint_to_check ", footprint_to_check, " was as main for merging already") #UPDATE + #print("update") + #print(peak_footprints_new[footprint_to_check]) + peak_footprints_new[footprint_to_check] = footprint_update(peak_footprints_new[footprint_to_check], peak_footprints_new[footprint_to_check]['start'], peak_footprints[compared_footprint]['end'], peak_footprints[compared_footprint]['score']) + #print(peak_footprints_new[footprint_to_check]) + #merged_footprints[footprint_to_check].append(compared_footprint) + merged_array = merged_footprints[footprint_to_check] + merged_array.append(compared_footprint) + merged_footprints[footprint_to_check] = merged_array + #print(merged_footprints) + else: - print("merge now and add footprint_to_check to the merged_footprints") + #print("merge now and add footprint_to_check to the 
merged_footprints") merged_footprints[footprint_to_check] = merged_footprints.get(footprint_to_check, []) merged_footprints[footprint_to_check] = [compared_footprint] #UPDATE - + #print("update") + #print(peak_footprints_new[footprint_to_check]) + peak_footprints_new[footprint_to_check] = footprint_update(peak_footprints_new[footprint_to_check], peak_footprints_new[footprint_to_check]['start'], peak_footprints[compared_footprint]['end'], peak_footprints[compared_footprint]['score']) + #print(peak_footprints_new[footprint_to_check]) elif merge_footprints_left == False: #the right merging is enabled, start and end of compared footprint should be bigger than the start of the footprint_to_check - print("entered right merging") - print(start_to_check, end_to_check) - print(peak_footprints[compared_footprint]['start'], peak_footprints[compared_footprint]['end']) + #print("entered right merging") + #print(start_to_check, end_to_check) + #print(peak_footprints[compared_footprint]['start'], peak_footprints[compared_footprint]['end']) if end_to_check > peak_footprints[compared_footprint]['end']: if compared_footprint not in peak_footprints_new.keys(): - print("right, compared_footprint ", compared_footprint, " not in peak_footprints_new") + #print("right, compared_footprint ", compared_footprint, " not in peak_footprints_new") if any(compared_footprint in merged_footprints[x] for x in merged_footprints.keys()): - print("compared_footprint ", compared_footprint, " was a part of some merging") + #print("compared_footprint ", compared_footprint, " was a part of some merging") for k, v in merged_footprints.items(): if compared_footprint in v: main_footprint = k - print("make merging using the information from the merged_footprints and peak_footprints_new") + #print("make merging using the information from the merged_footprints and peak_footprints_new") #UPDATE + #print("update") + #print(merged_footprints) + peak_footprints_new[main_footprint] = 
footprint_update(peak_footprints_new[main_footprint], peak_footprints[main_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score']) + #print(merged_footprints) + #merged_footprints[main_footprint] = merged_footprints[main_footprint].append(footprint_to_check) + merged_array = merged_footprints[main_footprint] + merged_array.append(footprint_to_check) + merged_footprints[main_footprint] = merged_array else: - print("make normal update, using data from peak footprints") + #print("make normal update, using data from peak footprints") merged_footprints[compared_footprint] = merged_footprints.get(compared_footprint, []) merged_footprints[compared_footprint] = [footprint_to_check] - print(merged_footprints) + #print(merged_footprints) - print("saving ", compared_footprint) + #print("saving ", compared_footprint) peak_footprints_new[compared_footprint] = peak_footprints.get(compared_footprint, {}) peak_footprints_new[compared_footprint] = peak_footprints[compared_footprint] #UPDATE + #print("update") + #print(peak_footprints_new[compared_footprint]) + peak_footprints_new[compared_footprint] = footprint_update(peak_footprints_new[compared_footprint], peak_footprints_new[compared_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score']) + #print(peak_footprints_new[compared_footprint]) else: if compared_footprint in merged_footprints.keys(): - print("compared_footprint ", compared_footprint, " was as main for merging already") - #UPDATE - elif any(compared_footprint in merged_footprints[x] for x in merged_footprints.keys()): - #print("compared_footprint ", compared_footprint, " was a part of some merging") - for k, v in merged_footprints.items(): - if compared_footprint in v: - main_footprint = k + #print("compared_footprint ", compared_footprint, " was as main for merging already") #UPDATE + #print("update") + #print(peak_footprints_new[compared_footprint]) + 
peak_footprints_new[compared_footprint] = footprint_update(peak_footprints_new[compared_footprint], peak_footprints_new[compared_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score']) + #print(peak_footprints_new[compared_footprint]) + #merged_footprints[compared_footprint] = merged_footprints[compared_footprint].append(footprint_to_check) + merged_array = merged_footprints[compared_footprint] + merged_array.append(footprint_to_check) + merged_footprints[compared_footprint] = merged_array + #print(merged_footprints) else: - print("merge now and add compared_footprint to the merged_footprints") + #print("merge now and add compared_footprint to the merged_footprints") merged_footprints[compared_footprint] = merged_footprints.get(compared_footprint, []) merged_footprints[compared_footprint] = [compared_footprint] - print(merged_footprints) + #print(merged_footprints) #UPDATE - + #print("update") + #print(peak_footprints_new[compared_footprint]) + peak_footprints_new[compared_footprint] = footprint_update(peak_footprints_new[compared_footprint], peak_footprints_new[compared_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score']) + #print(peak_footprints_new[compared_footprint]) else: #save the current footprint, as it should not be merged peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, []) peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check] - print(len(peak_footprints_new)) - for footprint in peak_footprints_new: - print(footprint) - sys.exit() + #print(len(peak_footprints_new)) + #for footprint in peak_footprints_new: + # print(footprint) + #sys.exit() return peak_footprints_new +#this function is used to update the footprint that should to be merged with another one +#as input the footprint, needed the update, as well as new start, new end and the score of the merged footprint are passed +#the 
#footprint_update: rewrite the coordinates and the score of a footprint that is being merged with another one
#input: the footprint dictionary to change (it must already hold a 'score'), the new start, the new end and the score of the footprint merged into it
#output: the very same dictionary (changed in place, not a copy) with 'start', 'end', 'len' (= end - start) and 'score' (mean of the stored and the passed score) set
def footprint_update(footprint, start, end, score):
    merged_length = end - start
    #plain mean of the two scores; the stored score is read before any field is overwritten
    averaged_score = (footprint['score'] + score) / 2

    #write the fields back in a fixed order: start, end, score, len
    for key, value in (('start', start), ('end', end), ('score', averaged_score), ('len', merged_length)):
        footprint[key] = value

    return footprint

#this function uses the information provided from the .bed file to look for footprints within the peaks of interest
#as input the information from the original bed file, as well as bigwig file is needed
#the optional parameters window_length, step and percentage are needed as well to use the sliding window algorithm and work with the "background" score