Skip to content

Peak calling fixing the merging bug #50

Merged
merged 10 commits into from
Jan 12, 2019
158 changes: 126 additions & 32 deletions bin/1.1_footprint_extraction/footprints_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def parse_args():
parser.add_argument('--window_length', default=200, type=int, help='Please enter the length for a window, by defauld 200 bp.')
parser.add_argument('--step', default=100, type=int, help='Please enter a step to move the window, by default 100 bp.')
parser.add_argument('--percentage', default=0, type=int, help='Please enter a percentage to be added to background while searching for footprints, by default 0%%.')
parser.add_argument('--max_bp_between', default=6, type=int, help='Please enter the number of bp allowed to be in between two footprints, by default 6 bp.')
parser.add_argument('--min_gap', default=6, type=int, help='Please enter the number of bp allowed to be in between two footprints, by default 6 bp.')
parser.add_argument('--silent', action='store_true', help='While working with data write the information only into ./footprints_extraction.log.')
args = parser.parse_args()

Expand Down Expand Up @@ -118,7 +118,7 @@ def make_bed_dictionary(bed_file):
#an array containing scores (signals) from the bigwig file; the footprints already saved for the particular peak; information for the bed file:
#chromosom, start and end position, as well as additional information from the original bed file
#the function returns the count for footprints, as well as footprints for the current peak
def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom, footprint_start, footprint_end, bonus_info_from_bed):
def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom, footprint_start, footprint_end, bonus_info_from_bed):

save_current_footprint = False

Expand All @@ -143,6 +143,8 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom
else:
max_pos = first_max_pos

max_pos = max_pos + 1 #as the index of an array starts with 0

#calculate the score for the current footprint as mean of all scores from the bigwig file
footprint_score = np.mean(footprint_scores)

Expand Down Expand Up @@ -279,58 +281,139 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe
#helper for check_and_merge: merge the footprint absorbed_name into the footprint main_name
#if main_name itself was already merged into another footprint, that footprint is extended instead
#input: peak_footprints - all footprints of the peak; peak_footprints_new - the footprints kept for the output;
#       merged_footprints - dictionary {kept footprint name: [names of the footprints merged into it]}
#output: nothing is returned, peak_footprints_new and merged_footprints are updated in place
def _absorb_footprint(peak_footprints, peak_footprints_new, merged_footprints, main_name, absorbed_name):
    if main_name not in peak_footprints_new:
        #look for a kept footprint that already contains main_name, the last match wins
        owners = [name for name, members in merged_footprints.items() if main_name in members]
        if owners:
            main_name = owners[-1]
        else:
            #first merge for this footprint: keep it (the dictionary is shared with peak_footprints, not copied)
            peak_footprints_new[main_name] = peak_footprints[main_name]

    #remember which footprint was merged into which one
    merged_footprints.setdefault(main_name, []).append(absorbed_name)

    main = peak_footprints_new[main_name]
    absorbed = peak_footprints[absorbed_name]
    #the kept footprint always precedes the absorbed one here, so keep its start and take the end of the absorbed one
    peak_footprints_new[main_name] = footprint_update(main, main['start'], absorbed['end'], absorbed['score'])

#if at this point there are still overlaps of footprints, this function merges them
#input: peak_footprints - dictionary {footprint name: footprint dictionary} with the footprints of one peak,
#       the entries 'start', 'end' and 'score' of each footprint are used here
#       min_gap - two footprints are merged if they overlap or if fewer than min_gap bp lie between them
#output: a new dictionary containing only the footprints that should be written to the output file
#NOTE: the footprint dictionaries are not copied, a merged footprint is changed in place and therefore also changes in peak_footprints
#NOTE(review): compared to the previous revision two cases were corrected:
# - when a footprint already merged into another one is extended to the right, the start of the kept footprint is preserved
#   (before, the start of the compared footprint was used, which could produce start >= end)
# - the footprint_to_check is now always recorded in merged_footprints (before, one branch stored the compared footprint itself)
def check_and_merge(peak_footprints, min_gap):
    #to ensure the merging works well, visit the footprints sorted by start and then by end position
    sorted_names = sorted(peak_footprints, key=lambda name: (peak_footprints[name]['start'], peak_footprints[name]['end']))

    peak_footprints_new = {}
    merged_footprints = {}

    #check each footprint within this peak against the other footprints for possible merging
    for footprint_to_check in sorted_names:
        start_to_check = peak_footprints[footprint_to_check]['start']
        end_to_check = peak_footprints[footprint_to_check]['end']

        #None: nothing to merge; False: a footprint starting earlier is close enough; True: a footprint ending later is close enough
        merge_footprints_left = None
        compared_footprint = None

        #only the first matching footprint (in the order of the peak_footprints dictionary) is considered
        for compared_footprint in peak_footprints:
            compared = peak_footprints[compared_footprint]
            if start_to_check > compared['start'] and start_to_check - compared['end'] < min_gap:
                #make compared_footprint longer: compared_footprint + footprint_to_check
                merge_footprints_left = False
                break
            elif end_to_check < compared['end'] and compared['start'] - end_to_check < min_gap:
                #make footprint_to_check longer: footprint_to_check + compared_footprint
                merge_footprints_left = True
                break

        if merge_footprints_left:
            #merge only if the compared footprint really starts after the footprint_to_check
            if start_to_check < peak_footprints[compared_footprint]['start']:
                _absorb_footprint(peak_footprints, peak_footprints_new, merged_footprints, footprint_to_check, compared_footprint)
        elif merge_footprints_left is False:
            #merge only if the footprint_to_check really ends after the compared footprint,
            #a footprint lying completely inside the compared one is dropped
            if end_to_check > peak_footprints[compared_footprint]['end']:
                _absorb_footprint(peak_footprints, peak_footprints_new, merged_footprints, compared_footprint, footprint_to_check)
        else:
            #save the current footprint, as it should not be merged
            peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check]

    return peak_footprints_new

#this function updates a footprint that is being merged with another one
#input: footprint - the footprint dictionary that needs the update (its 'score' entry is read)
#       start, end - the new start and end position of the merged footprint
#       score - the score of the footprint that is merged in
#output: the same footprint dictionary, changed in place: 'start', 'end', 'score' and 'len' are overwritten,
#        all other entries are left as they are
def footprint_update(footprint, start, end, score):
    #the new score is the plain mean of both scores, it is not weighted by the footprint lengths
    new_score = (footprint['score'] + score) / 2

    footprint['start'] = start
    footprint['end'] = end
    footprint['score'] = new_score
    footprint['len'] = end - start

    return footprint

#this function uses the information provided from the .bed file to look for footprints within the peaks of interest
#as input the information from the original bed file, as well as bigwig file is needed
#the optional parameters window_length, step and percentage are needed as well to use the sliding window algorithm and work with the "background" score
#the output of this function is a dictionary contains all the found footprints ready to write to the output file
def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage, max_bp_between):
def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage, min_gap):

logger.info('Looking for footprints within peaks...')

Expand All @@ -354,7 +437,7 @@ def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage,
peak_footprints, footprint_count = search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary[header], step, percentage)

#double check for overlaps and merge footprints that overlap or have fewer than min_gap bp in between (up to 5 bp with the default min_gap of 6)
peak_footprints = check_and_merge(peak_footprints, max_bp_between)
peak_footprints = check_and_merge(peak_footprints, min_gap)

for footprint_name in peak_footprints.keys():
all_footprints[footprint_name] = all_footprints.get(footprint_name, {})
Expand Down Expand Up @@ -398,7 +481,18 @@ def write_to_bed_file(all_footprints, sorted_output_file_name):

#write each footprint line for line to the output file
for footprint in all_footprints:
output_file.write('\t'.join([footprint[1]['chromosom'], str(footprint[1]['start']), str(footprint[1]['end']), footprint[0], str(round(footprint[1]['score'], 6)), '.', str(footprint[1]['len']), str(footprint[1]['max_pos']), ';'.join(footprint[1]['bonus'])]) + '\n')
#validation of the footprints, if there is a problem with some of them, write which one it is
#first check the start and end positions
if footprint[1]['start'] >= footprint[1]['end']:
logger.info("The problem occured with start and end positions. This footprint will not be printed to the output file:")
logger.info(footprint)
#then check the max_pos
elif footprint[1]['max_pos'] == 0:
logger.info("The problem occured with max_pos of the footprint. This footprint will not be printed to the output file:")
logger.info(footprint)
#otherwise everything is fine, write to the output
else:
output_file.write('\t'.join([footprint[1]['chromosom'], str(footprint[1]['start']), str(footprint[1]['end']), footprint[0], str(round(footprint[1]['score'], 6)), '.', str(footprint[1]['len']), str(footprint[1]['max_pos']), ';'.join(footprint[1]['bonus'])]) + '\n')

output_file.close()

Expand Down Expand Up @@ -439,7 +533,7 @@ def main():
logger.info("The script footprints_extraction.py was called using these parameters: " + str(vars(args)))

bed_dictionary = make_bed_dictionary(args.bed)
all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage, args.max_bp_between)
all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage, args.min_gap)
write_to_bed_file(all_footprints, args.output_file)

logger.info("the number of peaks: " + str(len(bed_dictionary)))
Expand Down