Skip to content
Merged
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 115 additions & 23 deletions bin/1.1_footprint_extraction/footprints_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom
else:
max_pos = first_max_pos

max_pos = max_pos + 1 #as the index of an array starts with 0

#calculate the score for the current footprint as mean of all scores from the bigwig file
footprint_score = np.mean(footprint_scores)

Expand Down Expand Up @@ -281,7 +283,9 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe
#the output is the renewed dictionary containing only the best footprints for the output file
def check_and_merge(peak_footprints, max_bp_between):
peak_footprints_new = {}
merged_footprints = {}

#we need to check each footprint within this peak with the other footprints for possible merging
for footprint_to_check in peak_footprints.keys():
start_to_check = peak_footprints[footprint_to_check]['start']
end_to_check = peak_footprints[footprint_to_check]['end']
Expand All @@ -291,41 +295,118 @@ def check_and_merge(peak_footprints, max_bp_between):
for compared_footprint in peak_footprints.keys():

if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] < max_bp_between:
#make compared_footprint longer
#make compared_footprint longer: compared_footprint + footprint_to_check
merge_footprints_left = False
break
elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check < max_bp_between:
#make footprint_to_check longer
#make footprint_to_check longer: footprint_to_check + compared footprint
merge_footprints_left = True
break

if merge_footprints_left: #if the merging left is enabled
#check if this footprint can be merged with the compared_footprint
#if compared footprint is not in peak_footprint_new.keys(), the next loop will check for this footprint. There is no need for doulbe check now
if start_to_check < peak_footprints[compared_footprint]['start'] and compared_footprint in peak_footprints_new.keys():
#update the start position
peak_footprints_new[compared_footprint]['start'] = start_to_check
#update the length
peak_footprints_new[compared_footprint]['len'] = peak_footprints[compared_footprint]['end'] - start_to_check
#update the score
peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2

elif merge_footprints_left == False: #otherwise merge right
#check if the merging is possible
if end_to_check > peak_footprints[compared_footprint]['end'] and compared_footprint in peak_footprints_new.keys():
#update the end position
peak_footprints_new[compared_footprint]['end'] = end_to_check
#update the length
peak_footprints_new[compared_footprint]['len'] = end_to_check - peak_footprints[compared_footprint]['end']
#update the score
peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2
if merge_footprints_left: #the left merging is enabled, start and end of compared_footprint should be smaller than the start of the footprint_to_check
if start_to_check < peak_footprints[compared_footprint]['start']:
if footprint_to_check not in peak_footprints_new.keys():
if any(footprint_to_check in merged_footprints[x] for x in merged_footprints.keys()): #true if footprint_to_check was already merged with someone
for k, v in merged_footprints.items():
if footprint_to_check in v:
main_footprint = k
#make merging using the information from the merged_footprints and peak_footprints_new
#UPDATE
peak_footprints_new[main_footprint] = footprint_update(peak_footprints_new[main_footprint], peak_footprints[compared_footprint]['start'], peak_footprints[main_footprint]['end'], peak_footprints[compared_footprint]['score'])
merged_array = merged_footprints[main_footprint]
merged_array.append(compared_footprint)
merged_footprints[main_footprint] = merged_array
#there are no merged footprints with the footprint_to_check yet, so make a new one
else:
#add the compared footprint and footprint_to_check to the merged_footprints
merged_footprints[footprint_to_check] = merged_footprints.get(footprint_to_check, [])
merged_footprints[footprint_to_check] = [compared_footprint]

peak_footprints_new[footprint_to_check] = peak_footprints.get(footprint_to_check, {})
peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check] #<-- update
#UPDATE
peak_footprints_new[footprint_to_check] = footprint_update(peak_footprints_new[footprint_to_check], peak_footprints_new[footprint_to_check]['start'], peak_footprints[compared_footprint]['end'], peak_footprints[compared_footprint]['score'])
else: #the footprint_to_check is in peak_footprints_new already
#the footprint_to_check can only be the main part of merging before, check it
if footprint_to_check in merged_footprints.keys():
#footprint_to_check was as main for merging already
#UPDATE
peak_footprints_new[footprint_to_check] = footprint_update(peak_footprints_new[footprint_to_check], peak_footprints_new[footprint_to_check]['start'], peak_footprints[compared_footprint]['end'], peak_footprints[compared_footprint]['score'])
#add it to the merged_footprints as well
merged_array = merged_footprints[footprint_to_check]
merged_array.append(compared_footprint)
merged_footprints[footprint_to_check] = merged_array
else:
#the footprint_to check was not merged with anything yet
merged_footprints[footprint_to_check] = merged_footprints.get(footprint_to_check, [])
merged_footprints[footprint_to_check] = [compared_footprint]
#UPDATE
peak_footprints_new[footprint_to_check] = footprint_update(peak_footprints_new[footprint_to_check], peak_footprints_new[footprint_to_check]['start'], peak_footprints[compared_footprint]['end'], peak_footprints[compared_footprint]['score'])
#the right merging is enabled, start and end of compared footprint should be bigger than the start of the footprint_to_check
elif merge_footprints_left == False:
if end_to_check > peak_footprints[compared_footprint]['end']:
if compared_footprint not in peak_footprints_new.keys():
if any(compared_footprint in merged_footprints[x] for x in merged_footprints.keys()):
for k, v in merged_footprints.items():
if compared_footprint in v:
main_footprint = k
#make merging using the information from the merged_footprints and peak_footprints_new
#UPDATE
peak_footprints_new[main_footprint] = footprint_update(peak_footprints_new[main_footprint], peak_footprints[main_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score'])
#add to the merged_footprints
merged_array = merged_footprints[main_footprint]
merged_array.append(footprint_to_check)
merged_footprints[main_footprint] = merged_array
else:
#"make normal update, using data from peak footprints
merged_footprints[compared_footprint] = merged_footprints.get(compared_footprint, [])
merged_footprints[compared_footprint] = [footprint_to_check]

peak_footprints_new[compared_footprint] = peak_footprints.get(compared_footprint, {})
peak_footprints_new[compared_footprint] = peak_footprints[compared_footprint]
#UPDATE
peak_footprints_new[compared_footprint] = footprint_update(peak_footprints_new[compared_footprint], peak_footprints_new[compared_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score'])
else:
if compared_footprint in merged_footprints.keys():
#compared_footprint was as main for merging already
#UPDATE
peak_footprints_new[compared_footprint] = footprint_update(peak_footprints_new[compared_footprint], peak_footprints_new[compared_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score'])

merged_array = merged_footprints[compared_footprint]
merged_array.append(footprint_to_check)
merged_footprints[compared_footprint] = merged_array

else:
#merge now and add compared_footprint to the merged_footprints
merged_footprints[compared_footprint] = merged_footprints.get(compared_footprint, [])
merged_footprints[compared_footprint] = [compared_footprint]
#UPDATE
peak_footprints_new[compared_footprint] = footprint_update(peak_footprints_new[compared_footprint], peak_footprints_new[compared_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score'])

else: #save the current footprint, as it should not be merged
peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, [])
peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check]

#print(len(peak_footprints_new))
#for footprint in peak_footprints_new:
# print(footprint)
#sys.exit()
return peak_footprints_new

#this function is used to update the footprint that should to be merged with another one
#as input the footprint, needed the update, as well as new start, new end and the score of the merged footprint are passed
#the output of this function is a dictionary containing the new information about the footprint
def footprint_update(footprint, start, end, score):
new_len = end - start
new_score = (footprint['score'] + score) / 2

footprint['start'] = start
footprint['end'] = end
footprint['score'] = new_score
footprint['len'] = new_len

return footprint

#this function uses the information provided from the .bed file to look for footprints within the peaks of interest
#as input the information from the original bed file, as well as bigwig file is needed
#the optional parameters window_length, step and percentage are needed as well to use the sliding window algorithm and work with the "background" score
Expand Down Expand Up @@ -398,7 +479,18 @@ def write_to_bed_file(all_footprints, sorted_output_file_name):

#write each footprint line for line to the output file
for footprint in all_footprints:
output_file.write('\t'.join([footprint[1]['chromosom'], str(footprint[1]['start']), str(footprint[1]['end']), footprint[0], str(round(footprint[1]['score'], 6)), '.', str(footprint[1]['len']), str(footprint[1]['max_pos']), ';'.join(footprint[1]['bonus'])]) + '\n')
#validation of the footprints, if there is a problem with some of them, write which one it is
#first check the start and end positions
if footprint[1]['start'] >= footprint[1]['end']:
logger.info("The problem occured with start and end positions. This footprint will not be printed to the output file:")
logger.info(footprint)
#then check the max_pos
elif footprint[1]['max_pos'] == 0:
logger.info("The problem occured with max_pos of the footprint. This footprint will not be printed to the output file:")
logger.info(footprint)
#otherwise everything is fine, write to the output
else:
output_file.write('\t'.join([footprint[1]['chromosom'], str(footprint[1]['start']), str(footprint[1]['end']), footprint[0], str(round(footprint[1]['score'], 6)), '.', str(footprint[1]['len']), str(footprint[1]['max_pos']), ';'.join(footprint[1]['bonus'])]) + '\n')

output_file.close()

Expand Down