Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 126 additions & 32 deletions bin/1.1_footprint_extraction/footprints_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def parse_args():
parser.add_argument('--window_length', default=200, type=int, help='Please enter the length for a window, by defauld 200 bp.')
parser.add_argument('--step', default=100, type=int, help='Please enter a step to move the window, by default 100 bp.')
parser.add_argument('--percentage', default=0, type=int, help='Please enter a percentage to be added to background while searching for footprints, by default 0%%.')
parser.add_argument('--max_bp_between', default=6, type=int, help='Please enter the number of bp allowed to be in between two footprints, by default 6 bp.')
parser.add_argument('--min_gap', default=6, type=int, help='Please enter the number of bp allowed to be in between two footprints, by default 6 bp.')
parser.add_argument('--silent', action='store_true', help='While working with data write the information only into ./footprints_extraction.log.')
args = parser.parse_args()

Expand Down Expand Up @@ -118,7 +118,7 @@ def make_bed_dictionary(bed_file):
#an array containing scores (signals) from the bigwig file; the footprints already saved for the particular peak; information for the bed file:
#chromosom, start and end position, as well as additional information from the original bed file
#the function returns the count for footprints, as well as footprints for the current peak
def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom, footprint_start, footprint_end, bonus_info_from_bed):
def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom, footprint_start, footprint_end, bonus_info_from_bed):

save_current_footprint = False

Expand All @@ -143,6 +143,8 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom
else:
max_pos = first_max_pos

max_pos = max_pos + 1 #as the index of an array starts with 0

#calculate the score for the current footprint as mean of all scores from the bigwig file
footprint_score = np.mean(footprint_scores)

Expand Down Expand Up @@ -279,58 +281,139 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe
#if at this point there are still overlaps of footprints, this function will delete them
#the input parameter are: dictionary with footprints within one peak, and the max number of bp allowed to be in between the footprints
#the output is the renewed dictionary containing only the best footprints for the output file
def check_and_merge(peak_footprints, max_bp_between):
def check_and_merge(peak_footprints, min_gap):
#to ensure the merging works well, sort the footprints first arter start and end positions
#the sort can not be applied to a dictionary, we are making a list out of peak_footprints_dict
peak_footprints_list = sorted(peak_footprints.items(), key = lambda x : (x[1]['start'], x[1]['end']), reverse = False)

peak_footprints_new = {}
merged_footprints = {}

for footprint_to_check in peak_footprints.keys():
#we need to check each footprint within this peak with the other footprints for possible merging
#for footprint_to_check in peak_footprints.keys():
for footprint in peak_footprints_list: #work with sorted footprints
footprint_to_check = footprint[0] #save the name of the footprint which we are working with now
start_to_check = peak_footprints[footprint_to_check]['start']
end_to_check = peak_footprints[footprint_to_check]['end']

merge_footprints_left = None

for compared_footprint in peak_footprints.keys():

if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] < max_bp_between:
#make compared_footprint longer
if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] < min_gap:
#make compared_footprint longer: compared_footprint + footprint_to_check
merge_footprints_left = False
break
elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check < max_bp_between:
#make footprint_to_check longer
elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check < min_gap:
#make footprint_to_check longer: footprint_to_check + compared footprint
merge_footprints_left = True
break

if merge_footprints_left: #if the merging left is enabled
#check if this footprint can be merged with the compared_footprint
#if compared footprint is not in peak_footprint_new.keys(), the next loop will check for this footprint. There is no need for doulbe check now
if start_to_check < peak_footprints[compared_footprint]['start'] and compared_footprint in peak_footprints_new.keys():
#update the start position
peak_footprints_new[compared_footprint]['start'] = start_to_check
#update the length
peak_footprints_new[compared_footprint]['len'] = peak_footprints[compared_footprint]['end'] - start_to_check
#update the score
peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2

elif merge_footprints_left == False: #otherwise merge right
#check if the merging is possible
if end_to_check > peak_footprints[compared_footprint]['end'] and compared_footprint in peak_footprints_new.keys():
#update the end position
peak_footprints_new[compared_footprint]['end'] = end_to_check
#update the length
peak_footprints_new[compared_footprint]['len'] = end_to_check - peak_footprints[compared_footprint]['end']
#update the score
peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2
if merge_footprints_left: #the left merging is enabled, start and end of compared_footprint should be smaller than the start of the footprint_to_check
if start_to_check < peak_footprints[compared_footprint]['start']:
if footprint_to_check not in peak_footprints_new.keys():
if any(footprint_to_check in merged_footprints[x] for x in merged_footprints.keys()): #true if footprint_to_check was already merged with someone
for k, v in merged_footprints.items():
if footprint_to_check in v:
main_footprint = k
#make merging using the information from the merged_footprints and peak_footprints_new
#UPDATE
peak_footprints_new[main_footprint] = footprint_update(peak_footprints_new[main_footprint], peak_footprints[compared_footprint]['start'], peak_footprints[main_footprint]['end'], peak_footprints[compared_footprint]['score'])
merged_array = merged_footprints[main_footprint]
merged_array.append(compared_footprint)
merged_footprints[main_footprint] = merged_array
#there are no merged footprints with the footprint_to_check yet, so make a new one
else:
#add the compared footprint and footprint_to_check to the merged_footprints
merged_footprints[footprint_to_check] = merged_footprints.get(footprint_to_check, [])
merged_footprints[footprint_to_check] = [compared_footprint]

peak_footprints_new[footprint_to_check] = peak_footprints.get(footprint_to_check, {})
peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check] #<-- update
#UPDATE
peak_footprints_new[footprint_to_check] = footprint_update(peak_footprints_new[footprint_to_check], peak_footprints_new[footprint_to_check]['start'], peak_footprints[compared_footprint]['end'], peak_footprints[compared_footprint]['score'])
else: #the footprint_to_check is in peak_footprints_new already
#the footprint_to_check can only be the main part of merging before, check it
if footprint_to_check in merged_footprints.keys():
#footprint_to_check was as main for merging already
#UPDATE
peak_footprints_new[footprint_to_check] = footprint_update(peak_footprints_new[footprint_to_check], peak_footprints_new[footprint_to_check]['start'], peak_footprints[compared_footprint]['end'], peak_footprints[compared_footprint]['score'])
#add it to the merged_footprints as well
merged_array = merged_footprints[footprint_to_check]
merged_array.append(compared_footprint)
merged_footprints[footprint_to_check] = merged_array
else:
#the footprint_to check was not merged with anything yet
merged_footprints[footprint_to_check] = merged_footprints.get(footprint_to_check, [])
merged_footprints[footprint_to_check] = [compared_footprint]
#UPDATE
peak_footprints_new[footprint_to_check] = footprint_update(peak_footprints_new[footprint_to_check], peak_footprints_new[footprint_to_check]['start'], peak_footprints[compared_footprint]['end'], peak_footprints[compared_footprint]['score'])
#the right merging is enabled, start and end of compared footprint should be bigger than the start of the footprint_to_check
elif merge_footprints_left == False:
if end_to_check > peak_footprints[compared_footprint]['end']:
if compared_footprint not in peak_footprints_new.keys():
if any(compared_footprint in merged_footprints[x] for x in merged_footprints.keys()):
for k, v in merged_footprints.items():
if compared_footprint in v:
main_footprint = k
#make merging using the information from the merged_footprints and peak_footprints_new
#UPDATE
peak_footprints_new[main_footprint] = footprint_update(peak_footprints_new[main_footprint], peak_footprints[main_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score'])
#add to the merged_footprints
merged_array = merged_footprints[main_footprint]
merged_array.append(footprint_to_check)
merged_footprints[main_footprint] = merged_array
else:
#make normal update, using data from peak footprints
merged_footprints[compared_footprint] = merged_footprints.get(compared_footprint, [])
merged_footprints[compared_footprint] = [footprint_to_check]

peak_footprints_new[compared_footprint] = peak_footprints.get(compared_footprint, {})
peak_footprints_new[compared_footprint] = peak_footprints[compared_footprint]
#UPDATE
peak_footprints_new[compared_footprint] = footprint_update(peak_footprints_new[compared_footprint], peak_footprints_new[compared_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score'])
else:
if compared_footprint in merged_footprints.keys():
#compared_footprint was as main for merging already
#UPDATE
peak_footprints_new[compared_footprint] = footprint_update(peak_footprints_new[compared_footprint], peak_footprints_new[compared_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score'])

merged_array = merged_footprints[compared_footprint]
merged_array.append(footprint_to_check)
merged_footprints[compared_footprint] = merged_array

else:
#merge now and add compared_footprint to the merged_footprints
merged_footprints[compared_footprint] = merged_footprints.get(compared_footprint, [])
merged_footprints[compared_footprint] = [compared_footprint]
#UPDATE
peak_footprints_new[compared_footprint] = footprint_update(peak_footprints_new[compared_footprint], peak_footprints_new[compared_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score'])

else: #save the current footprint, as it should not be merged
peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, [])
peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check]

return peak_footprints_new

#this function is used to update the footprint that should to be merged with another one
#as input the footprint, needed the update, as well as new start, new end and the score of the merged footprint are passed
#the output of this function is a dictionary containing the new information about the footprint
def footprint_update(footprint, start, end, score):
new_len = end - start
new_score = (footprint['score'] + score) / 2

footprint['start'] = start
footprint['end'] = end
footprint['score'] = new_score
footprint['len'] = new_len

return footprint

#this function uses the information provided from the .bed file to look for footprints within the peaks of interest
#as input the information from the original bed file, as well as bigwig file is needed
#the optional parameters window_length, step and percentage are needed as well to use the sliding window algorithm and work with the "background" score
#the output of this function is a dictionary contains all the found footprints ready to write to the output file
def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage, max_bp_between):
def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage, min_gap):

logger.info('Looking for footprints within peaks...')

Expand All @@ -354,7 +437,7 @@ def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage,
peak_footprints, footprint_count = search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary[header], step, percentage)

#double check for overlaps and possibly merging of footprints having up to 5 bp in between
peak_footprints = check_and_merge(peak_footprints, max_bp_between)
peak_footprints = check_and_merge(peak_footprints, min_gap)

for footprint_name in peak_footprints.keys():
all_footprints[footprint_name] = all_footprints.get(footprint_name, {})
Expand Down Expand Up @@ -398,7 +481,18 @@ def write_to_bed_file(all_footprints, sorted_output_file_name):

#write each footprint line for line to the output file
for footprint in all_footprints:
output_file.write('\t'.join([footprint[1]['chromosom'], str(footprint[1]['start']), str(footprint[1]['end']), footprint[0], str(round(footprint[1]['score'], 6)), '.', str(footprint[1]['len']), str(footprint[1]['max_pos']), ';'.join(footprint[1]['bonus'])]) + '\n')
#validation of the footprints, if there is a problem with some of them, write which one it is
#first check the start and end positions
if footprint[1]['start'] >= footprint[1]['end']:
logger.info("The problem occured with start and end positions. This footprint will not be printed to the output file:")
logger.info(footprint)
#then check the max_pos
elif footprint[1]['max_pos'] == 0:
logger.info("The problem occured with max_pos of the footprint. This footprint will not be printed to the output file:")
logger.info(footprint)
#otherwise everything is fine, write to the output
else:
output_file.write('\t'.join([footprint[1]['chromosom'], str(footprint[1]['start']), str(footprint[1]['end']), footprint[0], str(round(footprint[1]['score'], 6)), '.', str(footprint[1]['len']), str(footprint[1]['max_pos']), ';'.join(footprint[1]['bonus'])]) + '\n')

output_file.close()

Expand Down Expand Up @@ -439,7 +533,7 @@ def main():
logger.info("The script footprints_extraction.py was called using these parameters: " + str(vars(args)))

bed_dictionary = make_bed_dictionary(args.bed)
all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage, args.max_bp_between)
all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage, args.min_gap)
write_to_bed_file(all_footprints, args.output_file)

logger.info("the number of peaks: " + str(len(bed_dictionary)))
Expand Down