Skip to content

Commit

Permalink
allowing user to set the parameter for max bp allowed in between the …
Browse files Browse the repository at this point in the history
…footprints; also adding some comments
  • Loading branch information
anastasiia committed Jan 7, 2019
1 parent c452934 commit 7a5d6df
Showing 1 changed file with 25 additions and 28 deletions.
53 changes: 25 additions & 28 deletions bin/1.1_footprint_extraction/footprints_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def parse_args():
parser.add_argument('--window_length', default=200, type=int, help='Please enter the length for a window, by defauld 200 bp.')
parser.add_argument('--step', default=100, type=int, help='Please enter a step to move the window, by default 100 bp.')
parser.add_argument('--percentage', default=0, type=int, help='Please enter a percentage to be added to background while searching for footprints, by default 0%%.')
parser.add_argument('--max_bp_between', default=6, type=int, help='Please enter the number of bp allowed to be in between two footprints, by default 6 bp.')
parser.add_argument('--silent', action='store_true', help='While working with data write the information only into ./footprints_extraction.log.')
args = parser.parse_args()

Expand Down Expand Up @@ -273,65 +274,62 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe

return peak_footprints, footprint_count

def check_and_merge(peak_footprints):
#this function checks if there are footprints with a small amount of bp in between, and if so, merges them together
#if at this point there are still overlaps of footprints, this function will delete them
#the input parameter are: dictionary with footprints within one peak, and the max number of bp allowed to be in between the footprints
#the output is the renewed dictionary containing only the best footprints for the output file
def check_and_merge(peak_footprints, max_bp_between):
peak_footprints_new = {}

for footprint_to_check in peak_footprints.keys():
start_to_check = peak_footprints[footprint_to_check]['start']
end_to_check = peak_footprints[footprint_to_check]['end']

#if not peak_footprints_new: #the dictionary is empty, save the first footprint
# peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, [])
# peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check]
#print(peak_footprints_new.keys())

merge_footprints_left = None

for compared_footprint in peak_footprints.keys():

if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] <= 5: #5 is hardcoded !!!!!! :(
if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] < max_bp_between:
#make compared_footprint longer
print("MERGE", footprint_to_check, compared_footprint)
merge_footprints_left = False
break
elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check <= 5:
elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check < max_bp_between:
#make footprint_to_check longer
print("MERGE", footprint_to_check, compared_footprint)
merge_footprints_left = True
break

if merge_footprints_left: #if it is true
print("merging left")
if start_to_check < peak_footprints[compared_footprint]['start'] and compared_footprint in peak_footprints_new.keys():
print("make longer")
print("before merging" + str(peak_footprints_new[compared_footprint]))
if merge_footprints_left: #if the merging left is enabled
#check if this footprint can be merged with the compared_footprint
#if compared footprint is not in peak_footprint_new.keys(), the next loop will check for this footprint. There is no need for doulbe check now
if start_to_check < peak_footprints[compared_footprint]['start'] and compared_footprint in peak_footprints_new.keys():
#update the start position
peak_footprints_new[compared_footprint]['start'] = start_to_check
#update the length
peak_footprints_new[compared_footprint]['len'] = peak_footprints[compared_footprint]['end'] - start_to_check
#update the score
peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2
print(peak_footprints_new[compared_footprint])

elif merge_footprints_left == False:
print("merging right")
elif merge_footprints_left == False: #otherwise merge right
#check if the merging is possible
if end_to_check > peak_footprints[compared_footprint]['end'] and compared_footprint in peak_footprints_new.keys():
print("make longer")
print("before merging" + str(peak_footprints_new[compared_footprint]))
#update the end position
peak_footprints_new[compared_footprint]['end'] = end_to_check
#update the length
peak_footprints_new[compared_footprint]['len'] = end_to_check - peak_footprints[compared_footprint]['end']
#update the score
peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2
print(peak_footprints_new[compared_footprint])
else:
print("saving " + footprint_to_check)

else: #save the current footprint, as it should not be merged
peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, [])
peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check]

#sys.exit()
return peak_footprints_new

#this function uses the information provided from the .bed file to look for footprints within the peaks of interest
#as input the information from the original bed file, as well as bigwig file is needed
#the optional parameters window_length, step and percentage are needed as well to use the sliding window algorithm and work with the "background" score
#the output of this function is a dictionary contains all the found footprints ready to write to the output file
def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage):
def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage, max_bp_between):

logger.info('Looking for footprints within peaks...')

Expand All @@ -352,11 +350,10 @@ def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage)
peak_end = int(positions[1])

scores_in_peak = np.nan_to_num(np.array(list(bw_open.values(chromosom, peak_start, peak_end)))) #save the scores to an array
print()
peak_footprints, footprint_count = search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary[header], step, percentage)

#double check for overlaps and possibly merging of footprints having up to 5 bp in between
peak_footprints = check_and_merge(peak_footprints)
peak_footprints = check_and_merge(peak_footprints, max_bp_between)

for footprint_name in peak_footprints.keys():
all_footprints[footprint_name] = all_footprints.get(footprint_name, {})
Expand Down Expand Up @@ -441,7 +438,7 @@ def main():
logger.info("The script footprints_extraction.py was called using these parameters: " + str(vars(args)))

bed_dictionary = make_bed_dictionary(args.bed)
all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage)
all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage, args.max_bp_between)
write_to_bed_file(all_footprints, args.output_file)

logger.info("the number of peaks: " + str(len(bed_dictionary)))
Expand Down

0 comments on commit 7a5d6df

Please sign in to comment.