From aa1e9f0f761196aa1b665710a900e9f3225e546f Mon Sep 17 00:00:00 2001
From: anastasiia <anastasiia.petrova@mpi-bn.mpg.de>
Date: Mon, 7 Jan 2019 12:47:11 +0100
Subject: [PATCH 1/4] fixing the bug with footprint length

---
 .../footprints_extraction.py                      | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/bin/1.1_footprint_extraction/footprints_extraction.py b/bin/1.1_footprint_extraction/footprints_extraction.py
index fae040e..e636c91 100644
--- a/bin/1.1_footprint_extraction/footprints_extraction.py
+++ b/bin/1.1_footprint_extraction/footprints_extraction.py
@@ -154,6 +154,7 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom
 				old_start = peak_footprints[existing_footprint_name]['start']
 				old_end = peak_footprints[existing_footprint_name]['end']
 				old_score = peak_footprints[existing_footprint_name]['score']
+				old_length = peak_footprints[existing_footprint_name]['len']
 
 				if footprint_start >= old_start and footprint_start <= old_end: #the start of the new footprint is between the start and end of an old footprint
 					if footprint_end > old_end: #the new footprint is not completely inside the old one
@@ -163,19 +164,21 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom
 
 						peak_footprints[existing_footprint_name]['end'] = footprint_end
 						peak_footprints[existing_footprint_name]['score'] = footprint_score
+						peak_footprints[existing_footprint_name]['len'] = footprint_end - old_start
 						#we can not update the max_pos as we do not have the information about scores array of the existing footprint
 					#else: the new footprint is completely inside the old one, do nothing
 					save_current_footprint = False
 					break
 
 				elif footprint_end >= old_start and footprint_end <= old_end: #the end of the new footprint is between the start and end of an old footprint
-					if footprint_start < old_start: #the new footprint is not completely inside the old one
+					if footprint_start < old_start: #the new footprint is not completely inside the old one						
 						#update the information about the existing footprint
 						#find the average of both scores
 						footprint_score = (peak_footprints[existing_footprint_name]['score'] + footprint_score) / 2
 
 						peak_footprints[existing_footprint_name]['start'] = footprint_start
 						peak_footprints[existing_footprint_name]['score'] = footprint_score
+						peak_footprints[existing_footprint_name]['len'] = old_end - footprint_start
 					#else do nothing
 					save_current_footprint = False
 					break
@@ -270,6 +273,11 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe
 
 	return peak_footprints, footprint_count
 
+def check_for_overlap_and_merge(peak_footprints):
+	for footprint_to_check in peak_footprints.keys():
+		print(footprint_to_check, peak_footprints[footprint_to_check])
+	return peak_footprints
+
 #this function uses the information provided from the .bed file to look for footprints within the peaks of interest
 #as input the information from the original bed file, as well as bigwig file is needed
 #the optional parameters window_length, step and percentage are needed as well to use the sliding window algorithm and work with the "background" score
@@ -295,9 +303,12 @@ def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage)
 			peak_end = int(positions[1])
 
 			scores_in_peak = np.nan_to_num(np.array(list(bw_open.values(chromosom, peak_start, peak_end)))) #save the scores to an array
-			
+			print()
 			peak_footprints, footprint_count = search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary[header], step, percentage)
 
+			#double check for overlaps and possibly merging of footprints having up to 5 bp in between
+			peak_footprints = check_for_overlap_and_merge(peak_footprints)
+
 			for footprint_name in peak_footprints.keys():
 				all_footprints[footprint_name] = all_footprints.get(footprint_name, {})
 				all_footprints[footprint_name] = peak_footprints[footprint_name]

From c452934c1ff6a68b5621b4a98d44522d7ddc3385 Mon Sep 17 00:00:00 2001
From: anastasiia <anastasiia.petrova@mpi-bn.mpg.de>
Date: Mon, 7 Jan 2019 14:42:18 +0100
Subject: [PATCH 2/4] fixing the bug with overlaps + adding a merging function
 if the footprints have small ammount of bp in between

---
 .../footprints_extraction.py                  | 57 +++++++++++++++++--
 1 file changed, 53 insertions(+), 4 deletions(-)

diff --git a/bin/1.1_footprint_extraction/footprints_extraction.py b/bin/1.1_footprint_extraction/footprints_extraction.py
index e636c91..bcec148 100644
--- a/bin/1.1_footprint_extraction/footprints_extraction.py
+++ b/bin/1.1_footprint_extraction/footprints_extraction.py
@@ -273,10 +273,59 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe
 
 	return peak_footprints, footprint_count
 
-def check_for_overlap_and_merge(peak_footprints):
+def check_and_merge(peak_footprints):
+	peak_footprints_new = {}
+
 	for footprint_to_check in peak_footprints.keys():
-		print(footprint_to_check, peak_footprints[footprint_to_check])
-	return peak_footprints
+		start_to_check = peak_footprints[footprint_to_check]['start']
+		end_to_check = peak_footprints[footprint_to_check]['end']
+
+		#if not peak_footprints_new: #the dictionary is empty, save the first footprint
+		#	peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, [])
+		#	peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check]
+		#print(peak_footprints_new.keys())
+
+		merge_footprints_left = None
+
+		for compared_footprint in peak_footprints.keys():
+
+			if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] <= 5: #5 is hardcoded !!!!!! :(
+				#make compared_footprint longer
+				print("MERGE", footprint_to_check, compared_footprint)
+				merge_footprints_left = False
+				break
+			elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check <= 5:
+				#make footprint_to_check longer
+				print("MERGE", footprint_to_check, compared_footprint)
+				merge_footprints_left = True
+				break
+
+		if merge_footprints_left: #if it is true
+			print("merging left")
+			if start_to_check < peak_footprints[compared_footprint]['start'] and compared_footprint in peak_footprints_new.keys():
+				print("make longer")
+				print("before merging" + str(peak_footprints_new[compared_footprint]))
+				peak_footprints_new[compared_footprint]['start'] = start_to_check
+				peak_footprints_new[compared_footprint]['len'] = peak_footprints[compared_footprint]['end'] - start_to_check
+				peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2
+				print(peak_footprints_new[compared_footprint])
+
+		elif merge_footprints_left == False:
+			print("merging right")
+			if end_to_check > peak_footprints[compared_footprint]['end'] and compared_footprint in peak_footprints_new.keys():
+				print("make longer")
+				print("before merging" + str(peak_footprints_new[compared_footprint]))
+				peak_footprints_new[compared_footprint]['end'] = end_to_check
+				peak_footprints_new[compared_footprint]['len'] = end_to_check - peak_footprints[compared_footprint]['end']
+				peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2
+				print(peak_footprints_new[compared_footprint])
+		else:
+			print("saving " + footprint_to_check)
+			peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, [])
+			peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check]
+
+	#sys.exit()
+	return peak_footprints_new
 
 #this function uses the information provided from the .bed file to look for footprints within the peaks of interest
 #as input the information from the original bed file, as well as bigwig file is needed
@@ -307,7 +356,7 @@ def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage)
 			peak_footprints, footprint_count = search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary[header], step, percentage)
 
 			#double check for overlaps and possibly merging of footprints having up to 5 bp in between
-			peak_footprints = check_for_overlap_and_merge(peak_footprints)
+			peak_footprints = check_and_merge(peak_footprints)
 
 			for footprint_name in peak_footprints.keys():
 				all_footprints[footprint_name] = all_footprints.get(footprint_name, {})

From 7a5d6df58f7147b7d8aedd8cea9c2d9435db1cc2 Mon Sep 17 00:00:00 2001
From: anastasiia <anastasiia.petrova@mpi-bn.mpg.de>
Date: Mon, 7 Jan 2019 15:11:08 +0100
Subject: [PATCH 3/4] allowing user to set the parameter for max bp allowed in
 between the footprints; also adding some comments

---
 .../footprints_extraction.py                  | 53 +++++++++----------
 1 file changed, 25 insertions(+), 28 deletions(-)

diff --git a/bin/1.1_footprint_extraction/footprints_extraction.py b/bin/1.1_footprint_extraction/footprints_extraction.py
index bcec148..3e94956 100644
--- a/bin/1.1_footprint_extraction/footprints_extraction.py
+++ b/bin/1.1_footprint_extraction/footprints_extraction.py
@@ -43,6 +43,7 @@ def parse_args():
 	parser.add_argument('--window_length', default=200, type=int, help='Please enter the length for a window, by defauld 200 bp.')
 	parser.add_argument('--step', default=100, type=int, help='Please enter a step to move the window, by default 100 bp.')
 	parser.add_argument('--percentage',  default=0, type=int, help='Please enter a percentage to be added to background while searching for footprints, by default 0%%.')
+	parser.add_argument('--max_bp_between', default=6, type=int, help='Please enter the number of bp allowed to be in between two footprints, by default 6 bp.')
 	parser.add_argument('--silent', action='store_true', help='While working with data write the information only into ./footprints_extraction.log.')
 	args = parser.parse_args()
 
@@ -273,65 +274,62 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe
 
 	return peak_footprints, footprint_count
 
-def check_and_merge(peak_footprints):
+#this function checks if there are footprints with a small amount of bp in between, and if so, merges them together
+#if at this point there are still overlaps of footprints, this function will delete them
+#the input parameter are: dictionary with footprints within one peak, and the max number of bp allowed to be in between the footprints
+#the output is the renewed dictionary containing only the best footprints for the output file
+def check_and_merge(peak_footprints, max_bp_between):
 	peak_footprints_new = {}
 
 	for footprint_to_check in peak_footprints.keys():
 		start_to_check = peak_footprints[footprint_to_check]['start']
 		end_to_check = peak_footprints[footprint_to_check]['end']
 
-		#if not peak_footprints_new: #the dictionary is empty, save the first footprint
-		#	peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, [])
-		#	peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check]
-		#print(peak_footprints_new.keys())
-
 		merge_footprints_left = None
 
 		for compared_footprint in peak_footprints.keys():
 
-			if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] <= 5: #5 is hardcoded !!!!!! :(
+			if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] < max_bp_between:
 				#make compared_footprint longer
-				print("MERGE", footprint_to_check, compared_footprint)
 				merge_footprints_left = False
 				break
-			elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check <= 5:
+			elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check < max_bp_between:
 				#make footprint_to_check longer
-				print("MERGE", footprint_to_check, compared_footprint)
 				merge_footprints_left = True
 				break
 
-		if merge_footprints_left: #if it is true
-			print("merging left")
-			if start_to_check < peak_footprints[compared_footprint]['start'] and compared_footprint in peak_footprints_new.keys():
-				print("make longer")
-				print("before merging" + str(peak_footprints_new[compared_footprint]))
+		if merge_footprints_left: #if the merging left is enabled
+			#check if this footprint can be merged with the compared_footprint
+			#if compared footprint is not in peak_footprint_new.keys(), the next loop will check for this footprint. There is no need for doulbe check now
+			if start_to_check < peak_footprints[compared_footprint]['start'] and compared_footprint in peak_footprints_new.keys(): 
+				#update the start position
 				peak_footprints_new[compared_footprint]['start'] = start_to_check
+				#update the length
 				peak_footprints_new[compared_footprint]['len'] = peak_footprints[compared_footprint]['end'] - start_to_check
+				#update the score
 				peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2
-				print(peak_footprints_new[compared_footprint])
 
-		elif merge_footprints_left == False:
-			print("merging right")
+		elif merge_footprints_left == False: #otherwise merge right
+			#check if the merging is possible
 			if end_to_check > peak_footprints[compared_footprint]['end'] and compared_footprint in peak_footprints_new.keys():
-				print("make longer")
-				print("before merging" + str(peak_footprints_new[compared_footprint]))
+				#update the end position
 				peak_footprints_new[compared_footprint]['end'] = end_to_check
+				#update the length
 				peak_footprints_new[compared_footprint]['len'] = end_to_check - peak_footprints[compared_footprint]['end']
+				#update the score
 				peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2
-				print(peak_footprints_new[compared_footprint])
-		else:
-			print("saving " + footprint_to_check)
+
+		else: #save the current footprint, as it should not be merged
 			peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, [])
 			peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check]
 
-	#sys.exit()
 	return peak_footprints_new
 
 #this function uses the information provided from the .bed file to look for footprints within the peaks of interest
 #as input the information from the original bed file, as well as bigwig file is needed
 #the optional parameters window_length, step and percentage are needed as well to use the sliding window algorithm and work with the "background" score
 #the output of this function is a dictionary contains all the found footprints ready to write to the output file
-def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage):
+def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage, max_bp_between):
 
 	logger.info('Looking for footprints within peaks...')
 
@@ -352,11 +350,10 @@ def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage)
 			peak_end = int(positions[1])
 
 			scores_in_peak = np.nan_to_num(np.array(list(bw_open.values(chromosom, peak_start, peak_end)))) #save the scores to an array
-			print()
 			peak_footprints, footprint_count = search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary[header], step, percentage)
 
 			#double check for overlaps and possibly merging of footprints having up to 5 bp in between
-			peak_footprints = check_and_merge(peak_footprints)
+			peak_footprints = check_and_merge(peak_footprints, max_bp_between)
 
 			for footprint_name in peak_footprints.keys():
 				all_footprints[footprint_name] = all_footprints.get(footprint_name, {})
@@ -441,7 +438,7 @@ def main():
 	logger.info("The script footprints_extraction.py was called using these parameters: " + str(vars(args)))
 	
 	bed_dictionary = make_bed_dictionary(args.bed)
-	all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage)
+	all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage, args.max_bp_between)
 	write_to_bed_file(all_footprints, args.output_file)
 
 	logger.info("the number of peaks: " + str(len(bed_dictionary)))

From 0308a0c8a3657769c235b20fff6803aa8b36a38f Mon Sep 17 00:00:00 2001
From: anastasiia <anastasiia.petrova@mpi-bn.mpg.de>
Date: Tue, 8 Jan 2019 12:28:29 +0100
Subject: [PATCH 4/4] =?UTF-8?q?changing=20the=20name=20of=20footprints=20f?=
 =?UTF-8?q?rom=20footprint=5F=20to=20f=5F=20to=20make=20sure=20the=20scrip?=
 =?UTF-8?q?t=20of=20Ren=C3=A9=20works=20well?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 bin/1.1_footprint_extraction/footprints_extraction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/1.1_footprint_extraction/footprints_extraction.py b/bin/1.1_footprint_extraction/footprints_extraction.py
index 3e94956..d1b0f8f 100644
--- a/bin/1.1_footprint_extraction/footprints_extraction.py
+++ b/bin/1.1_footprint_extraction/footprints_extraction.py
@@ -195,7 +195,7 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom
 
 		if save_current_footprint == True:
 			#make sure to save this footprint			
-			footprint_name = "footprint_" + str(footprint_count)
+			footprint_name = "f_" + str(footprint_count)
 
 			peak_footprints[footprint_name] = peak_footprints.get(footprint_name, {})
 			peak_footprints[footprint_name] = {'chromosom': chromosom, 'start': footprint_start, 'end': footprint_end, 'score': footprint_score, 'len': len(footprint_scores), 'bonus': bonus_info_from_bed, 'max_pos': max_pos}