From 9a587557b8254044b6a4679e409e3e5eeb407a8b Mon Sep 17 00:00:00 2001
From: anastasiia <anastasiia.petrova@mni.thm.de>
Date: Thu, 27 Dec 2018 21:34:56 +0100
Subject: [PATCH 1/3] Improving comments

Answer for the issue #22
---
 bin/footprints_extraction.py | 122 ++++++++++++++++++++++-------------
 1 file changed, 78 insertions(+), 44 deletions(-)

diff --git a/bin/footprints_extraction.py b/bin/footprints_extraction.py
index 3e73eb5..df7f567 100644
--- a/bin/footprints_extraction.py
+++ b/bin/footprints_extraction.py
@@ -1,6 +1,6 @@
 
 """
-call_peaks uses the uncontinuous score from a bigWig file to estimate footpints within peaks of interest.
+footprints_extraction script uses the uncontinuous score from a bigWig file to estimate footpints within peaks of interest.
 @author: Anastasiia Petrova
 @contact: anastasiia.petrova(at)mpi-bn.mpg.de
 """
@@ -16,11 +16,16 @@
 import pyBigWig #to work with bigWig files
 import textwrap #to print the help message nice
 
-logger = logging.getLogger('call_peaks')
+#the logger should be callable from all functions, so create it here
+logger = logging.getLogger('footprints_extraction')
 logger.setLevel(logging.INFO)
 
 formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M")
 
+#the function parse_args() reads the input from user, including required and optional parameters
+#if user defines no values for required parameters, the error will be printed and the script exits
+#if user defines no values for optional parameters, the dafault values will be used
+#this function returns a structure called args containing values for all parameters
 def parse_args():
 	
 	parser = argparse.ArgumentParser(prog = '', description = textwrap.dedent('''                           
@@ -29,47 +34,55 @@ def parse_args():
 		'''), epilog='That is what you need to make this script work for you. Enjoy it')
 	
 	required_arguments = parser.add_argument_group('required arguments')
-	required_arguments.add_argument('--bigwig', help='a bigWig-file with scores', required=True)
-	required_arguments.add_argument('--bed', help='provide a file with peaks in .bed format', required=True)
+	required_arguments.add_argument('--bigwig', help='Please provide a path to the bigWig-file with scores.', required=True)
+	required_arguments.add_argument('--bed', help='Please provide a path to the file with peaks in .bed format.', required=True)
 
 	#all other arguments are optional
-	#parser.add_argument('--output_directory',  default='output', const='output', nargs='?', help='output directory, by default ./output/')
-	parser.add_argument('--output_file', default='call_peaks_output.bed', type=str, help='enter a name for the output file, by default ./call_peaks_output.bed')
-	parser.add_argument('--window_length', default=200, type=int, help='enter the length for a window, by defauld 200 bp')
-	parser.add_argument('--step', default=100, type=int, help='enter a step to move the window, by default 100 bp')
-	parser.add_argument('--percentage',  default=0, type=int, help='enter a percentage to be added to background while searching for footprints, by default 0%')
-	parser.add_argument('--silent', action='store_true', help='while working with data write the information only into ./call_peaks_log.log')
+	#parser.add_argument('--output_directory',  default='output', const='output', nargs='?', help='The output directory, by default ./output/')
+	parser.add_argument('--output_file', default='footprints_extraction.bed', type=str, help='Please enter a name for the output file, by default ./footprints_extraction.bed.')
+	parser.add_argument('--window_length', default=200, type=int, help='Please enter the length for a window, by defauld 200 bp.')
+	parser.add_argument('--step', default=100, type=int, help='Please enter a step to move the window, by default 100 bp.')
+	parser.add_argument('--percentage',  default=0, type=int, help='Please enter a percentage to be added to background while searching for footprints, by default 0%.')
+	parser.add_argument('--silent', action='store_true', help='While working with data write the information only into ./footprints_extraction.log.')
 	args = parser.parse_args()
 
 	return args
 
 #this function is currently unused, but may be used in the future version
+#if there are several files producing by this script, the directory containing these files is useful to have
+#this function checks if the directory for the output files exists and if not, the new directory with default name will be created
 def check_directory(directory):
 	if not os.path.exists(directory):
 		os.makedirs(directory)
-		print('a new directory ' + directory + ' was created')
+		print('The new directory ' + directory + ' was created!')
 
 #check if the file to remove exists and if so, delete it
+#if the file does not exist, nothing happens
 def remove_file(file):
 	if os.path.isfile(file):
 		os.remove(file)
 
-#check if the input files exist. We have no possibility to check if the format of the files is right here though
+#check if the input files exist, we will check for the right format of input files later on
+#the input parameter args is a structure from parse_args, containing the input parameters
+#if input files dont exist, the short message will be printed and the exit is forced
 def check_existing_input_files(args):
 
 	#check if the bigWig file exists
 	if not os.path.isfile(args.bigwig):
-		print('there is no such bigWig file ' + args.bigwig + ', the exit is forced')
+		print('Error: there is no such bigWig file ' + args.bigwig + ', the exit is forced!')
 		sys.exit()
 	#check if the bed file exists
 	if not os.path.isfile(args.bed):
-		print('there is no such bed file ' + args.bed + ', the exit is forced')
+		print('Error: there is no such bed file ' + args.bed + ', the exit is forced!')
 		sys.exit()
 
 #make a dictionary out of the .bed file for easy access to the information from the .bed file
+#the input for this function is a path to the bed file
+#the output is a dictionary containing key and values from the given bed file
+#key looks like chr1:123-234, the value contains all the additional information from the bed file
 def make_bed_dictionary(bed_file):
 
-	logger.info('reading of the bed file')
+	logger.info('Reading of the bed file...')
 
 	try: #if i can't proceede this file like so, the input was not a .bed file!
 		bed_dictionary = {}
@@ -85,7 +98,7 @@ def make_bed_dictionary(bed_file):
 
 					bed_dictionary[key] = value
 				else: #this is not a bed file, force exit
-					logger.info('please make sure the input bed file has a right format, the problem occured on the line ' + bed_line)
+					logger.info('Error: please make sure the input bed file has a right format, the problem occured on the line ' + bed_line)
 					sys.exit()
 
 		read_bed_file.close()
@@ -93,20 +106,25 @@ def make_bed_dictionary(bed_file):
 		return bed_dictionary
 
 	except UnicodeDecodeError: #force exit, if input is not a .bed file
-		logger.info('please make sure that the .bed file has a right format! The exit is forced')
+		logger.info('Error: please make sure that the .bed file has a right format! The exit is forced!')
 		sys.exit()
 
 #to save a footprint, find the score and max_pos for this footprint and check for overlapping with other footprints within the current peak
+#the input for this function is the count for footprint to ensure unique name for each saved footprint;
+#an array containing scores (signals) from the bigwig file; the footprints already saved for the particular peak; information for the bed file:
+#chromosom, start and end position, as well as additional information from the original bed file
+#the function returns the count for footprints, as well as footprints for the current peak
 def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom, footprint_start, footprint_end, bonus_info_from_bed):
 
 	save_current_footprint = False
 
-	if len(footprint_scores) > 2: #exclude small regions to not work with them
-
+	#make sure we are working only with regions bigger than 2, in other case we have not enough information to find the footprint
+	if len(footprint_scores) > 2: 
+		
 		#calculate the position with the max score. if there are many positions with the same score, save one from the middle
-
 		first_max_pos = footprint_scores.index(max(footprint_scores))
-		last_max_pos = first_max_pos #assume that there is only one pos with max score
+		#assume that there is only one pos with max score
+		last_max_pos = first_max_pos
 
 		#find the region with the highest score
 		for j in range(first_max_pos, len(footprint_scores)):
@@ -138,7 +156,8 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom
 				if footprint_start >= old_start and footprint_start <= old_end: #the start of the new footprint is between the start and end of an old footprint
 					if footprint_end > old_end: #the new footprint is not completely inside the old one
 						#update the information about the existing footprint
-						footprint_score = (peak_footprints[existing_footprint_name]['score'] + footprint_score) / 2 #find the average of both scores
+						#find the average of both scores
+						footprint_score = (peak_footprints[existing_footprint_name]['score'] + footprint_score) / 2
 
 						peak_footprints[existing_footprint_name]['end'] = footprint_end
 						peak_footprints[existing_footprint_name]['score'] = footprint_score
@@ -150,7 +169,8 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom
 				elif footprint_end >= old_start and footprint_end <= old_end: #the end of the new footprint is between the start and end of an old footprint
 					if footprint_start < old_start: #the new footprint is not completely inside the old one
 						#update the information about the existing footprint
-						footprint_score = (peak_footprints[existing_footprint_name]['score'] + footprint_score) / 2 #find the average of both scores
+						#find the average of both scores
+						footprint_score = (peak_footprints[existing_footprint_name]['score'] + footprint_score) / 2
 
 						peak_footprints[existing_footprint_name]['start'] = footprint_start
 						peak_footprints[existing_footprint_name]['score'] = footprint_score
@@ -180,12 +200,17 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom
 
 	return footprint_count, peak_footprints
 
-#apply slide window searching to estimate regions that are higher than the background and can possibly be footprints
+#this function uses a slide window algorithm to estimate regions where the signal is higher than the background and can possibly be footprints
+#as an input this function takes: the footprints already saved for the current peak; the count for footprints to ensure unique name;
+#the information for the bed file (chromosom, start- and end-positions of peak); an array with scores within the peak; the length of the window;
+#as well as the dictionary containing the information from the original bed file and the optional parameters such as step to slide the window and percentage to change the background
+#this function returns the array containing the found footprints within a peak as well as the footprint count to look for footprints in the next peak and ensure the unique names for footprints
 def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary_entry, step, percentage):
-
+	
+	#find the length of the current peak
 	peak_len = len(scores_in_peak)
-	parts = []
-	parts_positions = []
+	parts = [] #an array to save scores of parts
+	parts_positions = [] #an array to save where each part begins in the original peak
 
 	#if necessary divide the peak with help of a sliding window
 	if peak_len <= window_length:
@@ -243,12 +268,15 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe
 
 	return peak_footprints, footprint_count
 
-#use the information provided from the .bed file to look for footprints within the peaks of interest
+#this function uses the information provided from the .bed file to look for footprints within the peaks of interest
+#as input the information from the original bed file, as well as bigwig file is needed
+#the optional parameters window_length, step and percentage are needed as well to use the sliding window algorithm and work with the "background" score
+#the output of this function is a dictionary contains all the found footprints ready to write to the output file
 def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage):
 
-	logger.info('looking for footprints within peaks')
+	logger.info('Looking for footprints within peaks...')
 
-	try:
+	try: #this try considers opening the bigwig file
 		footprint_count = 1
 		all_footprints = {}
 
@@ -277,20 +305,26 @@ def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage)
 		return all_footprints
 
 	except RuntimeError: #if i can't work with the bigwig file like so, it was not a bigwig file!
-		logger.info('please make sure that the .bigWig file has a right format! The exit is forced')
+		logger.info('Error: please make sure that the .bigWig file has a right format! The exit is forced!')
 		sys.exit()
 
-#write the found footprints to the .bed file
+#this function writes the found footprints to the .bed file
+#as input the dictionary with all footprints and the name for the output file are needed
+#the function outputs first an unsorted file and finally sorts it, removing the unsorted one
 def write_to_bed_file(all_footprints, sorted_output_file_name):
-	output_file_name = "not_sorted_" + sorted_output_file_name #save in the working directory
-
-	header = ["#chr", "start", "end", "name", "score", "strand", "len", "max_pos", "bonus_info"] #a header to know what is in the columns
+	#save the output file in the working directory
+	output_file_name = "not_sorted_" + sorted_output_file_name
+	
+	#a header to know what is in the columns
+	header = ["#chr", "start", "end", "name", "score", "strand", "len", "max_pos", "bonus_info"]
 
-	output_file = open(output_file_name, 'w') #open a file to write
+	#open a file to write
+	output_file = open(output_file_name, 'w')
 
-	logger.info("print to the output file")
+	logger.info("Printing to the output file...")
 
-	output_file.write('\t'.join(header) + '\n') #write the header
+	#write the header
+	output_file.write('\t'.join(header) + '\n')
 
 	#write each footprint line for line to the output file
 	for footprint in all_footprints:
@@ -299,11 +333,11 @@ def write_to_bed_file(all_footprints, sorted_output_file_name):
 	output_file.close()
 
 	#sort the bed file
-	logger.info('sorting the output file')
+	logger.info('Sorting the output file...')
 
 	os.system("(head -n 2 " + output_file_name + " && tail -n +3 " + output_file_name + " | sort -k1,1V -k2,2n -k3,3n) > " + sorted_output_file_name)
 
-	logger.info('remove the non-sorted file')
+	logger.info('Removing the non-sorted file...')
 
 	remove_file(output_file_name)
 
@@ -317,8 +351,8 @@ def main():
 	#check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user
 	#check_directory(args.output_directory)
 
-	#fh = logging.FileHandler(os.path.join(args.output_directory, "call_peaks_log.log"))
-	fh = logging.FileHandler("call_peaks_log.log")
+	#fh = logging.FileHandler(os.path.join(args.output_directory, "footprints_extraction.log"))
+	fh = logging.FileHandler("footprints_extraction.log")
 	fh.setLevel(logging.INFO)
 	fh.setFormatter(formatter)
 	logger.addHandler(fh)
@@ -332,7 +366,7 @@ def main():
 	if args.silent:
 		logger.removeHandler(ch)
 
-	logger.info("call_peaks.py was called using these parameters: " + str(vars(args)))
+	logger.info("The script footprints_extraction.py was called using these parameters: " + str(vars(args)))
 	
 	bed_dictionary = make_bed_dictionary(args.bed)
 	all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage)
@@ -341,7 +375,7 @@ def main():
 	logger.info("the number of peaks: " + str(len(bed_dictionary)))
 	logger.info("the number of footprints: " + str(len(all_footprints)))
 
-	logger.info("call_peaks needed %s minutes to generate the output" % (round((time.time() - start)/60, 2)))
+	logger.info("It took footprints_extraction.py %s minutes to generate the output." % (round((time.time() - start)/60, 2)))
 	
 	for handler in logger.handlers:
 		handler.close()

From e22492251ce7f3af01e8e99b264c3a724108b19c Mon Sep 17 00:00:00 2001
From: anastasiia <anastasiia.petrova@mni.thm.de>
Date: Wed, 2 Jan 2019 21:54:54 +0100
Subject: [PATCH 2/3] fix for the help message called using -h

---
 bin/footprints_extraction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/footprints_extraction.py b/bin/footprints_extraction.py
index df7f567..415e589 100644
--- a/bin/footprints_extraction.py
+++ b/bin/footprints_extraction.py
@@ -42,7 +42,7 @@ def parse_args():
 	parser.add_argument('--output_file', default='footprints_extraction.bed', type=str, help='Please enter a name for the output file, by default ./footprints_extraction.bed.')
 	parser.add_argument('--window_length', default=200, type=int, help='Please enter the length for a window, by defauld 200 bp.')
 	parser.add_argument('--step', default=100, type=int, help='Please enter a step to move the window, by default 100 bp.')
-	parser.add_argument('--percentage',  default=0, type=int, help='Please enter a percentage to be added to background while searching for footprints, by default 0%.')
+	parser.add_argument('--percentage',  default=0, type=int, help='Please enter a percentage to be added to background while searching for footprints, by default 0%%.')
 	parser.add_argument('--silent', action='store_true', help='While working with data write the information only into ./footprints_extraction.log.')
 	args = parser.parse_args()
 

From 36d775627b7028e56d509198c0feeb90c8b256f6 Mon Sep 17 00:00:00 2001
From: anastasiia <anastasiia.petrova@mni.thm.de>
Date: Wed, 2 Jan 2019 23:24:29 +0100
Subject: [PATCH 3/3] fixing the bug with the output directory name

---
 bin/footprints_extraction.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/bin/footprints_extraction.py b/bin/footprints_extraction.py
index 415e589..47f8075 100644
--- a/bin/footprints_extraction.py
+++ b/bin/footprints_extraction.py
@@ -312,8 +312,15 @@ def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage)
 #as input the dictionary with all footprints and the name for the output file are needed
 #the function outputs first an unsorted file and finally sorts it, removing the unsorted one
 def write_to_bed_file(all_footprints, sorted_output_file_name):
-	#save the output file in the working directory
-	output_file_name = "not_sorted_" + sorted_output_file_name
+	
+	#extract the name of directory where we want to save the file
+	output_directory = os.path.dirname(sorted_output_file_name)
+	
+	#make sure the file is in .bed format
+	output_name = "not_sorted_" + os.path.splitext(os.path.basename(sorted_output_file_name))[0] + ".bed"
+
+	#save the output file in the working directory or in the directory provided by the user
+	output_file_name = (os.path.join(output_directory, output_name))
 	
 	#a header to know what is in the columns
 	header = ["#chr", "start", "end", "name", "score", "strand", "len", "max_pos", "bonus_info"]