Skip to content

Improving comments in the footprints_extraction.py script #24

Merged
merged 3 commits into from
Jan 3, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
127 changes: 84 additions & 43 deletions bin/footprints_extraction.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

"""
call_peaks uses the uncontinuous score from a bigWig file to estimate footpints within peaks of interest.
footprints_extraction script uses the uncontinuous score from a bigWig file to estimate footpints within peaks of interest.
@author: Anastasiia Petrova
@contact: anastasiia.petrova(at)mpi-bn.mpg.de
"""
Expand All @@ -16,11 +16,16 @@
import pyBigWig #to work with bigWig files
import textwrap #to print the help message nice

logger = logging.getLogger('call_peaks')
#the logger should be callable from all functions, so create it here
logger = logging.getLogger('footprints_extraction')
logger.setLevel(logging.INFO)

formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M")

#the function parse_args() reads the input from user, including required and optional parameters
#if user defines no values for required parameters, the error will be printed and the script exits
#if user defines no values for optional parameters, the dafault values will be used
#this function returns a structure called args containing values for all parameters
def parse_args():

parser = argparse.ArgumentParser(prog = '', description = textwrap.dedent('''
Expand All @@ -29,47 +34,55 @@ def parse_args():
'''), epilog='That is what you need to make this script work for you. Enjoy it')

required_arguments = parser.add_argument_group('required arguments')
required_arguments.add_argument('--bigwig', help='a bigWig-file with scores', required=True)
required_arguments.add_argument('--bed', help='provide a file with peaks in .bed format', required=True)
required_arguments.add_argument('--bigwig', help='Please provide a path to the bigWig-file with scores.', required=True)
required_arguments.add_argument('--bed', help='Please provide a path to the file with peaks in .bed format.', required=True)

#all other arguments are optional
#parser.add_argument('--output_directory', default='output', const='output', nargs='?', help='output directory, by default ./output/')
parser.add_argument('--output_file', default='call_peaks_output.bed', type=str, help='enter a name for the output file, by default ./call_peaks_output.bed')
parser.add_argument('--window_length', default=200, type=int, help='enter the length for a window, by defauld 200 bp')
parser.add_argument('--step', default=100, type=int, help='enter a step to move the window, by default 100 bp')
parser.add_argument('--percentage', default=0, type=int, help='enter a percentage to be added to background while searching for footprints, by default 0%')
parser.add_argument('--silent', action='store_true', help='while working with data write the information only into ./call_peaks_log.log')
#parser.add_argument('--output_directory', default='output', const='output', nargs='?', help='The output directory, by default ./output/')
parser.add_argument('--output_file', default='footprints_extraction.bed', type=str, help='Please enter a name for the output file, by default ./footprints_extraction.bed.')
parser.add_argument('--window_length', default=200, type=int, help='Please enter the length for a window, by defauld 200 bp.')
parser.add_argument('--step', default=100, type=int, help='Please enter a step to move the window, by default 100 bp.')
parser.add_argument('--percentage', default=0, type=int, help='Please enter a percentage to be added to background while searching for footprints, by default 0%%.')
parser.add_argument('--silent', action='store_true', help='While working with data write the information only into ./footprints_extraction.log.')
args = parser.parse_args()

return args

#this function is currently unused, but may be used in the future version
#if there are several files producing by this script, the directory containing these files is useful to have
#this function checks if the directory for the output files exists and if not, the new directory with default name will be created
def check_directory(directory):
if not os.path.exists(directory):
os.makedirs(directory)
print('a new directory ' + directory + ' was created')
print('The new directory ' + directory + ' was created!')

#check if the file to remove exists and if so, delete it
#if the file does not exist, nothing happens
def remove_file(file):
if os.path.isfile(file):
os.remove(file)

#check if the input files exist. We have no possibility to check if the format of the files is right here though
#check if the input files exist, we will check for the right format of input files later on
#the input parameter args is a structure from parse_args, containing the input parameters
#if input files dont exist, the short message will be printed and the exit is forced
def check_existing_input_files(args):

#check if the bigWig file exists
if not os.path.isfile(args.bigwig):
print('there is no such bigWig file ' + args.bigwig + ', the exit is forced')
print('Error: there is no such bigWig file ' + args.bigwig + ', the exit is forced!')
sys.exit()
#check if the bed file exists
if not os.path.isfile(args.bed):
print('there is no such bed file ' + args.bed + ', the exit is forced')
print('Error: there is no such bed file ' + args.bed + ', the exit is forced!')
sys.exit()

#make a dictionary out of the .bed file for easy access to the information from the .bed file
#the input for this function is a path to the bed file
#the output is a dictionary containing key and values from the given bed file
#key looks like chr1:123-234, the value contains all the additional information from the bed file
def make_bed_dictionary(bed_file):

logger.info('reading of the bed file')
logger.info('Reading of the bed file...')

try: #if i can't proceede this file like so, the input was not a .bed file!
bed_dictionary = {}
Expand All @@ -85,28 +98,33 @@ def make_bed_dictionary(bed_file):

bed_dictionary[key] = value
else: #this is not a bed file, force exit
logger.info('please make sure the input bed file has a right format, the problem occured on the line ' + bed_line)
logger.info('Error: please make sure the input bed file has a right format, the problem occured on the line ' + bed_line)
sys.exit()

read_bed_file.close()

return bed_dictionary

except UnicodeDecodeError: #force exit, if input is not a .bed file
logger.info('please make sure that the .bed file has a right format! The exit is forced')
logger.info('Error: please make sure that the .bed file has a right format! The exit is forced!')
sys.exit()

#to save a footprint, find the score and max_pos for this footprint and check for overlapping with other footprints within the current peak
#the input for this function is the count for footprint to ensure unique name for each saved footprint;
#an array containing scores (signals) from the bigwig file; the footprints already saved for the particular peak; information for the bed file:
#chromosom, start and end position, as well as additional information from the original bed file
#the function returns the count for footprints, as well as footprints for the current peak
def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom, footprint_start, footprint_end, bonus_info_from_bed):

save_current_footprint = False

if len(footprint_scores) > 2: #exclude small regions to not work with them

#make sure we are working only with regions bigger than 2, in other case we have not enough information to find the footprint
if len(footprint_scores) > 2:

#calculate the position with the max score. if there are many positions with the same score, save one from the middle

first_max_pos = footprint_scores.index(max(footprint_scores))
last_max_pos = first_max_pos #assume that there is only one pos with max score
#assume that there is only one pos with max score
last_max_pos = first_max_pos

#find the region with the highest score
for j in range(first_max_pos, len(footprint_scores)):
Expand Down Expand Up @@ -138,7 +156,8 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom
if footprint_start >= old_start and footprint_start <= old_end: #the start of the new footprint is between the start and end of an old footprint
if footprint_end > old_end: #the new footprint is not completely inside the old one
#update the information about the existing footprint
footprint_score = (peak_footprints[existing_footprint_name]['score'] + footprint_score) / 2 #find the average of both scores
#find the average of both scores
footprint_score = (peak_footprints[existing_footprint_name]['score'] + footprint_score) / 2

peak_footprints[existing_footprint_name]['end'] = footprint_end
peak_footprints[existing_footprint_name]['score'] = footprint_score
Expand All @@ -150,7 +169,8 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom
elif footprint_end >= old_start and footprint_end <= old_end: #the end of the new footprint is between the start and end of an old footprint
if footprint_start < old_start: #the new footprint is not completely inside the old one
#update the information about the existing footprint
footprint_score = (peak_footprints[existing_footprint_name]['score'] + footprint_score) / 2 #find the average of both scores
#find the average of both scores
footprint_score = (peak_footprints[existing_footprint_name]['score'] + footprint_score) / 2

peak_footprints[existing_footprint_name]['start'] = footprint_start
peak_footprints[existing_footprint_name]['score'] = footprint_score
Expand Down Expand Up @@ -180,12 +200,17 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom

return footprint_count, peak_footprints

#apply slide window searching to estimate regions that are higher than the background and can possibly be footprints
#this function uses a slide window algorithm to estimate regions where the signal is higher than the background and can possibly be footprints
#as an input this function takes: the footprints already saved for the current peak; the count for footprints to ensure unique name;
#the information for the bed file (chromosom, start- and end-positions of peak); an array with scores within the peak; the length of the window;
#as well as the dictionary containing the information from the original bed file and the optional parameters such as step to slide the window and percentage to change the background
#this function returns the array containing the found footprints within a peak as well as the footprint count to look for footprints in the next peak and ensure the unique names for footprints
def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary_entry, step, percentage):


#find the length of the current peak
peak_len = len(scores_in_peak)
parts = []
parts_positions = []
parts = [] #an array to save scores of parts
parts_positions = [] #an array to save where each part begins in the original peak

#if necessary divide the peak with help of a sliding window
if peak_len <= window_length:
Expand Down Expand Up @@ -243,12 +268,15 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe

return peak_footprints, footprint_count

#use the information provided from the .bed file to look for footprints within the peaks of interest
#this function uses the information provided from the .bed file to look for footprints within the peaks of interest
#as input the information from the original bed file, as well as bigwig file is needed
#the optional parameters window_length, step and percentage are needed as well to use the sliding window algorithm and work with the "background" score
#the output of this function is a dictionary contains all the found footprints ready to write to the output file
def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage):

logger.info('looking for footprints within peaks')
logger.info('Looking for footprints within peaks...')

try:
try: #this try considers opening the bigwig file
footprint_count = 1
all_footprints = {}

Expand Down Expand Up @@ -277,20 +305,33 @@ def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage)
return all_footprints

except RuntimeError: #if i can't work with the bigwig file like so, it was not a bigwig file!
logger.info('please make sure that the .bigWig file has a right format! The exit is forced')
logger.info('Error: please make sure that the .bigWig file has a right format! The exit is forced!')
sys.exit()

#write the found footprints to the .bed file
#this function writes the found footprints to the .bed file
#as input the dictionary with all footprints and the name for the output file are needed
#the function outputs first an unsorted file and finally sorts it, removing the unsorted one
def write_to_bed_file(all_footprints, sorted_output_file_name):
output_file_name = "not_sorted_" + sorted_output_file_name #save in the working directory

#extract the name of directory where we want to save the file
output_directory = os.path.dirname(sorted_output_file_name)

#make sure the file is in .bed format
output_name = "not_sorted_" + os.path.splitext(os.path.basename(sorted_output_file_name))[0] + ".bed"

header = ["#chr", "start", "end", "name", "score", "strand", "len", "max_pos", "bonus_info"] #a header to know what is in the columns
#save the output file in the working directory or in the directory provided by the user
output_file_name = (os.path.join(output_directory, output_name))

#a header to know what is in the columns
header = ["#chr", "start", "end", "name", "score", "strand", "len", "max_pos", "bonus_info"]

output_file = open(output_file_name, 'w') #open a file to write
#open a file to write
output_file = open(output_file_name, 'w')

logger.info("print to the output file")
logger.info("Printing to the output file...")

output_file.write('\t'.join(header) + '\n') #write the header
#write the header
output_file.write('\t'.join(header) + '\n')

#write each footprint line for line to the output file
for footprint in all_footprints:
Expand All @@ -299,11 +340,11 @@ def write_to_bed_file(all_footprints, sorted_output_file_name):
output_file.close()

#sort the bed file
logger.info('sorting the output file')
logger.info('Sorting the output file...')

os.system("(head -n 2 " + output_file_name + " && tail -n +3 " + output_file_name + " | sort -k1,1V -k2,2n -k3,3n) > " + sorted_output_file_name)

logger.info('remove the non-sorted file')
logger.info('Removing the non-sorted file...')

remove_file(output_file_name)

Expand All @@ -317,8 +358,8 @@ def main():
#check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user
#check_directory(args.output_directory)

#fh = logging.FileHandler(os.path.join(args.output_directory, "call_peaks_log.log"))
fh = logging.FileHandler("call_peaks_log.log")
#fh = logging.FileHandler(os.path.join(args.output_directory, "footprints_extraction.log"))
fh = logging.FileHandler("footprints_extraction.log")
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)
Expand All @@ -332,7 +373,7 @@ def main():
if args.silent:
logger.removeHandler(ch)

logger.info("call_peaks.py was called using these parameters: " + str(vars(args)))
logger.info("The script footprints_extraction.py was called using these parameters: " + str(vars(args)))

bed_dictionary = make_bed_dictionary(args.bed)
all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage)
Expand All @@ -341,7 +382,7 @@ def main():
logger.info("the number of peaks: " + str(len(bed_dictionary)))
logger.info("the number of footprints: " + str(len(all_footprints)))

logger.info("call_peaks needed %s minutes to generate the output" % (round((time.time() - start)/60, 2)))
logger.info("It took footprints_extraction.py %s minutes to generate the output." % (round((time.time() - start)/60, 2)))

for handler in logger.handlers:
handler.close()
Expand Down