Skip to content

Commit

Permalink
final commenting for the important functions in the script
Browse files Browse the repository at this point in the history
  • Loading branch information
anastasiia committed Nov 21, 2018
1 parent d810baf commit 492d7f6
Showing 1 changed file with 61 additions and 67 deletions.
128 changes: 61 additions & 67 deletions call_peaks.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,35 +43,19 @@ def parse_args():

return args

#this function is currently unused, but may be used in the future version
def check_directory(directory):
if not os.path.exists(directory):
os.makedirs(directory)
#logger.info('a new directory ' + directory + ' was created')
print('a new directory ' + directory + ' was created')

#if there are chars that are not allowed, they will be replaced with '_', to the possibly invalid names there will be added '_' at the beginning of the name
def check_name(name_to_test):
badchars= re.compile(r'[^A-Za-z0-9_. ]+|^\.|\.$|^ | $|^$')
badnames= re.compile(r'(aux|com[1-9]|con|lpt[1-9]|prn)(\.|$)')

#replace all the chars that are not allowed with '_'
name = badchars.sub('_', name_to_test)

#check for the reserved by the os names
if badnames.match(name):
name = '_' + name
return name

#check if the file to remove exists and if so, delete it
def remove_file(file):
if os.path.isfile(file):
os.remove(file)

def make_name_from_path(full_path, output_directory, ending):
return os.path.join(output_directory, get_name_from_path(full_path) + ending)

def get_name_from_path(full_path):
return os.path.splitext(os.path.basename(full_path))[0]

#check if the input files exist. We have no possibility to check if the format of the files is right here though
def check_existing_input_files(args):

#check if the bigWig file exists
Expand All @@ -84,37 +68,44 @@ def check_existing_input_files(args):
print('there is no such bed file ' + args.bed + ', the exit is forced')
sys.exit()

#make a dictionary out of the .bed file for easy access to the information from the .bed file
def make_bed_dictionary(bed_file):

logger.info('reading of the bed file')

bed_dictionary = {}
try: #if i can't proceede this file like so, the input was not a .bed file!
bed_dictionary = {}

with open(bed_file) as read_bed_file:
for bed_line in read_bed_file:
bed_line_array = re.split(r'\t', bed_line.rstrip('\n'))
if bed_line_array[1].isdigit() and bed_line_array[2].isdigit() and int(bed_line_array[1]) <= int(bed_line_array[2]): #in the real bedfile the second column is a start position, and the third column is an end position, so we are checking if these are integers and if the start position is smaller than the end one
key = bed_line_array[0] + ":" + bed_line_array[1] + "-" + bed_line_array[2]
value = []
for i in range(3, len(bed_line_array)):
value.append(bed_line_array[i])

with open(bed_file) as read_bed_file:
for bed_line in read_bed_file:
bed_line_array = re.split(r'\t', bed_line.rstrip('\n'))
if bed_line_array[1].isdigit() and bed_line_array[2].isdigit() and int(bed_line_array[1]) <= int(bed_line_array[2]): #in the real bedfile the second column is a start position, and the third column is an end position, so we are checking if these are integers and if the start position is smaller than the end one
key = bed_line_array[0] + ":" + bed_line_array[1] + "-" + bed_line_array[2]
value = []
for i in range(3, len(bed_line_array)):
value.append(bed_line_array[i])
bed_dictionary[key] = value
else: #this is not a bed file, force exit
logger.info('please make sure the input bed file has a right format, the problem occured on the line ' + bed_line)
sys.exit()

bed_dictionary[key] = value
else: #this is not a bed file, force exit
logger.info('please make sure the input bed file has a right format, the problem occured on the line ' + bed_line)
sys.exit()
read_bed_file.close()

read_bed_file.close()
return bed_dictionary

return bed_dictionary
except UnicodeDecodeError: #force exit, if input is not a .bed file
logger.info('please make sure that the .bed file has a right format! The exit is forced')
sys.exit()

#to save a footprint, find the score and max_pos for this footprint and chech for overlapping with other footprints within the current peak
def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom, footprint_start, footprint_end, bonus_info_from_bed):

save_current_footprint = False

if len(footprint_scores) > 2: #exclude small regions to not work with them

#-------------- max score position
#calculate the position with the max score. if there are many positions with the same score, save one from the middle

first_max_pos = footprint_scores.index(max(footprint_scores))
last_max_pos = first_max_pos #assume that there is only one pos with max score
Expand All @@ -131,14 +122,11 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom
max_pos = int((last_max_pos - first_max_pos) / 2)
else:
max_pos = first_max_pos
#-------------------------------------

#calculate the score for the current footprint as mean of all scores from the bigwig file
footprint_score = np.mean(footprint_scores)

search_key = str(chromosom) + ":" + str(footprint_start) + "-" + str(footprint_end)

#--------------- checking for existing and overlapping footprints

#checking for existing and overlapping footprints
if len(peak_footprints.keys()) == 0: # hey, this is the first footprint!
save_current_footprint = True

Expand Down Expand Up @@ -179,12 +167,10 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom
break

else: #so this is a new footprint that has no overlaps with the others

save_current_footprint = True

if save_current_footprint == True:
#make sure to save this footprint

#make sure to save this footprint
footprint_name = "footprint_" + str(footprint_count)

peak_footprints[footprint_name] = peak_footprints.get(footprint_name, {})
Expand All @@ -196,6 +182,7 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom

return footprint_count, peak_footprints

#apply slide window searching to estimate regions that are higher than the background and can possibly be footprints
def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary_entry, step, percentage):

peak_len = len(scores_in_peak)
Expand All @@ -221,21 +208,20 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe

pos = pos + step

#look in each window and save the footprints
#look in each window and try to save the footprints
for j in range(len(parts)):

window = parts[j]

#-------------- add some percent to the background to avoid the noice we don't want to have
#add some percent to the background to avoid the noice we don't want to have
bw_peak_background = np.mean(window) #find the mean of all scores within one peak
part = (percentage*bw_peak_background)/100 #x procent of the background
bw_peak_background = bw_peak_background + part
#----------------------------------------------------------

check_position = parts_positions[j] #the start position not within the window, but within the peak!!!
footprint_start = check_position #for each footprint
footprint_scores = [] #for each footprint

#look on each position inside the window
for i in range(len(window)):
position = i + 1 #calculate the relative position for a score
score = window[i] #extract one score from the list
Expand All @@ -259,39 +245,46 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe

return peak_footprints, footprint_count

#use the information provided from the .bed file to look for footprints within the peaks of interest
def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage):

logger.info('looking for footprints within peaks')

footprint_count = 1
all_footprints = {}
try:
footprint_count = 1
all_footprints = {}

bw_open = pyBigWig.open(bw_file)

bw_open = pyBigWig.open(bw_file)
for header in bed_dictionary:

for header in bed_dictionary:
peak_footprints = {}

peak_footprints = {}
header_splitted = re.split(r':', header)
chromosom = header_splitted[0]
positions = re.split(r'-', header_splitted[-1])
peak_start = int(positions[0])
peak_end = int(positions[1])

header_splitted = re.split(r':', header)
chromosom = header_splitted[0]
positions = re.split(r'-', header_splitted[-1])
peak_start = int(positions[0])
peak_end = int(positions[1])
scores_in_peak = np.nan_to_num(np.array(list(bw_open.values(chromosom, peak_start, peak_end)))) #save the scores to an array

scores_in_peak = np.nan_to_num(np.array(list(bw_open.values(chromosom, peak_start, peak_end)))) #save the scores to an array
peak_len = len(scores_in_peak)

peak_len = len(scores_in_peak)
peak_footprints, footprint_count = search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary[header], step, percentage)

peak_footprints, footprint_count = search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary[header], step, percentage)
for footprint_name in peak_footprints.keys():
all_footprints[footprint_name] = all_footprints.get(footprint_name, {})
all_footprints[footprint_name] = peak_footprints[footprint_name]

for footprint_name in peak_footprints.keys():
all_footprints[footprint_name] = all_footprints.get(footprint_name, {})
all_footprints[footprint_name] = peak_footprints[footprint_name]
all_footprints = sorted(all_footprints.items(), key = lambda x : (x[1]['chromosom'], x[1]['start']), reverse = False)

all_footprints = sorted(all_footprints.items(), key = lambda x : (x[1]['chromosom'], x[1]['start']), reverse = False)
return all_footprints

return all_footprints
except RuntimeError: #if i can't work with the bigwig file like so, it was not a bigwig file!
logger.info('please make sure that the .bigWig file has a right format! The exit is forced')
sys.exit()

#write the found footprints to the .bed file
def write_to_bed_file(all_footprints, sorted_output_file_name):
output_file_name = "not_sorted_" + sorted_output_file_name #save in the working directory

Expand All @@ -303,6 +296,7 @@ def write_to_bed_file(all_footprints, sorted_output_file_name):

output_file.write('\t'.join(header) + '\n') #write the header

#write each footprint line for line to the output file
for footprint in all_footprints:
output_file.write('\t'.join([footprint[1]['chromosom'], str(footprint[1]['start']), str(footprint[1]['end']), footprint[0], str(round(footprint[1]['score'], 6)), str(footprint[1]['len']), str(footprint[1]['max_pos']), '\t'.join(footprint[1]['bonus'])]) + '\n')

Expand Down Expand Up @@ -342,8 +336,8 @@ def main():
if args.silent:
logger.removeHandler(ch)

logger.info("call_peaks.py was called using these parameters:")
logger.info(vars(args))
logger.info("call_peaks.py was called using these parameters: " + str(vars(args)))
#logger.info(vars(args))

bed_dictionary = make_bed_dictionary(args.bed)
all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage)
Expand Down

0 comments on commit 492d7f6

Please sign in to comment.