#!/usr/bin/env python

"""
The footprints_extraction script uses the non-continuous score from a bigWig file to estimate footprints within peaks of interest.

@author: Anastasiia Petrova
@contact: anastasiia.petrova(at)mpi-bn.mpg.de
"""

import argparse #for parsing the parameters
import sys #for example to exit if a problem occurred
import os #for example to check the existing path of a file
import re #for example to split a string
import time #to calculate the time needed to process the data
import logging #to write information about the program run
import numpy as np #to calculate the mean for example
from collections import defaultdict #to create nested dictionaries
import pyBigWig #to work with bigWig files
import textwrap #to print the help message nicely

#the logger should be callable from all functions, so create it here
logger = logging.getLogger('footprints_extraction')
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M")
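
#with this formatter a log line looks roughly like "2018-07-30 14:05 : Reading of the bed file..."
#(the timestamp here is of course only an example)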

#the function parse_args() reads the input from the user, including required and optional parameters
#if the user defines no values for the required parameters, an error is printed and the script exits
#if the user defines no values for the optional parameters, the default values are used
#this function returns a structure called args containing the values for all parameters
def parse_args():

    parser = argparse.ArgumentParser(prog = '', description = textwrap.dedent('''
        This script produces a file with footprints in .bed format from the file with scores in .bigWig format and a corresponding .bed file with peaks of interest.
        '''), epilog='That is what you need to make this script work for you. Enjoy it')

    required_arguments = parser.add_argument_group('required arguments')

    required_arguments.add_argument('--bigwig', help='Please provide a path to the bigWig-file with scores.', required=True)
    required_arguments.add_argument('--bed', help='Please provide a path to the file with peaks in .bed format.', required=True)

    #all other arguments are optional
    #parser.add_argument('--output_directory', default='output', const='output', nargs='?', help='The output directory, by default ./output/')
    parser.add_argument('--output_file', default='footprints_extraction.bed', type=str, help='Please enter a name for the output file, by default ./footprints_extraction.bed.')
    parser.add_argument('--window_length', default=200, type=int, help='Please enter the length for a window, by default 200 bp.')
    parser.add_argument('--step', default=100, type=int, help='Please enter a step to move the window, by default 100 bp.')
    parser.add_argument('--percentage', default=0, type=int, help='Please enter a percentage to be added to the background while searching for footprints, by default 0%%.')
    parser.add_argument('--min_gap', default=6, type=int, help='Please enter the number of bp allowed to be in between two footprints, by default 6 bp.')
    parser.add_argument('--silent', action='store_true', help='While working with data write the information only into ./footprints_extraction.log.')

    args = parser.parse_args()

    return args
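
#the lines below sketch how the script might be called from the command line; the file names are
#placeholders, not files shipped with this repository:
#  python footprints_extraction.py --bigwig footprint_scores.bigWig --bed peaks_of_interest.bed \
#      --output_file my_footprints.bed --window_length 200 --step 100 --percentage 10 --min_gap 6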

#this function checks if the directory for the output files exists and, if not, creates it
#the input for this function is the directory to check
def check_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
        logger.info('The new directory ' + directory + ' was created!')

#check if the file to remove exists and if so, delete it
#if the file does not exist, nothing happens
def remove_file(file):
    if os.path.isfile(file):
        os.remove(file)

#check if the input files exist; we will check for the right format of the input files later on
#the input parameter args is a structure from parse_args, containing the input parameters
#if the input files don't exist, a short message is printed and the exit is forced
def check_existing_input_files(args):
    #check if the bigWig file exists
    if not os.path.isfile(args.bigwig):
        print('Error: there is no such bigWig file ' + args.bigwig + ', the exit is forced!')
        sys.exit()
    #check if the bed file exists
    if not os.path.isfile(args.bed):
        print('Error: there is no such bed file ' + args.bed + ', the exit is forced!')
        sys.exit()

#make a dictionary out of the .bed file for easy access to the information from the .bed file
#the input for this function is a path to the bed file
#the output is a dictionary containing keys and values from the given bed file
#a key looks like chr1:123-234, the value contains all the additional information from the bed file
def make_bed_dictionary(bed_file):

    logger.info('Reading of the bed file...')

    try: #if this file can not be processed like this, the input was not a .bed file!
        bed_dictionary = {}

        with open(bed_file) as read_bed_file:
            for bed_line in read_bed_file:
                bed_line_array = re.split(r'\t', bed_line.rstrip('\n'))
                if bed_line_array[1].isdigit() and bed_line_array[2].isdigit() and int(bed_line_array[1]) <= int(bed_line_array[2]): #in a real bed file the second column is the start position and the third column is the end position, so check that these are integers and that the start position is not bigger than the end one
                    key = bed_line_array[0] + ":" + bed_line_array[1] + "-" + bed_line_array[2]
                    value = []
                    for i in range(3, len(bed_line_array)):
                        value.append(bed_line_array[i])

                    if not value: #if there is no bonus information in the original bed file, add a "." to the column in the output bed file
                        value = ["."]

                    bed_dictionary[key] = value
                else: #this is not a bed file, force exit
                    logger.info('Error: please make sure the input bed file has the right format, the problem occurred on the line ' + bed_line)
                    sys.exit()

        read_bed_file.close()

        return bed_dictionary

    except UnicodeDecodeError: #force exit, if the input is not a .bed file
        logger.info('Error: please make sure that the .bed file has the right format! The exit is forced!')
        sys.exit()
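
#illustrative example (the coordinates and the peak name are made up):
#  a bed line "chr1\t100\t250\tpeak_1" becomes the entry {"chr1:100-250": ["peak_1"]},
#  and a minimal line "chr1\t100\t250" becomes {"chr1:100-250": ["."]}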

#to save a footprint, find the score and max_pos for this footprint and check for overlaps with other footprints within the current peak
#the input for this function is: the count for footprints to ensure a unique name for each saved footprint;
#an array containing the scores (signals) from the bigwig file; the footprints already saved for the particular peak; the information for the bed file:
#chromosom, start and end positions, as well as the additional information from the original bed file
#the function returns the count for footprints, as well as the footprints for the current peak
def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom, footprint_start, footprint_end, bonus_info_from_bed):

    save_current_footprint = False

    #make sure we are working only with regions longer than 2 bp, otherwise there is not enough information to find a footprint
    if len(footprint_scores) > 2:
        #calculate the position with the max score. if there are several positions with the same score, save the one in the middle
        first_max_pos = footprint_scores.index(max(footprint_scores))
        #assume that there is only one position with the max score
        last_max_pos = first_max_pos

        #find the end of the region (plateau) where the score stays at the maximum
        for j in range(first_max_pos, len(footprint_scores)):
            if footprint_scores[j] == footprint_scores[first_max_pos]:
                last_max_pos = j
            else:
                break

        if first_max_pos != last_max_pos:
            #take a position in the middle of the plateau
            max_pos = first_max_pos + int((last_max_pos - first_max_pos) / 2)
        else:
            max_pos = first_max_pos

        max_pos = max_pos + 1 #as the index of an array starts with 0

        #calculate the score for the current footprint as the mean of all scores from the bigwig file
        footprint_score = np.mean(footprint_scores)

        #check for existing and overlapping footprints
        if len(peak_footprints.keys()) == 0: # hey, this is the first footprint!
            save_current_footprint = True
        else: # there are already footprints in this peak, so look for overlaps
            for existing_footprint_name in peak_footprints.keys():
                old_start = peak_footprints[existing_footprint_name]['start']
                old_end = peak_footprints[existing_footprint_name]['end']
                old_score = peak_footprints[existing_footprint_name]['score']
                old_length = peak_footprints[existing_footprint_name]['len']

                if footprint_start >= old_start and footprint_start <= old_end: #the start of the new footprint is between the start and end of an old footprint
                    if footprint_end > old_end: #the new footprint is not completely inside the old one
                        #update the information about the existing footprint
                        #find the average of both scores
                        footprint_score = (peak_footprints[existing_footprint_name]['score'] + footprint_score) / 2

                        peak_footprints[existing_footprint_name]['end'] = footprint_end
                        peak_footprints[existing_footprint_name]['score'] = footprint_score
                        peak_footprints[existing_footprint_name]['len'] = footprint_end - old_start
                        #we can not update the max_pos as we do not have the scores array of the existing footprint
                    #else: the new footprint is completely inside the old one, do nothing
                    save_current_footprint = False
                    break

                elif footprint_end >= old_start and footprint_end <= old_end: #the end of the new footprint is between the start and end of an old footprint
                    if footprint_start < old_start: #the new footprint is not completely inside the old one
                        #update the information about the existing footprint
                        #find the average of both scores
                        footprint_score = (peak_footprints[existing_footprint_name]['score'] + footprint_score) / 2

                        peak_footprints[existing_footprint_name]['start'] = footprint_start
                        peak_footprints[existing_footprint_name]['score'] = footprint_score
                        peak_footprints[existing_footprint_name]['len'] = old_end - footprint_start
                    #else do nothing
                    save_current_footprint = False
                    break

                elif footprint_start <= old_start and footprint_end >= old_end: #the old footprint lies completely between the start and end positions of the new footprint
                    #update the information about the existing footprint
                    peak_footprints[existing_footprint_name] = {'chromosom': chromosom, 'start': footprint_start, 'end': footprint_end, 'score': footprint_score, 'len': len(footprint_scores), 'bonus': bonus_info_from_bed, 'max_pos': max_pos}
                    save_current_footprint = False
                    break

                else: #so this is a new footprint that has no overlaps with the others
                    save_current_footprint = True

        if save_current_footprint == True:
            #make sure to save this footprint
            footprint_name = "f_" + str(footprint_count)

            peak_footprints[footprint_name] = peak_footprints.get(footprint_name, {})
            peak_footprints[footprint_name] = {'chromosom': chromosom, 'start': footprint_start, 'end': footprint_end, 'score': footprint_score, 'len': len(footprint_scores), 'bonus': bonus_info_from_bed, 'max_pos': max_pos}

            footprint_count = footprint_count + 1

    #else do nothing

    return footprint_count, peak_footprints
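
#illustrative behaviour (the numbers are made up): for footprint_scores = [1.0, 3.0, 3.0, 1.0] the footprint
#score is the mean 2.0 and max_pos marks a position within the stretch of maximal signal; if the region of a
#later call overlaps an already saved footprint, that footprint is extended and its score averaged instead of
#creating a new entry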

#this function uses a sliding window algorithm to estimate regions where the signal is higher than the background and which can possibly be footprints
#as an input this function takes: the footprints already saved for the current peak; the count for footprints to ensure a unique name;
#the information for the bed file (chromosom, start and end positions of the peak); an array with scores within the peak; the length of the window;
#as well as the dictionary entry with the information from the original bed file and the optional parameters such as the step to slide the window and the percentage to change the background
#this function returns the found footprints within a peak as well as the footprint count, so that the footprints in the next peak also get unique names
def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary_entry, step, percentage):

    #find the length of the current peak
    peak_len = len(scores_in_peak)

    parts = [] #an array to save the scores of the parts
    parts_positions = [] #an array to save where each part begins in the original peak

    #if necessary divide the peak with help of a sliding window
    if peak_len <= window_length:
        window_length = peak_len
        parts.append(scores_in_peak)
        parts_positions.append(0)
    else:
        pos = 0
        while pos < (peak_len - step):
            if (pos + window_length) >= len(scores_in_peak):
                part = scores_in_peak[pos:]
            else:
                part = scores_in_peak[pos:pos + window_length]
            if len(part) != 1: #otherwise it makes no sense to look at the mean within this part and search for footprints
                parts.append(part)
                parts_positions.append(pos)
            pos = pos + step

    #look in each window and try to save the footprints
    for j in range(len(parts)):

        window = parts[j]

        #add some percent to the background to avoid the noise we don't want to have
        bw_peak_background = np.mean(window) #find the mean of all scores within one window
        part = (percentage * bw_peak_background) / 100 #x percent of the background
        bw_peak_background = bw_peak_background + part

        check_position = parts_positions[j] #the start position not within the window, but within the peak!!!
        footprint_start = check_position #for each footprint
        footprint_scores = [] #for each footprint

        #look at each position inside the window
        for i in range(len(window)):
            position = i + 1 #calculate the relative position for a score
            score = window[i] #extract one score from the list

            if score >= bw_peak_background:
                if position != (check_position + 1): #if this position is not directly adjacent to the last position we checked
                    #save the last footprint
                    if check_position != 0: #if this is not the start of the first footprint within this peak
                        footprint_count, peak_footprints = save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom, footprint_start + peak_start + parts_positions[j], check_position + peak_start + parts_positions[j], bed_dictionary_entry)
                    #start a new footprint
                    footprint_start = position
                    footprint_scores = []
                    check_position = position

                footprint_scores.append(score) #save the current score
                check_position = position

        footprint_count, peak_footprints = save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom, footprint_start + peak_start + parts_positions[j], check_position + peak_start + parts_positions[j], bed_dictionary_entry) #save the last footprint

    return peak_footprints, footprint_count
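
#illustrative example (the values are made up): for a peak of 500 bp, window_length=200 and step=100 the peak
#is split into windows starting at positions 0, 100, 200 and 300; with percentage=10 a window whose mean score
#is 2.0 gets a background threshold of 2.2, and only stretches of positions scoring >= 2.2 are passed on to
#save_footprint()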

#this function checks if there are footprints with a small number of bp in between, and if so, merges them together
#if at this point there are still overlapping footprints, this function will delete them
#the input parameters are: the dictionary with footprints within one peak, and the max number of bp allowed to be in between two footprints
#the output is the renewed dictionary containing only the best footprints for the output file
def check_and_merge(peak_footprints, min_gap):
    #to ensure the merging works well, sort the footprints first by start and end positions
    #the sort can not be applied to a dictionary, so make a list out of peak_footprints
    peak_footprints_list = sorted(peak_footprints.items(), key = lambda x : (x[1]['start'], x[1]['end']), reverse = False)

    peak_footprints_new = {}
    merged_footprints = {}

    #we need to check each footprint within this peak against the other footprints for possible merging
    #for footprint_to_check in peak_footprints.keys():
    for footprint in peak_footprints_list: #work with sorted footprints
        footprint_to_check = footprint[0] #save the name of the footprint we are working with now
        start_to_check = peak_footprints[footprint_to_check]['start']
        end_to_check = peak_footprints[footprint_to_check]['end']

        merge_footprints_left = None

        for compared_footprint in peak_footprints.keys():
            if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] < min_gap:
                #make compared_footprint longer: compared_footprint + footprint_to_check
                merge_footprints_left = False
                break
            elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check < min_gap:
                #make footprint_to_check longer: footprint_to_check + compared_footprint
                merge_footprints_left = True
                break

        if merge_footprints_left: #the left merging is enabled, start and end of compared_footprint should be smaller than the start of footprint_to_check
            if start_to_check < peak_footprints[compared_footprint]['start']:
                if footprint_to_check not in peak_footprints_new.keys():
                    if any(footprint_to_check in merged_footprints[x] for x in merged_footprints.keys()): #true if footprint_to_check was already merged with another footprint
                        for k, v in merged_footprints.items():
                            if footprint_to_check in v:
                                main_footprint = k
                        #make the merging using the information from merged_footprints and peak_footprints_new
                        #UPDATE
                        peak_footprints_new[main_footprint] = footprint_update(peak_footprints_new[main_footprint], peak_footprints[compared_footprint]['start'], peak_footprints[main_footprint]['end'], peak_footprints[compared_footprint]['score'])
                        merged_array = merged_footprints[main_footprint]
                        merged_array.append(compared_footprint)
                        merged_footprints[main_footprint] = merged_array
                    #there are no merged footprints with footprint_to_check yet, so make a new one
                    else:
                        #add compared_footprint and footprint_to_check to merged_footprints
                        merged_footprints[footprint_to_check] = merged_footprints.get(footprint_to_check, [])
                        merged_footprints[footprint_to_check] = [compared_footprint]
                        peak_footprints_new[footprint_to_check] = peak_footprints.get(footprint_to_check, {})
                        peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check]
                        #UPDATE
                        peak_footprints_new[footprint_to_check] = footprint_update(peak_footprints_new[footprint_to_check], peak_footprints_new[footprint_to_check]['start'], peak_footprints[compared_footprint]['end'], peak_footprints[compared_footprint]['score'])
                else: #footprint_to_check is in peak_footprints_new already
                    #footprint_to_check can only have been the main part of a merge before, check it
                    if footprint_to_check in merged_footprints.keys():
                        #footprint_to_check was already the main footprint of a merge
                        #UPDATE
                        peak_footprints_new[footprint_to_check] = footprint_update(peak_footprints_new[footprint_to_check], peak_footprints_new[footprint_to_check]['start'], peak_footprints[compared_footprint]['end'], peak_footprints[compared_footprint]['score'])
                        #add it to merged_footprints as well
                        merged_array = merged_footprints[footprint_to_check]
                        merged_array.append(compared_footprint)
                        merged_footprints[footprint_to_check] = merged_array
                    else:
                        #footprint_to_check was not merged with anything yet
                        merged_footprints[footprint_to_check] = merged_footprints.get(footprint_to_check, [])
                        merged_footprints[footprint_to_check] = [compared_footprint]
                        #UPDATE
                        peak_footprints_new[footprint_to_check] = footprint_update(peak_footprints_new[footprint_to_check], peak_footprints_new[footprint_to_check]['start'], peak_footprints[compared_footprint]['end'], peak_footprints[compared_footprint]['score'])

        #the right merging is enabled, start and end of compared_footprint should be bigger than the start of footprint_to_check
        elif merge_footprints_left == False:
            if end_to_check > peak_footprints[compared_footprint]['end']:
                if compared_footprint not in peak_footprints_new.keys():
                    if any(compared_footprint in merged_footprints[x] for x in merged_footprints.keys()):
                        for k, v in merged_footprints.items():
                            if compared_footprint in v:
                                main_footprint = k
                        #make the merging using the information from merged_footprints and peak_footprints_new
                        #UPDATE
                        peak_footprints_new[main_footprint] = footprint_update(peak_footprints_new[main_footprint], peak_footprints[main_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score'])
                        #add to merged_footprints
                        merged_array = merged_footprints[main_footprint]
                        merged_array.append(footprint_to_check)
                        merged_footprints[main_footprint] = merged_array
                    else:
                        #make the normal update, using the data from peak_footprints
                        merged_footprints[compared_footprint] = merged_footprints.get(compared_footprint, [])
                        merged_footprints[compared_footprint] = [footprint_to_check]
                        peak_footprints_new[compared_footprint] = peak_footprints.get(compared_footprint, {})
                        peak_footprints_new[compared_footprint] = peak_footprints[compared_footprint]
                        #UPDATE
                        peak_footprints_new[compared_footprint] = footprint_update(peak_footprints_new[compared_footprint], peak_footprints_new[compared_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score'])
                else:
                    if compared_footprint in merged_footprints.keys():
                        #compared_footprint was already the main footprint of a merge
                        #UPDATE
                        peak_footprints_new[compared_footprint] = footprint_update(peak_footprints_new[compared_footprint], peak_footprints_new[compared_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score'])
                        merged_array = merged_footprints[compared_footprint]
                        merged_array.append(footprint_to_check)
                        merged_footprints[compared_footprint] = merged_array
                    else:
                        #merge now and add compared_footprint to merged_footprints
                        merged_footprints[compared_footprint] = merged_footprints.get(compared_footprint, [])
                        merged_footprints[compared_footprint] = [compared_footprint]
                        #UPDATE
                        peak_footprints_new[compared_footprint] = footprint_update(peak_footprints_new[compared_footprint], peak_footprints_new[compared_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score'])

        else: #save the current footprint, as it should not be merged
            peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, [])
            peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check]

    return peak_footprints_new
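
#illustrative example (the coordinates are made up): with min_gap = 6, two footprints chr1:100-150 and
#chr1:154-200 are only 4 bp apart and are therefore merged into one footprint chr1:100-200 whose score is the
#mean of both original scores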

#this function is used to update a footprint that should be merged with another one
#as input the footprint needing the update, as well as the new start, the new end and the score of the merged footprint are passed
#the output of this function is a dictionary containing the new information about the footprint
def footprint_update(footprint, start, end, score):

    new_len = end - start
    new_score = (footprint['score'] + score) / 2

    footprint['start'] = start
    footprint['end'] = end
    footprint['score'] = new_score
    footprint['len'] = new_len

    return footprint
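
#illustrative example (the numbers are made up): footprint_update({'start': 100, 'end': 150, 'score': 2.0, 'len': 50}, 100, 200, 4.0)
#returns the same dictionary with 'end' = 200, 'len' = 100 and 'score' = 3.0 (the mean of 2.0 and 4.0)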

#this function uses the information provided by the .bed file to look for footprints within the peaks of interest
#as input the information from the original bed file, as well as the bigwig file, is needed
#the optional parameters window_length, step and percentage are needed as well to use the sliding window algorithm and to work with the "background" score
#the output of this function is a sorted list of all the found footprints, ready to be written to the output file
def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage, min_gap):

    logger.info('Looking for footprints within peaks...')

    try: #this try considers opening the bigwig file
        footprint_count = 1
        all_footprints = {}

        bw_open = pyBigWig.open(bw_file)

        for header in bed_dictionary:

            peak_footprints = {}

            header_splitted = re.split(r':', header)
            chromosom = header_splitted[0]
            positions = re.split(r'-', header_splitted[-1])
            peak_start = int(positions[0])
            peak_end = int(positions[1])

            scores_in_peak = np.nan_to_num(np.array(list(bw_open.values(chromosom, peak_start, peak_end)))) #save the scores to an array

            peak_footprints, footprint_count = search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary[header], step, percentage)

            #double check for overlaps and possibly merge footprints that have less than min_gap bp in between
            peak_footprints = check_and_merge(peak_footprints, min_gap)

            for footprint_name in peak_footprints.keys():
                all_footprints[footprint_name] = all_footprints.get(footprint_name, {})
                all_footprints[footprint_name] = peak_footprints[footprint_name]

        all_footprints = sorted(all_footprints.items(), key = lambda x : (x[1]['chromosom'], x[1]['start']), reverse = False)

        return all_footprints

    except RuntimeError: #if the file can not be opened like this, it was not a bigwig file!
        logger.info('Error: please make sure that the .bigWig file has the right format! The exit is forced!')
        sys.exit()
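
#illustrative example (the coordinates are made up): for a bed entry "chr1:100-600" the call
#  bw_open.values("chr1", 100, 600)
#from pyBigWig returns one score per base pair; NaN values are replaced by zeros before the sliding window search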

#this function writes the found footprints to the .bed file
#as input the list with all footprints and the name for the output file are needed
#the function first writes an unsorted file, then sorts it and removes the unsorted one
def write_to_bed_file(all_footprints, sorted_output_file_name):

    #extract the name of the directory where we want to save the file
    output_directory = os.path.dirname(sorted_output_file_name)

    #check if there is a directory provided
    if output_directory:
        check_directory(output_directory)

    #make sure the file is in .bed format
    output_name = "not_sorted_" + os.path.splitext(os.path.basename(sorted_output_file_name))[0] + ".bed"

    #save the output file in the working directory or in the directory provided by the user
    output_file_name = os.path.join(output_directory, output_name)

    #a header to know what is in the columns
    header = ["#chr", "start", "end", "name", "score", "strand", "len", "max_pos", "bonus_info"]

    #open a file to write
    output_file = open(output_file_name, 'w')

    logger.info("Printing to the output file...")

    #write the header
    output_file.write('\t'.join(header) + '\n')

    #write each footprint line by line to the output file
    for footprint in all_footprints:
        #validate the footprints; if there is a problem with one of them, report which one it is
        #first check the start and end positions
        if footprint[1]['start'] >= footprint[1]['end']:
            logger.info("A problem occurred with the start and end positions. This footprint will not be printed to the output file:")
            logger.info(footprint)
        #then check the max_pos
        elif footprint[1]['max_pos'] == 0:
            logger.info("A problem occurred with the max_pos of the footprint. This footprint will not be printed to the output file:")
            logger.info(footprint)
        #otherwise everything is fine, write to the output
        else:
            output_file.write('\t'.join([footprint[1]['chromosom'], str(footprint[1]['start']), str(footprint[1]['end']), footprint[0], str(round(footprint[1]['score'], 6)), '.', str(footprint[1]['len']), str(footprint[1]['max_pos']), ';'.join(footprint[1]['bonus'])]) + '\n')

    output_file.close()

    #sort the bed file
    logger.info('Sorting the output file...')

    #keep the single header line in place and sort the remaining lines by chromosome and start/end position
    os.system("(head -n 1 " + output_file_name + " && tail -n +2 " + output_file_name + " | sort -k1,1V -k2,2n -k3,3n) > " + sorted_output_file_name)

    logger.info('Removing the non-sorted file...')
    remove_file(output_file_name)
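
#illustrative output line (tab-separated, the values are made up; the columns follow the header above):
#  chr1	104	132	f_1	2.634	.	28	15	peak_1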

def main():

    start = time.time()

    args = parse_args()

    check_existing_input_files(args)
    #check if there is an existing directory that the user gave as input, otherwise create this directory from the path provided by the user
    #check_directory(args.output_directory)

    #fh = logging.FileHandler(os.path.join(args.output_directory, "footprints_extraction.log"))
    fh = logging.FileHandler("footprints_extraction.log")
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    #if the user does not want to see the information about the status of jobs, remove the handler that writes to the terminal
    if args.silent:
        logger.removeHandler(ch)

    logger.info("The script footprints_extraction.py was called using these parameters: " + str(vars(args)))

    bed_dictionary = make_bed_dictionary(args.bed)
    all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage, args.min_gap)
    write_to_bed_file(all_footprints, args.output_file)

    logger.info("the number of peaks: " + str(len(bed_dictionary)))
    logger.info("the number of footprints: " + str(len(all_footprints)))

    logger.info("It took footprints_extraction.py %s minutes to generate the output." % (round((time.time() - start)/60, 2)))

    #close the handlers and detach them from the logger; iterate over a copy so removing does not skip any handler
    for handler in list(logger.handlers):
        handler.close()
        logger.removeHandler(handler)

if __name__ == "__main__":
    main()