diff --git a/find_exons.py b/find_exons.py new file mode 100644 index 0000000..285690e --- /dev/null +++ b/find_exons.py @@ -0,0 +1,118 @@ + +""" +call_peaks uses the uncontinuous score from a bigWig file to estimate peaks +@author: Anastasiia Petrova +@contact: anastasiia.petrova(at)mpi-bn.mpg.de +""" + +import argparse +import sys +import os +import re +import time +import multiprocessing +import logging +import subprocess +from Bio import SeqIO +import Bio.SeqIO.FastaIO as bio +import numpy as np +from collections import defaultdict +from scipy import stats +import pyBigWig +from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni +import matplotlib.pyplot as plt +import random +import textwrap + +logger = logging.getLogger('find_exons') +logger.setLevel(logging.INFO) + +formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M") + +def parse_args(): + + parser = argparse.ArgumentParser(prog = '', description = textwrap.dedent(''' + + This script produces plots and a tsv-file to show the number of exons for the list of genes of interest. + '''), epilog='That is what you need to make this script work for you. Enjoy it') + + required_arguments = parser.add_argument_group('required arguments') + required_arguments.add_argument('--gtf_genes', help='a .gtf file with genes', required=True) + required_arguments.add_argument('--gtf_exons', help='a .gtf file with exons', required=True) + required_arguments.add_argument('--gene_list', help='names of genes of interest', required=True) + + #all other arguments are optional + parser.add_argument('--output_directory', default='output', const='output', nargs='?', help='output directory, by default ./output/') + parser.add_argument('--silent', action='store_true', help='while working with data write the information only into ./call_peaks_log.txt') + args = parser.parse_args() + + return args + +def check_directory(directory): + if not os.path.exists(directory): + os.makedirs(directory) + #logger.info('a new directory ' + directory + ' was created') + print('a new directory ' + directory + ' was created') + +def remove_file(file): + if os.path.isfile(file): + os.remove(file) + +def make_name_from_path(full_path, output_directory, ending): + return os.path.join(output_directory, get_name_from_path(full_path) + ending) + +def get_name_from_path(full_path): + return os.path.splitext(os.path.basename(full_path))[0] + +def check_existing_input_files(args): + + if not os.path.isfile(args.gtf_genes): + #logger.info('please make sure the both files with conditions to compare exist') + print('please make sure the .gtf file with genes exists') + sys.exit() + + if not os.path.isfile(args.gtf_exons): + #logger.info('please make sure the both files with conditions to compare exist') + print('please make sure the .gtf file with exons exists') + sys.exit() + + if not os.path.isfile(args.gene_list): + print('please make sure the file with genes of interest exists') + sys.exit() + +def main(): + + start = time.time() + + args = parse_args() + + check_existing_input_files(args) + + #check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user + check_directory(args.output_directory) + + fh = logging.FileHandler(os.path.join(args.output_directory, "find_exons_log.txt")) + fh.setLevel(logging.INFO) + fh.setFormatter(formatter) + logger.addHandler(fh) + + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + ch.setFormatter(formatter) + logger.addHandler(ch) + + #if user do not want to see the information about the status of jobs, remove the handler, that writes to the terminal + if args.silent: + logger.removeHandler(ch) + + logger.info("find_exons.py was called using these parameters:") + logger.info(vars(args)) + + #logger.info("find_exons needed %s seconds to generate the output" % (time.time() - start)) + + for handler in logger.handlers: + handler.close() + logger.removeFilter(handler) + +if __name__ == "__main__": + main() \ No newline at end of file