diff --git a/visualize.py b/visualize.py new file mode 100644 index 0000000..59f9ad9 --- /dev/null +++ b/visualize.py @@ -0,0 +1,179 @@ + +""" +This visualises the information found by find_exons.py +@author: Anastasiia Petrova +@contact: anastasiia.petrova(at)mpi-bn.mpg.de +""" + +import argparse +import sys +import os +import re +import time +import multiprocessing +import logging +import subprocess +from Bio import SeqIO +import Bio.SeqIO.FastaIO as bio +import numpy as np +from collections import defaultdict +from scipy import stats +import pyBigWig +from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni +import matplotlib.pyplot as plt +import random +import textwrap +import seaborn as sns + +#from gtfparse import read_gtf #makes the logger to look different o_o + +logger = logging.getLogger('visualise') +logger.setLevel(logging.INFO) + +formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M") + +def parse_args(): + + parser = argparse.ArgumentParser(prog = '', description = textwrap.dedent(''' + + This script produces plots and a tsv-file to show the number of exons for the list of genes of interest. + '''), epilog='That is what you need to make this script work for you. Enjoy it') + + required_arguments = parser.add_argument_group('required arguments') + required_arguments.add_argument('--correlation', help='a .txt file with correlation between genes length and the length of exons', required=True) + + #all other arguments are optional + #parser.add_argument('--genes_file', help='names of genes of interest') + + args = parser.parse_args() + + return args + +def check_existing_input_files(args): + + print('checking the input file') + + if not os.path.isfile(args.correlation): + print('please make sure the .txt file with genes-exons correlation exists') + sys.exit() + +def make_box_plot(group1, group2, gene_names, scores, output_directory, figure_name, color_name): + #first make two groups + group_one = [] + group_two = [] + + for name in group1: + group_one.append(scores[gene_names.index(name)]) + + for name in group2: + group_two.append(scores[gene_names.index(name)]) + + plt.figure() + plt.boxplot([group_one, group_two]) + + plt.show() + + + +def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name): + + the_max = max(max(x_array), max(y_array)) + + plt.xlim(0, the_max) + plt.ylim(0, the_max) + + fig, ax = plt.subplots() + plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5) + plt.plot(x_array, y_array, 'o', color = color_name) + + plt.xlabel("whole gene length") + plt.ylabel("sum length of exons") + + for i,j,name in zip(x_array, y_array, gene_names): + ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small') + + fig.savefig(os.path.join(output_directory, figure_name)) + + #plt.show() + +def make_reg_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name): + + the_max = max(max(x_array), max(y_array)) + + #plt.xlim(0, the_max) + #plt.ylim(0, the_max) + + #fig, ax = plt.subplots() + #plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5) + #plt.plot(x_array, y_array, 'o', color = color_name) + sns.set() + sns.regplot(np.array(x_array), np.array(y_array)) + sns.plt.show() + + #plt.xlabel("whole gene length") + #plt.ylabel("sum length of exons") + + #for i,j,name in zip(x_array, y_array, gene_names): + # ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small') + + #fig.savefig(os.path.join(output_directory, figure_name)) + + #plt.show() + +def read_correlation(correlation_file): + gene_names = [] + scores = [] + gene_lengths = [] + exons_lengths = [] + with open(correlation_file) as read_cor_file: + for cor_line in read_cor_file: + if not cor_line.startswith('#'): + cor_line_array = re.split(r'\t', cor_line.rstrip('\n')) + gene_names.append(cor_line_array[0]) + scores.append(float(cor_line_array[1])) + gene_lengths.append(int(cor_line_array[2])) + exons_lengths.append(int(cor_line_array[3])) + + read_cor_file.close() + + return gene_names, scores, gene_lengths, exons_lengths + +def main(): + + start = time.time() + + args = parse_args() + + check_existing_input_files(args) + + #check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user + #check_directory(args.output_directory) + + fh = logging.FileHandler("visualize_log.txt") + fh.setLevel(logging.INFO) + fh.setFormatter(formatter) + logger.addHandler(fh) + + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + ch.setFormatter(formatter) + logger.addHandler(ch) + + gene_names, scores, gene_lengths, exons_lengths = read_correlation(args.correlation) + + logger.info("making a plot") + #make_plot(gene_lengths, exons_lengths, gene_names, "./output", "test1.png", 'gold') + + group1 = ["OTX1", "OTX2", "PITX1", "GSC2", "NFYA", "NFYB", "DUX4"] #important genes in early development + group2 = ["MYCL", "LTBP3", "ERGIC3", "GCM2", "LHX8", "HIF1A", "FLI1"] #not so significant for early dev + + make_box_plot(group1, group2, gene_names, scores, "./output", "test2.png", 'gold') + + logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2))) + + for handler in logger.handlers: + handler.close() + logger.removeFilter(handler) + +if __name__ == "__main__": + main() \ No newline at end of file