Skip to content

Commit

Permalink
adding a small script to visualize the results of the find_exons.py
Browse files Browse the repository at this point in the history
  • Loading branch information
anastasiia committed Nov 26, 2018
1 parent a0e3033 commit 405e31b
Showing 1 changed file with 179 additions and 0 deletions.
179 changes: 179 additions & 0 deletions visualize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@

"""
This visualises the information found by find_exons.py
@author: Anastasiia Petrova
@contact: anastasiia.petrova(at)mpi-bn.mpg.de
"""

import argparse
import sys
import os
import re
import time
import multiprocessing
import logging
import subprocess
from Bio import SeqIO
import Bio.SeqIO.FastaIO as bio
import numpy as np
from collections import defaultdict
from scipy import stats
import pyBigWig
from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni
import matplotlib.pyplot as plt
import random
import textwrap
import seaborn as sns

#from gtfparse import read_gtf #makes the logger to look different o_o

logger = logging.getLogger('visualise')
logger.setLevel(logging.INFO)

formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M")

def parse_args():

parser = argparse.ArgumentParser(prog = '', description = textwrap.dedent('''
This script produces plots and a tsv-file to show the number of exons for the list of genes of interest.
'''), epilog='That is what you need to make this script work for you. Enjoy it')

required_arguments = parser.add_argument_group('required arguments')
required_arguments.add_argument('--correlation', help='a .txt file with correlation between genes length and the length of exons', required=True)

#all other arguments are optional
#parser.add_argument('--genes_file', help='names of genes of interest')

args = parser.parse_args()

return args

def check_existing_input_files(args):

print('checking the input file')

if not os.path.isfile(args.correlation):
print('please make sure the .txt file with genes-exons correlation exists')
sys.exit()

def make_box_plot(group1, group2, gene_names, scores, output_directory, figure_name, color_name):
#first make two groups
group_one = []
group_two = []

for name in group1:
group_one.append(scores[gene_names.index(name)])

for name in group2:
group_two.append(scores[gene_names.index(name)])

plt.figure()
plt.boxplot([group_one, group_two])

plt.show()



def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):

the_max = max(max(x_array), max(y_array))

plt.xlim(0, the_max)
plt.ylim(0, the_max)

fig, ax = plt.subplots()
plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5)
plt.plot(x_array, y_array, 'o', color = color_name)

plt.xlabel("whole gene length")
plt.ylabel("sum length of exons")

for i,j,name in zip(x_array, y_array, gene_names):
ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small')

fig.savefig(os.path.join(output_directory, figure_name))

#plt.show()

def make_reg_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):

the_max = max(max(x_array), max(y_array))

#plt.xlim(0, the_max)
#plt.ylim(0, the_max)

#fig, ax = plt.subplots()
#plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5)
#plt.plot(x_array, y_array, 'o', color = color_name)
sns.set()
sns.regplot(np.array(x_array), np.array(y_array))
sns.plt.show()

#plt.xlabel("whole gene length")
#plt.ylabel("sum length of exons")

#for i,j,name in zip(x_array, y_array, gene_names):
# ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small')

#fig.savefig(os.path.join(output_directory, figure_name))

#plt.show()

def read_correlation(correlation_file):
gene_names = []
scores = []
gene_lengths = []
exons_lengths = []
with open(correlation_file) as read_cor_file:
for cor_line in read_cor_file:
if not cor_line.startswith('#'):
cor_line_array = re.split(r'\t', cor_line.rstrip('\n'))
gene_names.append(cor_line_array[0])
scores.append(float(cor_line_array[1]))
gene_lengths.append(int(cor_line_array[2]))
exons_lengths.append(int(cor_line_array[3]))

read_cor_file.close()

return gene_names, scores, gene_lengths, exons_lengths

def main():

start = time.time()

args = parse_args()

check_existing_input_files(args)

#check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user
#check_directory(args.output_directory)

fh = logging.FileHandler("visualize_log.txt")
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)

ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)

gene_names, scores, gene_lengths, exons_lengths = read_correlation(args.correlation)

logger.info("making a plot")
#make_plot(gene_lengths, exons_lengths, gene_names, "./output", "test1.png", 'gold')

group1 = ["OTX1", "OTX2", "PITX1", "GSC2", "NFYA", "NFYB", "DUX4"] #important genes in early development
group2 = ["MYCL", "LTBP3", "ERGIC3", "GCM2", "LHX8", "HIF1A", "FLI1"] #not so significant for early dev

make_box_plot(group1, group2, gene_names, scores, "./output", "test2.png", 'gold')

logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2)))

for handler in logger.handlers:
handler.close()
logger.removeFilter(handler)

if __name__ == "__main__":
main()

0 comments on commit 405e31b

Please sign in to comment.