-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
adding a small script to visualize the results of the find_exons.py
- Loading branch information
1 parent
a0e3033
commit 405e31b
Showing
1 changed file
with
179 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,179 @@ | ||
|
||
""" | ||
This visualises the information found by find_exons.py | ||
@author: Anastasiia Petrova | ||
@contact: anastasiia.petrova(at)mpi-bn.mpg.de | ||
""" | ||
|
||
import argparse | ||
import sys | ||
import os | ||
import re | ||
import time | ||
import multiprocessing | ||
import logging | ||
import subprocess | ||
from Bio import SeqIO | ||
import Bio.SeqIO.FastaIO as bio | ||
import numpy as np | ||
from collections import defaultdict | ||
from scipy import stats | ||
import pyBigWig | ||
from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni | ||
import matplotlib.pyplot as plt | ||
import random | ||
import textwrap | ||
import seaborn as sns | ||
|
||
#from gtfparse import read_gtf #makes the logger to look different o_o | ||
|
||
logger = logging.getLogger('visualise') | ||
logger.setLevel(logging.INFO) | ||
|
||
formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M") | ||
|
||
def parse_args(): | ||
|
||
parser = argparse.ArgumentParser(prog = '', description = textwrap.dedent(''' | ||
This script produces plots and a tsv-file to show the number of exons for the list of genes of interest. | ||
'''), epilog='That is what you need to make this script work for you. Enjoy it') | ||
|
||
required_arguments = parser.add_argument_group('required arguments') | ||
required_arguments.add_argument('--correlation', help='a .txt file with correlation between genes length and the length of exons', required=True) | ||
|
||
#all other arguments are optional | ||
#parser.add_argument('--genes_file', help='names of genes of interest') | ||
|
||
args = parser.parse_args() | ||
|
||
return args | ||
|
||
def check_existing_input_files(args): | ||
|
||
print('checking the input file') | ||
|
||
if not os.path.isfile(args.correlation): | ||
print('please make sure the .txt file with genes-exons correlation exists') | ||
sys.exit() | ||
|
||
def make_box_plot(group1, group2, gene_names, scores, output_directory, figure_name, color_name): | ||
#first make two groups | ||
group_one = [] | ||
group_two = [] | ||
|
||
for name in group1: | ||
group_one.append(scores[gene_names.index(name)]) | ||
|
||
for name in group2: | ||
group_two.append(scores[gene_names.index(name)]) | ||
|
||
plt.figure() | ||
plt.boxplot([group_one, group_two]) | ||
|
||
plt.show() | ||
|
||
|
||
|
||
def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name): | ||
|
||
the_max = max(max(x_array), max(y_array)) | ||
|
||
plt.xlim(0, the_max) | ||
plt.ylim(0, the_max) | ||
|
||
fig, ax = plt.subplots() | ||
plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5) | ||
plt.plot(x_array, y_array, 'o', color = color_name) | ||
|
||
plt.xlabel("whole gene length") | ||
plt.ylabel("sum length of exons") | ||
|
||
for i,j,name in zip(x_array, y_array, gene_names): | ||
ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small') | ||
|
||
fig.savefig(os.path.join(output_directory, figure_name)) | ||
|
||
#plt.show() | ||
|
||
def make_reg_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name): | ||
|
||
the_max = max(max(x_array), max(y_array)) | ||
|
||
#plt.xlim(0, the_max) | ||
#plt.ylim(0, the_max) | ||
|
||
#fig, ax = plt.subplots() | ||
#plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5) | ||
#plt.plot(x_array, y_array, 'o', color = color_name) | ||
sns.set() | ||
sns.regplot(np.array(x_array), np.array(y_array)) | ||
sns.plt.show() | ||
|
||
#plt.xlabel("whole gene length") | ||
#plt.ylabel("sum length of exons") | ||
|
||
#for i,j,name in zip(x_array, y_array, gene_names): | ||
# ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small') | ||
|
||
#fig.savefig(os.path.join(output_directory, figure_name)) | ||
|
||
#plt.show() | ||
|
||
def read_correlation(correlation_file): | ||
gene_names = [] | ||
scores = [] | ||
gene_lengths = [] | ||
exons_lengths = [] | ||
with open(correlation_file) as read_cor_file: | ||
for cor_line in read_cor_file: | ||
if not cor_line.startswith('#'): | ||
cor_line_array = re.split(r'\t', cor_line.rstrip('\n')) | ||
gene_names.append(cor_line_array[0]) | ||
scores.append(float(cor_line_array[1])) | ||
gene_lengths.append(int(cor_line_array[2])) | ||
exons_lengths.append(int(cor_line_array[3])) | ||
|
||
read_cor_file.close() | ||
|
||
return gene_names, scores, gene_lengths, exons_lengths | ||
|
||
def main(): | ||
|
||
start = time.time() | ||
|
||
args = parse_args() | ||
|
||
check_existing_input_files(args) | ||
|
||
#check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user | ||
#check_directory(args.output_directory) | ||
|
||
fh = logging.FileHandler("visualize_log.txt") | ||
fh.setLevel(logging.INFO) | ||
fh.setFormatter(formatter) | ||
logger.addHandler(fh) | ||
|
||
ch = logging.StreamHandler() | ||
ch.setLevel(logging.INFO) | ||
ch.setFormatter(formatter) | ||
logger.addHandler(ch) | ||
|
||
gene_names, scores, gene_lengths, exons_lengths = read_correlation(args.correlation) | ||
|
||
logger.info("making a plot") | ||
#make_plot(gene_lengths, exons_lengths, gene_names, "./output", "test1.png", 'gold') | ||
|
||
group1 = ["OTX1", "OTX2", "PITX1", "GSC2", "NFYA", "NFYB", "DUX4"] #important genes in early development | ||
group2 = ["MYCL", "LTBP3", "ERGIC3", "GCM2", "LHX8", "HIF1A", "FLI1"] #not so significant for early dev | ||
|
||
make_box_plot(group1, group2, gene_names, scores, "./output", "test2.png", 'gold') | ||
|
||
logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2))) | ||
|
||
for handler in logger.handlers: | ||
handler.close() | ||
logger.removeFilter(handler) | ||
|
||
if __name__ == "__main__": | ||
main() |