Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
"""
This visualises the information found by find_exons.py
@author: Anastasiia Petrova
@contact: anastasiia.petrova(at)mpi-bn.mpg.de
"""
import argparse
import sys
import os
import re
import time
import multiprocessing
import logging
import subprocess
import numpy as np
from collections import defaultdict
from scipy import stats
#from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni
import matplotlib.pyplot as plt
import random
import textwrap
#from gtfparse import read_gtf #makes the logger to look different o_o
logger = logging.getLogger('visualize')
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M")
def parse_args():
parser = argparse.ArgumentParser(prog = '', description = textwrap.dedent('''
This script produces plots to show the number of exons for the list of genes of interest.
'''), epilog='That is what you need to make this script work for you. Enjoy it')
required_arguments = parser.add_argument_group('required arguments')
required_arguments.add_argument('--correlation', help='a .txt file with correlation between genes length and the length of exons', required=True)
#all other arguments are optional
parser.add_argument('--variable', type=int, default=1, help='enter how many chars in the name of a gene can be different from the name of gene you are searching for')
parser.add_argument('--groups', nargs='*', dest='groups', help='enter one or more files containing groups of genes you are interested in, the genes should be separated with tabs or printed in one column')
parser.add_argument('--rna_seq', help='enter the file with rna seq data')
parser.add_argument('--bindetect', help='enter the file with results from TOBIAS')
parser.add_argument('--group_size', type=int, default=10, help='if the file from TOBIAS was provided, enter the size of groups to create, by default 10')
parser.add_argument('--output', help='enter the name of the output file, by default ./boxplot', default='./boxplot')
args = parser.parse_args()
return args
def find_directory(args):
return os.path.splitext(os.path.dirname(args.output))[0]
def get_name_from_path(full_path):
return os.path.splitext(os.path.basename(full_path))[0]
def check_existing_input_files(args):
print('checking the input file')
if not os.path.isfile(args.correlation):
print('please make sure the .txt file with genes-exons correlation exists')
sys.exit()
if args.rna_seq and not os.path.isfile(args.rna_seq):
print('please make sure the file with RNA-seq data exists')
sys.exit()
groups = []
if args.bindetect:
if not os.path.isfile(args.bindetect):
print('please make sure the file with information from TOBIAS exists')
sys.exit()
else:
if not args.groups:
print('please provide your own groups for visualisation or the file from TOBIAS to make the groups')
sys.exit()
else:
for group_file in args.groups:
if not os.path.isfile(group_file):
print('the path for the input file ' + str(group_file) + ' could not be opened')
sys.exit()
else:
#group_name = "group_" + str(args.groups.index(group_file) + 1)
one_group = []
with open(group_file) as opened_group_file:
for line in opened_group_file:
line_array = re.split(r'\t', line.rstrip('\n'))
for gene in line_array:
if ' ' in gene:
print('please make sure that genes in the file ' + str(group_file) + ' are separated with tabs or are written each on a new line')
sys.exit()
else:
one_group.append(gene)
opened_group_file.close()
groups.append(one_group)
return groups
def compare_with_each(groups, groups_names):
if len(groups) >= 2:
start = 1
for group in groups:
for i in range(start, len(groups)):
make_ttest(group, groups[i], groups_names[groups.index(group)], groups_names[i])
start += 1
else:
logger.info("There is nothing to compare!")
def make_ttest(group1, group2, group1_name, group2_name):
stat_score, p_value = stats.ttest_ind(group1, group2) #, equal_var = False)
if p_value <= 0.05:
logger.info("The average of " + str(group1_name) + " is NOT EQUAL with " + str(group2_name))
logger.info("Ttest p-value is " + str(p_value))
else:
logger.info("The average of " + str(group1_name) + " is EQUAL with " + str(group2_name))
logger.info("Ttest p-value is " + str(p_value))
def make_box_plot(groups_files_names, groups_genes_of_interest, output_directory, figure_name):
groups_names = []
for group_file in groups_files_names:
groups_names.append(get_name_from_path(group_file))
data = []
for genes_of_interest in groups_genes_of_interest:
group_data = []
#first sort the genes_of_interest dictionary
sorted_genes_of_interest = sorted(genes_of_interest, key = lambda x: np.mean(genes_of_interest[x]))
for gene in sorted_genes_of_interest:
group_data.extend(genes_of_interest[gene])
data.append(group_data)
logger.info("Making a statistical test")
compare_with_each(data, groups_names)
fig, ax = plt.subplots(figsize=(10,10))
bp = ax.boxplot(data, notch=False, sym='o', vert=True, whis=1.5, patch_artist=True)
color_name = 'navy'
for patch in bp['boxes']:
patch.set_facecolor(color_name)
# Add a horizontal grid to the plot, but make it very light in color
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
ax.set_title('The visualization of correlation between gene and exons length')
ax.set_xlabel('Genes groups')
ax.set_ylabel('Percentage of exons len')
ax.set_xticklabels(groups_names, rotation=45, fontsize=8)
fig.savefig(os.path.join(output_directory, figure_name))
logger.info("saving the plot to the file " + str(os.path.join(output_directory, figure_name)))
plt.show()
def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):
the_max = max(max(x_array), max(y_array))
plt.xlim(0, the_max)
plt.ylim(0, the_max)
fig, ax = plt.subplots()
plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5)
plt.plot(x_array, y_array, 'o', color = color_name)
plt.xlabel("whole gene length")
plt.ylabel("sum length of exons")
for i,j,name in zip(x_array, y_array, gene_names):
ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small')
fig.savefig(os.path.join(output_directory, figure_name))
#plt.show()
def read_correlation(correlation_file, groups, variable, rna_dict):
#read the text file and make a dictionary containing genes with array of their scores
genes_to_show_groups = []
for genes_of_interest in groups:
genes_to_show = {}
for gene_name in genes_of_interest:
orig_gene_name = gene_name.lower()
genes_to_show[orig_gene_name] = genes_to_show.get(orig_gene_name, []) #get a place in the dictionary for this gene
#first look for the genes in the gtf_genes file
genes_family = subprocess.getoutput("grep -i " + gene_name + " " + correlation_file)
if len(genes_family) != 0:
logger.info("working with the gene " + gene_name)
#the score is in [3], the tsl is in [2], the ensemle_gene_id is in [7]
for genes_line in re.split(r'\n', genes_family):
genes_array = re.split(r'\t', genes_line)
if genes_array[7] in rna_dict.keys():
#if this geneis is saved in the rnaseq data
check_gene_name = genes_array[0].lower()
if check_gene_name == orig_gene_name:
#this is a perfect match <3
genes_to_show[check_gene_name].append(float(genes_array[3])) #append score to the array
elif check_gene_name.startswith(orig_gene_name) and len(genes_array[0]) <= len(gene_name) + variable: #the found gene is by default 1 char longer than the gene name
if check_gene_name in genes_to_show.keys():
genes_to_show[check_gene_name].append(float(genes_array[3]))
else: #this gene is not yet saved
#make a new place in the dictionary for this gene
genes_to_show[check_gene_name] = genes_to_show.get(check_gene_name, [])
genes_to_show[check_gene_name].append(float(genes_array[3]))
else:
logger.info(gene_name + " has no corresponding information from the RNA-seq data and will not be considered while making a plot")
else:
logger.info("no similar genes to " + gene_name + " were found")
#delete genes without information from the dictionary
genes_to_show = dict([(k, v) for k, v in genes_to_show.items() if len(v) > 0])
genes_to_show_groups.append(genes_to_show)
return genes_to_show_groups
def read_bindetect(output_directory, bindetect, group_size):
#first sort the file after the column mDuxNeg_mDuxPos_change
#use the unix sort sort -k5nr
logger.info("sorting the bindetect file after the column mDuxNeg_mDuxPos_change")
sorted_bindetect = os.path.join(output_directory, "sorted_" + os.path.splitext(os.path.basename(bindetect))[0] + ".txt")
os.system("sort -k5nr " + bindetect + " > " + sorted_bindetect)
logger.info("sorted file is " + str(sorted_bindetect))
def read_rna_seq(rna_seq):
rna_dict = {}
with open(rna_seq) as read_file:
for line in read_file:
line_array = re.split(r'\s', line.rstrip('\n'))
#ensid is in the first column, the basemean is in the second
rna_dict[line_array[0]] = line_array[1]
read_file.close()
return rna_dict
def main():
start = time.time()
args = parse_args()
groups = check_existing_input_files(args)
output_directory = find_directory(args)
#check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user
#check_directory(args.output_directory)
fh = logging.FileHandler(os.path.join(output_directory, "visualize.log"))
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)
rna_dict = {}
if args.rna_seq:
rna_dict = read_rna_seq(args.rna_seq)
#read_bindetect(output_directory, args.bindetect, args.group_size)
genes_to_show_groups = read_correlation(args.correlation, groups, args.variable, rna_dict)
logger.info("making a plot")
make_box_plot(args.groups, genes_to_show_groups, output_directory, get_name_from_path(args.output))
logger.info("visualize.py needed %s seconds to generate the output" % (round(time.time() - start, 2)))
for handler in logger.handlers:
handler.close()
logger.removeFilter(handler)
if __name__ == "__main__":
main()