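"""
Visualises the correlation between gene length and exon length (read from the
file passed via --correlation) for groups of genes of interest.

@author: Anastasiia Petrova
@contact: anastasiia.petrova(at)
"""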
import argparse
import sys
import os
import re
import time
import multiprocessing
import logging
import subprocess
import numpy as np
from collections import defaultdict
from scipy import stats
#from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni
import matplotlib.pyplot as plt
import random
import textwrap
#from gtfparse import read_gtf #makes the logger to look different o_o
# Logger registered under the name 'visualize'.
logger = logging.getLogger(name='visualize')

# Log line layout: timestamp with minute resolution, then the message.
formatter = logging.Formatter(
    fmt="%(asctime)s : %(message)s",
    datefmt="%Y-%m-%d %H:%M",
)
def parse_args():
    """Define the command-line interface and parse the process arguments.

    Returns:
        argparse.Namespace with the attributes ``correlation`` (required),
        ``variable``, ``groups``, ``rna_seq``, ``bindetect``, ``group_size``
        and ``output``.
    """
    description = textwrap.dedent('''
This script produces plots to show the number of exons for the list of genes of interest.
''')
    cli = argparse.ArgumentParser(
        prog='',
        description=description,
        epilog='That is what you need to make this script work for you. Enjoy it',
    )

    # The only mandatory option lives in its own help section.
    mandatory = cli.add_argument_group('required arguments')
    mandatory.add_argument(
        '--correlation',
        required=True,
        help='a .txt file with correlation between genes length and the length of exons',
    )

    # Every remaining option is optional.
    cli.add_argument(
        '--variable',
        type=int,
        default=1,
        help='enter how many chars in the name of a gene can be different from the name of gene you are searching for',
    )
    cli.add_argument(
        '--groups',
        nargs='*',
        dest='groups',
        help='enter one or more files containing groups of genes you are interested in, the genes should be separated with tabs or printed in one column',
    )
    cli.add_argument('--rna_seq', help='enter the file with rna seq data')
    cli.add_argument('--bindetect', help='enter the file with results from TOBIAS')
    cli.add_argument(
        '--group_size',
        type=int,
        default=10,
        help='if the file from TOBIAS was provided, enter the size of groups to create, by default 10',
    )
    cli.add_argument(
        '--output',
        default='./boxplot',
        help='enter the name of the output file, by default ./boxplot',
    )

    return cli.parse_args()
def find_directory(args):
    """Return the directory part of the requested output path.

    Args:
        args: parsed command-line namespace; only ``args.output`` is read.

    Returns:
        The directory component of ``args.output`` as produced by
        ``os.path.dirname`` (an empty string when the path has no directory
        component).
    """
    # The directory name is returned untouched: running it through
    # os.path.splitext would cut a directory such as "results.v2" down to
    # "results" and send the output to the wrong place.
    return os.path.dirname(args.output)
def get_name_from_path(full_path):
    """Return the file name of *full_path* without directory and last extension."""
    file_name = os.path.basename(full_path)
    stem, _extension = os.path.splitext(file_name)
    return stem
def _exit_with_hint(message):
    """Print *message* for the user and stop the script with exit status 1."""
    print(message)
    sys.exit(1)


def check_existing_input_files(args):
    """Validate the input files and load the gene groups.

    Args:
        args: parsed command-line namespace. Reads ``correlation``,
            ``rna_seq``, ``bindetect`` and ``groups``.

    Returns:
        A list with one entry per file in ``args.groups``; each entry is the
        list of gene names read from that file (tab-separated and/or one
        name per line).

    Raises:
        SystemExit: after printing a hint, when a required file is missing,
            no group files were given, or a gene name contains a space.
    """
    # NOTE(review): the reviewed text printed the hints below but neither
    # stopped afterwards nor stored the genes it read; both were restored
    # as the evident intent - confirm the exit status (1) is acceptable.
    print('checking the input file')

    if not os.path.isfile(args.correlation):
        _exit_with_hint('please make sure the .txt file with genes-exons correlation exists')

    if args.rna_seq and not os.path.isfile(args.rna_seq):
        _exit_with_hint('please make sure the file with RNA-seq data exists')

    if args.bindetect and not os.path.isfile(args.bindetect):
        _exit_with_hint('please make sure the file with information from TOBIAS exists')

    # args.groups is None when --groups was not given and [] when it was
    # given without values; neither can be iterated into useful groups.
    if not args.groups:
        _exit_with_hint('please provide your own groups for visualisation or the file from TOBIAS to make the groups')

    groups = []
    for group_file in args.groups:
        if not os.path.isfile(group_file):
            _exit_with_hint('the path for the input file ' + str(group_file) + ' could not be opened')

        one_group = []
        with open(group_file) as opened_group_file:
            for line in opened_group_file:
                for gene in line.rstrip('\r\n').split('\t'):
                    if ' ' in gene:
                        _exit_with_hint('please make sure that genes in the file ' + str(group_file) + ' are separated with tabs or are written each on a new line')
                    # Blank lines and doubled tabs produce empty names; an
                    # empty name would later match every gene, so skip it.
                    if gene:
                        one_group.append(gene)
        groups.append(one_group)

    return groups
def compare_with_each(groups, groups_names):
    """Run a t-test for every unordered pair of groups.

    Args:
        groups: list of value sequences, one per group.
        groups_names: list of names, parallel to ``groups``.

    Returns:
        None. Each pair is handed to ``make_ttest``; with fewer than two
        groups only a message is logged.
    """
    if len(groups) < 2:
        # NOTE(review): the logging call was missing around this message in
        # the reviewed text; logger.info is assumed - confirm the level.
        logger.info("There is nothing to compare!")
        return

    # Pair each group only with the groups after it, so every pair is tested
    # once. Positions come from enumerate: groups.index(group) would return
    # the first group with equal values and could pick the wrong name.
    for first, group in enumerate(groups):
        for second in range(first + 1, len(groups)):
            make_ttest(group, groups[second], groups_names[first], groups_names[second])
def make_ttest(group1, group2, group1_name, group2_name):
    """Compare the means of two groups with an independent t-test and log it.

    Args:
        group1: values of the first group.
        group2: values of the second group.
        group1_name: label of the first group used in the log message.
        group2_name: label of the second group used in the log message.

    Returns:
        None. The verdict (threshold: p-value <= 0.05) and the p-value are
        written to the module logger.
    """
    # Only the p-value is used; the test statistic is discarded. The call
    # uses scipy's defaults; a variant with equal_var=False had been left
    # commented out next to it.
    _, p_value = stats.ttest_ind(group1, group2)

    # NOTE(review): the logging calls around these messages were missing in
    # the reviewed text; logger.info is assumed - confirm the level.
    if p_value <= 0.05:
        logger.info("The average of " + str(group1_name) + " is NOT EQUAL with " + str(group2_name))
    else:
        logger.info("The average of " + str(group1_name) + " is EQUAL with " + str(group2_name))
    logger.info("Ttest p-value is " + str(p_value))
# Draw a box plot with one box per entry collected in `data` and save it to
# os.path.join(output_directory, figure_name).
#   groups_files_names:       iterable with one entry per group (iterated below).
#   groups_genes_of_interest: iterable of dicts; each dict's values are passed to
#                             np.mean when its keys are sorted.
#   output_directory, figure_name: joined into the path given to fig.savefig.
# NOTE(review): several statements of this function appear to be missing in the
# reviewed text (empty loop bodies, truncated logging calls); they are marked
# below and must be restored before this function can run.
def make_box_plot(groups_files_names, groups_genes_of_interest, output_directory, figure_name):
groups_names = []
# NOTE(review): this loop has no body here. groups_names is used further down as
# the t-test labels and the x tick labels, so the statement filling it (one
# name per group file) seems to be missing - TODO confirm.
for group_file in groups_files_names:
data = []
for genes_of_interest in groups_genes_of_interest:
group_data = []
#first sort the genes_of_interest dictionary
# (the keys are ordered by the mean of their value arrays)
sorted_genes_of_interest = sorted(genes_of_interest, key = lambda x: np.mean(genes_of_interest[x]))
# NOTE(review): this loop has no body here either; nothing is ever added to
# group_data in the visible text. Whether per-gene means or the raw values
# were collected cannot be told from here - TODO confirm.
for gene in sorted_genes_of_interest:
# NOTE(review): the next line fuses data.append(group_data) with the remainder
# of a call that emitted "Making a statistical test" (presumably a logger call).
data.append(group_data)"Making a statistical test")
# pairwise t-tests between all collected groups
compare_with_each(data, groups_names)
fig, ax = plt.subplots(figsize=(10,10))
# patch_artist=True makes the boxes patch objects, so they can be styled via bp['boxes']
bp = ax.boxplot(data, notch=False, sym='o', vert=True, whis=1.5, patch_artist=True)
color_name = 'navy'
# NOTE(review): empty loop body; color_name is otherwise unused, so the box
# colouring statement seems to be missing - TODO confirm.
for patch in bp['boxes']:
# Add a horizontal grid to the plot, but make it very light in color
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
ax.set_title('The visualization of correlation between gene and exons length')
ax.set_xlabel('Genes groups')
ax.set_ylabel('Percentage of exons len')
ax.set_xticklabels(groups_names, rotation=45, fontsize=8)
# NOTE(review): the next line fuses fig.savefig(...) with the remainder of a call
# that reported the target path (presumably a logger call).
fig.savefig(os.path.join(output_directory, figure_name))"saving the plot to the file " + str(os.path.join(output_directory, figure_name)))
def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):
    """Scatter summed exon length against whole gene length and save the figure.

    Args:
        x_array: x values (labelled "whole gene length").
        y_array: y values (labelled "sum length of exons").
        gene_names: labels, parallel to ``x_array`` / ``y_array``, annotated
            next to each point.
        output_directory: directory the figure is written to.
        figure_name: file name of the figure inside ``output_directory``.
        color_name: matplotlib colour used for the points.

    Returns:
        None. The figure is written to disk and then closed.

    Raises:
        ValueError: from ``max`` when ``x_array`` or ``y_array`` is empty.
    """
    the_max = max(max(x_array), max(y_array))

    # Create the figure first and address its axes explicitly. Setting the
    # limits through pyplot before plt.subplots() acted on a different
    # (implicitly created) figure, so the saved plot never received them.
    fig, ax = plt.subplots()
    ax.set_xlim(0, the_max)
    ax.set_ylim(0, the_max)

    # Dashed diagonal as the reference line where both lengths are equal.
    ax.plot([0, the_max], [0, the_max], '--', color="black", linewidth=0.5)
    ax.plot(x_array, y_array, 'o', color=color_name)

    ax.set_xlabel("whole gene length")
    ax.set_ylabel("sum length of exons")

    for x_value, y_value, name in zip(x_array, y_array, gene_names):
        ax.annotate('%s' % name, xy=(x_value, y_value), xytext=(1, 0), textcoords='offset points', size='xx-small')

    fig.savefig(os.path.join(output_directory, figure_name))
    # Release the figure; pyplot keeps figures alive until they are closed.
    plt.close(fig)
# Collect, for every group of gene names, the scores found in correlation_file.
#   correlation_file: path of the tab-separated text file searched with grep.
#   groups:           iterable of iterables of gene names.
#   variable:         how many characters longer than the searched name a found
#                     gene name may be to still count as a match.
#   rna_dict:         mapping whose keys are compared with column [7] of each hit.
# Returns genes_to_show_groups.
# NOTE(review): several statements appear to be missing in the reviewed text
# (truncated logging calls, an empty branch, the append to the result list);
# they are marked below and must be restored before this function can run.
def read_correlation(correlation_file, groups, variable, rna_dict):
#read the text file and make a dictionary containing genes with array of their scores
genes_to_show_groups = []
for genes_of_interest in groups:
genes_to_show = {}
for gene_name in genes_of_interest:
orig_gene_name = gene_name.lower()
genes_to_show[orig_gene_name] = genes_to_show.get(orig_gene_name, []) #get a place in the dictionary for this gene
#first look for the genes in the gtf_genes file
# NOTE(review): the command is built by string concatenation and run through the
# shell, so a gene name or path containing spaces or shell metacharacters breaks
# (or alters) the command; grep also treats gene_name as a pattern, not a literal.
genes_family = subprocess.getoutput("grep -i " + gene_name + " " + correlation_file)
# NOTE(review): the call that emitted this message is truncated on the next line.
if len(genes_family) != 0:"working with the gene " + gene_name)
#the score is in [3], the tsl is in [2], the ensembl_gene_id is in [7]
for genes_line in re.split(r'\n', genes_family):
genes_array = re.split(r'\t', genes_line)
# NOTE(review): a hit with fewer than 8 tab-separated columns raises IndexError
# here; with an empty rna_dict this test is never true, so no score is collected.
if genes_array[7] in rna_dict.keys():
#if this gene id is saved in the rnaseq data
check_gene_name = genes_array[0].lower()
if check_gene_name == orig_gene_name:
#this is a perfect match <3
genes_to_show[check_gene_name].append(float(genes_array[3])) #append score to the array
elif check_gene_name.startswith(orig_gene_name) and len(genes_array[0]) <= len(gene_name) + variable: #the found gene is by default 1 char longer than the gene name
# NOTE(review): the body of this branch is missing in the reviewed text, and
# no score is appended for the prefix match in the else branch below - TODO confirm.
if check_gene_name in genes_to_show.keys():
else: #this gene is not yet saved
#make a new place in the dictionary for this gene
genes_to_show[check_gene_name] = genes_to_show.get(check_gene_name, [])
# NOTE(review): the start of this message (and the call emitting it) is truncated.
else: + " has no corresponding information from the RNA-seq data and will not be considered while making a plot")
# NOTE(review): the call that emitted this message is truncated on the next line.
else:"no similar genes to " + gene_name + " were found")
#delete genes without information from the dictionary
genes_to_show = dict([(k, v) for k, v in genes_to_show.items() if len(v) > 0])
# NOTE(review): genes_to_show is never added to genes_to_show_groups in the visible
# text, so the list returned below is always empty - TODO confirm.
return genes_to_show_groups
def read_bindetect(output_directory, bindetect, group_size):
    """Write a sorted copy of the bindetect table next to the other outputs.

    The table is sorted with the Unix ``sort -k5nr`` (numeric, descending,
    key starting at field 5 - the column mDuxNeg_mDuxPos_change) and stored
    as ``sorted_<name>.txt`` in *output_directory*, where ``<name>`` is the
    bindetect file name without its extension.

    Args:
        output_directory: directory that receives the sorted file.
        bindetect: path of the bindetect result file.
        group_size: currently unused by this function.

    Returns:
        None.
    """
    # NOTE(review): the logging calls around the messages in this function
    # were missing in the reviewed text; logger.info is assumed - confirm.
    logger.info("sorting the bindetect file after the column mDuxNeg_mDuxPos_change")

    sorted_bindetect = os.path.join(output_directory, "sorted_" + os.path.splitext(os.path.basename(bindetect))[0] + ".txt")

    # Run sort with an argument list and redirect its output from Python:
    # no shell is involved, so paths containing spaces or shell
    # metacharacters are passed through unchanged.
    with open(sorted_bindetect, "w") as sorted_file:
        finished = subprocess.run(["sort", "-k5nr", bindetect], stdout=sorted_file)

    if finished.returncode != 0:
        # The exit status used to be ignored; report it instead of hiding it.
        logger.warning("sort exited with status " + str(finished.returncode) + " for the file " + str(bindetect))

    logger.info("sorted file is " + str(sorted_bindetect))
def read_rna_seq(rna_seq):
    """Read the RNA-seq table into a dictionary.

    Args:
        rna_seq: path of a whitespace-separated text file whose first column
            holds the gene id (ensid) and whose second column holds the
            base mean.

    Returns:
        dict mapping the first column to the second column of every line;
        both are kept as strings. Later lines overwrite earlier ones with
        the same id.
    """
    rna_dict = {}
    with open(rna_seq) as read_file:
        for line in read_file:
            # str.split() treats any run of whitespace as one separator, so
            # doubled spaces or a trailing "\r" do not create empty fields.
            fields = line.split()
            if len(fields) < 2:
                # Blank or one-column lines carry no value to store.
                continue
            # ensid is in the first column, the basemean is in the second
            rna_dict[fields[0]] = fields[1]
    return rna_dict
def main():
    """Run the script: parse arguments, read the inputs, draw the box plot.

    The run is logged to ``visualize.log`` in the output directory and to the
    console stream. The handlers are detached again when the run ends, also
    when it ends with an error.
    """
    start = time.time()

    args = parse_args()
    groups = check_existing_input_files(args)
    output_directory = find_directory(args)

    # Check if there is an existing directory that the user gave as input,
    # otherwise create it. An empty name means the current directory.
    # NOTE(review): only the comment announcing this step was present in the
    # reviewed text; the creation itself is restored here - confirm.
    if output_directory and not os.path.isdir(output_directory):
        os.makedirs(output_directory)

    fh = logging.FileHandler(os.path.join(output_directory, "visualize.log"))
    ch = logging.StreamHandler()
    # NOTE(review): the reviewed text created both handlers but never
    # configured or attached them; format and level (INFO) are assumed.
    for handler in (fh, ch):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    try:
        rna_dict = {}
        if args.rna_seq:
            rna_dict = read_rna_seq(args.rna_seq)

        # Building groups from the TOBIAS file is switched off for now:
        # read_bindetect(output_directory, args.bindetect, args.group_size)
        genes_to_show_groups = read_correlation(args.correlation, groups, args.variable, rna_dict)

        logger.info("making a plot")
        make_box_plot(args.groups, genes_to_show_groups, output_directory, get_name_from_path(args.output))

        # NOTE(review): the subject of this message was cut off in the
        # reviewed text; the script's logger name is used - confirm.
        logger.info("visualize needed %s seconds to generate the output" % (round(time.time() - start, 2)))
    finally:
        # Iterate over a copy: removing handlers while iterating over
        # logger.handlers itself would skip every second handler.
        for handler in list(logger.handlers):
            handler.close()
            logger.removeHandler(handler)
if __name__ == "__main__":