visualize.py


"""
This visualises the information found by find_exons.py
@author: Anastasiia Petrova
@contact: anastasiia.petrova(at)mpi-bn.mpg.de
"""

import argparse
import sys
import os
import re
import time
import multiprocessing
import logging
import subprocess
import numpy as np
from collections import defaultdict
from scipy import stats
#from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni
import matplotlib.pyplot as plt
import random
import textwrap

#from gtfparse import read_gtf #makes the logger to look different o_o

logger = logging.getLogger('visualize')
logger.setLevel(logging.INFO)

formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M")

def parse_args():

	parser = argparse.ArgumentParser(prog = '', description = textwrap.dedent('''

		This script produces plots to show the number of exons for the list of genes of interest.
		'''), epilog='That is what you need to make this script work for you. Enjoy it')

	required_arguments = parser.add_argument_group('required arguments')
	required_arguments.add_argument('--correlation', help='a .txt file with correlation between genes length and the length of exons', required=True)
	#all other arguments are optional
	parser.add_argument('--variable', type=int, default=1, help='enter how many chars in the name of a gene can be different from the name of gene you are searching for')
	parser.add_argument('--groups', nargs='*', dest='groups', help='enter one or more files containing groups of genes you are interested in, the genes should be separated with tabs or printed in one column')
	parser.add_argument('--rna_seq', help='enter the file with rna seq data')
	parser.add_argument('--bindetect', help='enter the file with results from TOBIAS')
	parser.add_argument('--group_size', type=int, default=10, help='if the file from TOBIAS was provided, enter the size of groups to create, by default 10')
	parser.add_argument('--output', help='enter the name of the output file, by default ./boxplot', default='./boxplot')

	args = parser.parse_args()

	return args

def find_directory(args):
	return os.path.splitext(os.path.dirname(args.output))[0]

def get_name_from_path(full_path):
	return os.path.splitext(os.path.basename(full_path))[0]

def check_existing_input_files(args):

	print('checking the input file')

	if not os.path.isfile(args.correlation):
		print('please make sure the .txt file with genes-exons correlation exists')
		sys.exit()

	if args.rna_seq and not os.path.isfile(args.rna_seq):
		print('please make sure the file with RNA-seq data exists')
		sys.exit()

	groups = []
	if args.bindetect:
		if not os.path.isfile(args.bindetect):
			print('please make sure the file with information from TOBIAS exists')
			sys.exit()
	else:
		if not args.groups:
			print('please provide your own groups for visualisation or the file from TOBIAS to make the groups')
			sys.exit()
		else:
			for group_file in args.groups:
				if not os.path.isfile(group_file):
					print('the path for the input file ' + str(group_file) + ' could not be opened')
					sys.exit()
				else:
					#group_name = "group_" + str(args.groups.index(group_file) + 1)
					one_group = []
					with open(group_file) as opened_group_file:
						for line in opened_group_file:
							line_array = re.split(r'\t', line.rstrip('\n'))
							for gene in line_array:
								if ' ' in gene:
									print('please make sure that genes in the file ' + str(group_file) + ' are separated with tabs or are written each on a new line')
									sys.exit()
								else:
									one_group.append(gene)
					opened_group_file.close()
					groups.append(one_group)

	return groups

def compare_with_each(groups, groups_names):
	if len(groups) >= 2:
		start = 1
		for group in groups:
			for i in range(start, len(groups)):
				make_ttest(group, groups[i], groups_names[groups.index(group)], groups_names[i])
			start += 1
	else:
		logger.info("There is nothing to compare!")

def make_ttest(group1, group2, group1_name, group2_name):
	stat_score, p_value = stats.ttest_ind(group1, group2) #, equal_var = False)
	if p_value <= 0.05:
		logger.info("The average of " + str(group1_name) + " is NOT EQUAL with " + str(group2_name))
		logger.info("Ttest p-value is " + str(p_value))
	else:
		logger.info("The average of " + str(group1_name) + " is EQUAL with " + str(group2_name))
		logger.info("Ttest p-value is " + str(p_value))

def make_box_plot(groups_files_names, groups_genes_of_interest, output_directory, figure_name):

	groups_names = []

	for group_file in groups_files_names:
		groups_names.append(get_name_from_path(group_file))

	data = []

	for genes_of_interest in groups_genes_of_interest:

		group_data = []

		#first sort the genes_of_interest dictionary
		sorted_genes_of_interest = sorted(genes_of_interest, key = lambda x: np.mean(genes_of_interest[x]))

		for gene in sorted_genes_of_interest:
			group_data.extend(genes_of_interest[gene])

		data.append(group_data)

	logger.info("Making a statistical test")

	compare_with_each(data, groups_names)

	fig, ax = plt.subplots(figsize=(10,10))

	bp = ax.boxplot(data, notch=False, sym='o', vert=True, whis=1.5, patch_artist=True)

	color_name = 'navy'
	for patch in bp['boxes']:
		patch.set_facecolor(color_name)

	# Add a horizontal grid to the plot, but make it very light in color
	ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)

	ax.set_title('The visualization of correlation between gene and exons length')
	ax.set_xlabel('Genes groups')
	ax.set_ylabel('Percentage of exons len')

	ax.set_xticklabels(groups_names, rotation=45, fontsize=8)

	fig.savefig(os.path.join(output_directory, figure_name))
	logger.info("saving the plot to the file " + str(os.path.join(output_directory, figure_name)))

	plt.show()

def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):

	the_max = max(max(x_array), max(y_array))

	plt.xlim(0, the_max)
	plt.ylim(0, the_max)

	fig, ax = plt.subplots()
	plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5)
	plt.plot(x_array, y_array, 'o', color = color_name)

	plt.xlabel("whole gene length")
	plt.ylabel("sum length of exons")

	for i,j,name in zip(x_array, y_array, gene_names):
		ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small')

	fig.savefig(os.path.join(output_directory, figure_name))

	#plt.show()

def read_correlation(correlation_file, groups, variable, rna_dict):

	#read the text file and make a dictionary containing genes with array of their scores
	genes_to_show_groups = []

	for genes_of_interest in groups:
		genes_to_show = {}

		for gene_name in genes_of_interest:

			orig_gene_name = gene_name.lower()

			genes_to_show[orig_gene_name] = genes_to_show.get(orig_gene_name, []) #get a place in the dictionary for this gene

			#first look for the genes in the gtf_genes file
			genes_family = subprocess.getoutput("grep -i " + gene_name + " " + correlation_file)

			if len(genes_family) != 0:
				logger.info("working with the gene " + gene_name)

				#the score is in [3], the tsl is in [2], the ensemle_gene_id is in [7]

				for genes_line in re.split(r'\n', genes_family):
					genes_array = re.split(r'\t', genes_line)

					if genes_array[7] in rna_dict.keys():
					#if this geneis is saved in the rnaseq data
						check_gene_name = genes_array[0].lower()

						if check_gene_name == orig_gene_name:
							#this is a perfect match <3
							genes_to_show[check_gene_name].append(float(genes_array[3])) #append score to the array
						elif check_gene_name.startswith(orig_gene_name) and len(genes_array[0]) <= len(gene_name) + variable: #the found gene is by default 1 char longer than the gene name
							if check_gene_name in genes_to_show.keys():
								genes_to_show[check_gene_name].append(float(genes_array[3]))
							else: #this gene is not yet saved
								#make a new place in the dictionary for this gene
								genes_to_show[check_gene_name] = genes_to_show.get(check_gene_name, [])
								genes_to_show[check_gene_name].append(float(genes_array[3]))
					else:
						logger.info(gene_name + " has no corresponding information from the RNA-seq data and will not be considered while making a plot")

			else:
				logger.info("no similar genes to " + gene_name + " were found")

		#delete genes without information from the dictionary
		genes_to_show = dict([(k, v) for k, v in genes_to_show.items() if len(v) > 0])
		genes_to_show_groups.append(genes_to_show)

	return genes_to_show_groups

def read_bindetect(output_directory, bindetect, group_size):
	#first sort the file after the column mDuxNeg_mDuxPos_change
	#use the unix sort sort -k5nr
	logger.info("sorting the bindetect file after the column mDuxNeg_mDuxPos_change")
	sorted_bindetect = os.path.join(output_directory, "sorted_" + os.path.splitext(os.path.basename(bindetect))[0] + ".txt")
	os.system("sort -k5nr " + bindetect + " > " + sorted_bindetect)
	logger.info("sorted file is " + str(sorted_bindetect))

def read_rna_seq(rna_seq):
	rna_dict = {}

	with open(rna_seq) as read_file:
		for line in read_file:
			line_array = re.split(r'\s', line.rstrip('\n'))
			#ensid is in the first column, the basemean is in the second
			rna_dict[line_array[0]] = line_array[1]

		read_file.close()

	return rna_dict

def main():

	start = time.time()

	args = parse_args()

	groups = check_existing_input_files(args)
	output_directory = find_directory(args)

	#check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user
	#check_directory(args.output_directory)

	fh = logging.FileHandler(os.path.join(output_directory, "visualize.log"))
	fh.setLevel(logging.INFO)
	fh.setFormatter(formatter)
	logger.addHandler(fh)

	ch = logging.StreamHandler()
	ch.setLevel(logging.INFO)
	ch.setFormatter(formatter)
	logger.addHandler(ch)

	rna_dict = {}
	if args.rna_seq:
		rna_dict =  read_rna_seq(args.rna_seq)

	#read_bindetect(output_directory, args.bindetect, args.group_size)

	genes_to_show_groups = read_correlation(args.correlation, groups, args.variable, rna_dict)

	logger.info("making a plot")

	make_box_plot(args.groups, genes_to_show_groups, output_directory, get_name_from_path(args.output))

	logger.info("visualize.py needed %s seconds to generate the output" % (round(time.time() - start, 2)))

	for handler in logger.handlers:
		handler.close()
		logger.removeFilter(handler)

if __name__ == "__main__":
	main()

	"""
	This visualises the information found by find_exons.py
	@author: Anastasiia Petrova
	@contact: anastasiia.petrova(at)mpi-bn.mpg.de
	"""

	import argparse
	import sys
	import os
	import re
	import time
	import multiprocessing
	import logging
	import subprocess
	import numpy as np
	from collections import defaultdict
	from scipy import stats
	#from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni
	import matplotlib.pyplot as plt
	import random
	import textwrap

	#from gtfparse import read_gtf #makes the logger to look different o_o

	logger = logging.getLogger('visualize')
	logger.setLevel(logging.INFO)

	formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M")

	def parse_args():

	parser = argparse.ArgumentParser(prog = '', description = textwrap.dedent('''

	This script produces plots to show the number of exons for the list of genes of interest.
	'''), epilog='That is what you need to make this script work for you. Enjoy it')

	required_arguments = parser.add_argument_group('required arguments')
	required_arguments.add_argument('--correlation', help='a .txt file with correlation between genes length and the length of exons', required=True)
	#all other arguments are optional
	parser.add_argument('--variable', type=int, default=1, help='enter how many chars in the name of a gene can be different from the name of gene you are searching for')
	parser.add_argument('--groups', nargs='*', dest='groups', help='enter one or more files containing groups of genes you are interested in, the genes should be separated with tabs or printed in one column')
	parser.add_argument('--rna_seq', help='enter the file with rna seq data')
	parser.add_argument('--bindetect', help='enter the file with results from TOBIAS')
	parser.add_argument('--group_size', type=int, default=10, help='if the file from TOBIAS was provided, enter the size of groups to create, by default 10')
	parser.add_argument('--output', help='enter the name of the output file, by default ./boxplot', default='./boxplot')

	args = parser.parse_args()

	return args

	def find_directory(args):
	return os.path.splitext(os.path.dirname(args.output))[0]

	def get_name_from_path(full_path):
	return os.path.splitext(os.path.basename(full_path))[0]

	def check_existing_input_files(args):

	print('checking the input file')

	if not os.path.isfile(args.correlation):
	print('please make sure the .txt file with genes-exons correlation exists')
	sys.exit()

	if args.rna_seq and not os.path.isfile(args.rna_seq):
	print('please make sure the file with RNA-seq data exists')
	sys.exit()

	groups = []
	if args.bindetect:
	if not os.path.isfile(args.bindetect):
	print('please make sure the file with information from TOBIAS exists')
	sys.exit()
	else:
	if not args.groups:
	print('please provide your own groups for visualisation or the file from TOBIAS to make the groups')
	sys.exit()
	else:
	for group_file in args.groups:
	if not os.path.isfile(group_file):
	print('the path for the input file ' + str(group_file) + ' could not be opened')
	sys.exit()
	else:
	#group_name = "group_" + str(args.groups.index(group_file) + 1)
	one_group = []
	with open(group_file) as opened_group_file:
	for line in opened_group_file:
	line_array = re.split(r'\t', line.rstrip('\n'))
	for gene in line_array:
	if ' ' in gene:
	print('please make sure that genes in the file ' + str(group_file) + ' are separated with tabs or are written each on a new line')
	sys.exit()
	else:
	one_group.append(gene)
	opened_group_file.close()
	groups.append(one_group)

	return groups

	def compare_with_each(groups, groups_names):
	if len(groups) >= 2:
	start = 1
	for group in groups:
	for i in range(start, len(groups)):
	make_ttest(group, groups[i], groups_names[groups.index(group)], groups_names[i])
	start += 1
	else:
	logger.info("There is nothing to compare!")

	def make_ttest(group1, group2, group1_name, group2_name):
	stat_score, p_value = stats.ttest_ind(group1, group2) #, equal_var = False)
	if p_value <= 0.05:
	logger.info("The average of " + str(group1_name) + " is NOT EQUAL with " + str(group2_name))
	logger.info("Ttest p-value is " + str(p_value))
	else:
	logger.info("The average of " + str(group1_name) + " is EQUAL with " + str(group2_name))
	logger.info("Ttest p-value is " + str(p_value))

	def make_box_plot(groups_files_names, groups_genes_of_interest, output_directory, figure_name):

	groups_names = []

	for group_file in groups_files_names:
	groups_names.append(get_name_from_path(group_file))

	data = []

	for genes_of_interest in groups_genes_of_interest:

	group_data = []

	#first sort the genes_of_interest dictionary
	sorted_genes_of_interest = sorted(genes_of_interest, key = lambda x: np.mean(genes_of_interest[x]))

	for gene in sorted_genes_of_interest:
	group_data.extend(genes_of_interest[gene])

	data.append(group_data)

	logger.info("Making a statistical test")

	compare_with_each(data, groups_names)

	fig, ax = plt.subplots(figsize=(10,10))

	bp = ax.boxplot(data, notch=False, sym='o', vert=True, whis=1.5, patch_artist=True)

	color_name = 'navy'
	for patch in bp['boxes']:
	patch.set_facecolor(color_name)

	# Add a horizontal grid to the plot, but make it very light in color
	ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)

	ax.set_title('The visualization of correlation between gene and exons length')
	ax.set_xlabel('Genes groups')
	ax.set_ylabel('Percentage of exons len')

	ax.set_xticklabels(groups_names, rotation=45, fontsize=8)

	fig.savefig(os.path.join(output_directory, figure_name))
	logger.info("saving the plot to the file " + str(os.path.join(output_directory, figure_name)))

	plt.show()

	def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):

	the_max = max(max(x_array), max(y_array))

	plt.xlim(0, the_max)
	plt.ylim(0, the_max)

	fig, ax = plt.subplots()
	plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5)
	plt.plot(x_array, y_array, 'o', color = color_name)

	plt.xlabel("whole gene length")
	plt.ylabel("sum length of exons")

	for i,j,name in zip(x_array, y_array, gene_names):
	ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small')

	fig.savefig(os.path.join(output_directory, figure_name))

	#plt.show()

	def read_correlation(correlation_file, groups, variable, rna_dict):

	#read the text file and make a dictionary containing genes with array of their scores
	genes_to_show_groups = []

	for genes_of_interest in groups:
	genes_to_show = {}

	for gene_name in genes_of_interest:

	orig_gene_name = gene_name.lower()

	genes_to_show[orig_gene_name] = genes_to_show.get(orig_gene_name, []) #get a place in the dictionary for this gene

	#first look for the genes in the gtf_genes file
	genes_family = subprocess.getoutput("grep -i " + gene_name + " " + correlation_file)

	if len(genes_family) != 0:
	logger.info("working with the gene " + gene_name)

	#the score is in [3], the tsl is in [2], the ensemle_gene_id is in [7]

	for genes_line in re.split(r'\n', genes_family):
	genes_array = re.split(r'\t', genes_line)

	if genes_array[7] in rna_dict.keys():
	#if this geneis is saved in the rnaseq data
	check_gene_name = genes_array[0].lower()

	if check_gene_name == orig_gene_name:
	#this is a perfect match <3
	genes_to_show[check_gene_name].append(float(genes_array[3])) #append score to the array
	elif check_gene_name.startswith(orig_gene_name) and len(genes_array[0]) <= len(gene_name) + variable: #the found gene is by default 1 char longer than the gene name
	if check_gene_name in genes_to_show.keys():
	genes_to_show[check_gene_name].append(float(genes_array[3]))
	else: #this gene is not yet saved
	#make a new place in the dictionary for this gene
	genes_to_show[check_gene_name] = genes_to_show.get(check_gene_name, [])
	genes_to_show[check_gene_name].append(float(genes_array[3]))
	else:
	logger.info(gene_name + " has no corresponding information from the RNA-seq data and will not be considered while making a plot")

	else:
	logger.info("no similar genes to " + gene_name + " were found")

	#delete genes without information from the dictionary
	genes_to_show = dict([(k, v) for k, v in genes_to_show.items() if len(v) > 0])
	genes_to_show_groups.append(genes_to_show)

	return genes_to_show_groups

	def read_bindetect(output_directory, bindetect, group_size):
	#first sort the file after the column mDuxNeg_mDuxPos_change
	#use the unix sort sort -k5nr
	logger.info("sorting the bindetect file after the column mDuxNeg_mDuxPos_change")
	sorted_bindetect = os.path.join(output_directory, "sorted_" + os.path.splitext(os.path.basename(bindetect))[0] + ".txt")
	os.system("sort -k5nr " + bindetect + " > " + sorted_bindetect)
	logger.info("sorted file is " + str(sorted_bindetect))

	def read_rna_seq(rna_seq):
	rna_dict = {}

	with open(rna_seq) as read_file:
	for line in read_file:
	line_array = re.split(r'\s', line.rstrip('\n'))
	#ensid is in the first column, the basemean is in the second
	rna_dict[line_array[0]] = line_array[1]

	read_file.close()

	return rna_dict

	def main():

	start = time.time()

	args = parse_args()

	groups = check_existing_input_files(args)
	output_directory = find_directory(args)

	#check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user
	#check_directory(args.output_directory)

	fh = logging.FileHandler(os.path.join(output_directory, "visualize.log"))
	fh.setLevel(logging.INFO)
	fh.setFormatter(formatter)
	logger.addHandler(fh)

	ch = logging.StreamHandler()
	ch.setLevel(logging.INFO)
	ch.setFormatter(formatter)
	logger.addHandler(ch)

	rna_dict = {}
	if args.rna_seq:
	rna_dict = read_rna_seq(args.rna_seq)

	#read_bindetect(output_directory, args.bindetect, args.group_size)

	genes_to_show_groups = read_correlation(args.correlation, groups, args.variable, rna_dict)

	logger.info("making a plot")

	make_box_plot(args.groups, genes_to_show_groups, output_directory, get_name_from_path(args.output))

	logger.info("visualize.py needed %s seconds to generate the output" % (round(time.time() - start, 2)))

	for handler in logger.handlers:
	handler.close()
	logger.removeFilter(handler)

	if __name__ == "__main__":
	main()