Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
find_exons_number/visualize.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
301 lines (218 sloc)
10.6 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This visualises the information found by find_exons.py | |
@author: Anastasiia Petrova | |
@contact: anastasiia.petrova(at)mpi-bn.mpg.de | |
""" | |
import argparse | |
import sys | |
import os | |
import re | |
import time | |
import multiprocessing | |
import logging | |
import subprocess | |
import numpy as np | |
from collections import defaultdict | |
from scipy import stats | |
#from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni | |
import matplotlib.pyplot as plt | |
import random | |
import textwrap | |
#from gtfparse import read_gtf #makes the logger to look different o_o | |
logger = logging.getLogger('visualize') | |
logger.setLevel(logging.INFO) | |
formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M") | |
def parse_args(): | |
parser = argparse.ArgumentParser(prog = '', description = textwrap.dedent(''' | |
This script produces plots to show the number of exons for the list of genes of interest. | |
'''), epilog='That is what you need to make this script work for you. Enjoy it') | |
required_arguments = parser.add_argument_group('required arguments') | |
required_arguments.add_argument('--correlation', help='a .txt file with correlation between genes length and the length of exons', required=True) | |
#all other arguments are optional | |
parser.add_argument('--variable', type=int, default=1, help='enter how many chars in the name of a gene can be different from the name of gene you are searching for') | |
parser.add_argument('--groups', nargs='*', dest='groups', help='enter one or more files containing groups of genes you are interested in, the genes should be separated with tabs or printed in one column') | |
parser.add_argument('--rna_seq', help='enter the file with rna seq data') | |
parser.add_argument('--bindetect', help='enter the file with results from TOBIAS') | |
parser.add_argument('--group_size', type=int, default=10, help='if the file from TOBIAS was provided, enter the size of groups to create, by default 10') | |
parser.add_argument('--output', help='enter the name of the output file, by default ./boxplot', default='./boxplot') | |
args = parser.parse_args() | |
return args | |
def find_directory(args): | |
return os.path.splitext(os.path.dirname(args.output))[0] | |
def get_name_from_path(full_path): | |
return os.path.splitext(os.path.basename(full_path))[0] | |
def check_existing_input_files(args): | |
print('checking the input file') | |
if not os.path.isfile(args.correlation): | |
print('please make sure the .txt file with genes-exons correlation exists') | |
sys.exit() | |
if args.rna_seq and not os.path.isfile(args.rna_seq): | |
print('please make sure the file with RNA-seq data exists') | |
sys.exit() | |
groups = [] | |
if args.bindetect: | |
if not os.path.isfile(args.bindetect): | |
print('please make sure the file with information from TOBIAS exists') | |
sys.exit() | |
else: | |
if not args.groups: | |
print('please provide your own groups for visualisation or the file from TOBIAS to make the groups') | |
sys.exit() | |
else: | |
for group_file in args.groups: | |
if not os.path.isfile(group_file): | |
print('the path for the input file ' + str(group_file) + ' could not be opened') | |
sys.exit() | |
else: | |
#group_name = "group_" + str(args.groups.index(group_file) + 1) | |
one_group = [] | |
with open(group_file) as opened_group_file: | |
for line in opened_group_file: | |
line_array = re.split(r'\t', line.rstrip('\n')) | |
for gene in line_array: | |
if ' ' in gene: | |
print('please make sure that genes in the file ' + str(group_file) + ' are separated with tabs or are written each on a new line') | |
sys.exit() | |
else: | |
one_group.append(gene) | |
opened_group_file.close() | |
groups.append(one_group) | |
return groups | |
def compare_with_each(groups, groups_names): | |
if len(groups) >= 2: | |
start = 1 | |
for group in groups: | |
for i in range(start, len(groups)): | |
make_ttest(group, groups[i], groups_names[groups.index(group)], groups_names[i]) | |
start += 1 | |
else: | |
logger.info("There is nothing to compare!") | |
def make_ttest(group1, group2, group1_name, group2_name): | |
stat_score, p_value = stats.ttest_ind(group1, group2) #, equal_var = False) | |
if p_value <= 0.05: | |
logger.info("The average of " + str(group1_name) + " is NOT EQUAL with " + str(group2_name)) | |
logger.info("Ttest p-value is " + str(p_value)) | |
else: | |
logger.info("The average of " + str(group1_name) + " is EQUAL with " + str(group2_name)) | |
logger.info("Ttest p-value is " + str(p_value)) | |
def make_box_plot(groups_files_names, groups_genes_of_interest, output_directory, figure_name): | |
groups_names = [] | |
for group_file in groups_files_names: | |
groups_names.append(get_name_from_path(group_file)) | |
data = [] | |
for genes_of_interest in groups_genes_of_interest: | |
group_data = [] | |
#first sort the genes_of_interest dictionary | |
sorted_genes_of_interest = sorted(genes_of_interest, key = lambda x: np.mean(genes_of_interest[x])) | |
for gene in sorted_genes_of_interest: | |
group_data.extend(genes_of_interest[gene]) | |
data.append(group_data) | |
logger.info("Making a statistical test") | |
compare_with_each(data, groups_names) | |
fig, ax = plt.subplots(figsize=(10,10)) | |
bp = ax.boxplot(data, notch=False, sym='o', vert=True, whis=1.5, patch_artist=True) | |
color_name = 'navy' | |
for patch in bp['boxes']: | |
patch.set_facecolor(color_name) | |
# Add a horizontal grid to the plot, but make it very light in color | |
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5) | |
ax.set_title('The visualization of correlation between gene and exons length') | |
ax.set_xlabel('Genes groups') | |
ax.set_ylabel('Percentage of exons len') | |
ax.set_xticklabels(groups_names, rotation=45, fontsize=8) | |
fig.savefig(os.path.join(output_directory, figure_name)) | |
logger.info("saving the plot to the file " + str(os.path.join(output_directory, figure_name))) | |
plt.show() | |
def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name): | |
the_max = max(max(x_array), max(y_array)) | |
plt.xlim(0, the_max) | |
plt.ylim(0, the_max) | |
fig, ax = plt.subplots() | |
plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5) | |
plt.plot(x_array, y_array, 'o', color = color_name) | |
plt.xlabel("whole gene length") | |
plt.ylabel("sum length of exons") | |
for i,j,name in zip(x_array, y_array, gene_names): | |
ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small') | |
fig.savefig(os.path.join(output_directory, figure_name)) | |
#plt.show() | |
def read_correlation(correlation_file, groups, variable, rna_dict): | |
#read the text file and make a dictionary containing genes with array of their scores | |
genes_to_show_groups = [] | |
for genes_of_interest in groups: | |
genes_to_show = {} | |
for gene_name in genes_of_interest: | |
orig_gene_name = gene_name.lower() | |
genes_to_show[orig_gene_name] = genes_to_show.get(orig_gene_name, []) #get a place in the dictionary for this gene | |
#first look for the genes in the gtf_genes file | |
genes_family = subprocess.getoutput("grep -i " + gene_name + " " + correlation_file) | |
if len(genes_family) != 0: | |
logger.info("working with the gene " + gene_name) | |
#the score is in [3], the tsl is in [2], the ensemle_gene_id is in [7] | |
for genes_line in re.split(r'\n', genes_family): | |
genes_array = re.split(r'\t', genes_line) | |
if genes_array[7] in rna_dict.keys(): | |
#if this geneis is saved in the rnaseq data | |
check_gene_name = genes_array[0].lower() | |
if check_gene_name == orig_gene_name: | |
#this is a perfect match <3 | |
genes_to_show[check_gene_name].append(float(genes_array[3])) #append score to the array | |
elif check_gene_name.startswith(orig_gene_name) and len(genes_array[0]) <= len(gene_name) + variable: #the found gene is by default 1 char longer than the gene name | |
if check_gene_name in genes_to_show.keys(): | |
genes_to_show[check_gene_name].append(float(genes_array[3])) | |
else: #this gene is not yet saved | |
#make a new place in the dictionary for this gene | |
genes_to_show[check_gene_name] = genes_to_show.get(check_gene_name, []) | |
genes_to_show[check_gene_name].append(float(genes_array[3])) | |
else: | |
logger.info(gene_name + " has no corresponding information from the RNA-seq data and will not be considered while making a plot") | |
else: | |
logger.info("no similar genes to " + gene_name + " were found") | |
#delete genes without information from the dictionary | |
genes_to_show = dict([(k, v) for k, v in genes_to_show.items() if len(v) > 0]) | |
genes_to_show_groups.append(genes_to_show) | |
return genes_to_show_groups | |
def read_bindetect(output_directory, bindetect, group_size): | |
#first sort the file after the column mDuxNeg_mDuxPos_change | |
#use the unix sort sort -k5nr | |
logger.info("sorting the bindetect file after the column mDuxNeg_mDuxPos_change") | |
sorted_bindetect = os.path.join(output_directory, "sorted_" + os.path.splitext(os.path.basename(bindetect))[0] + ".txt") | |
os.system("sort -k5nr " + bindetect + " > " + sorted_bindetect) | |
logger.info("sorted file is " + str(sorted_bindetect)) | |
def read_rna_seq(rna_seq): | |
rna_dict = {} | |
with open(rna_seq) as read_file: | |
for line in read_file: | |
line_array = re.split(r'\s', line.rstrip('\n')) | |
#ensid is in the first column, the basemean is in the second | |
rna_dict[line_array[0]] = line_array[1] | |
read_file.close() | |
return rna_dict | |
def main(): | |
start = time.time() | |
args = parse_args() | |
groups = check_existing_input_files(args) | |
output_directory = find_directory(args) | |
#check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user | |
#check_directory(args.output_directory) | |
fh = logging.FileHandler(os.path.join(output_directory, "visualize.log")) | |
fh.setLevel(logging.INFO) | |
fh.setFormatter(formatter) | |
logger.addHandler(fh) | |
ch = logging.StreamHandler() | |
ch.setLevel(logging.INFO) | |
ch.setFormatter(formatter) | |
logger.addHandler(ch) | |
rna_dict = {} | |
if args.rna_seq: | |
rna_dict = read_rna_seq(args.rna_seq) | |
#read_bindetect(output_directory, args.bindetect, args.group_size) | |
genes_to_show_groups = read_correlation(args.correlation, groups, args.variable, rna_dict) | |
logger.info("making a plot") | |
make_box_plot(args.groups, genes_to_show_groups, output_directory, get_name_from_path(args.output)) | |
logger.info("visualize.py needed %s seconds to generate the output" % (round(time.time() - start, 2))) | |
for handler in logger.handlers: | |
handler.close() | |
logger.removeFilter(handler) | |
if __name__ == "__main__": | |
main() |