diff --git a/find_exons.py b/find_exons.py index 4505180..f9ecd38 100644 --- a/find_exons.py +++ b/find_exons.py @@ -23,6 +23,7 @@ import matplotlib.pyplot as plt import random import textwrap +import seaborn as sns #from gtfparse import read_gtf #makes the logger to look different o_o @@ -109,18 +110,21 @@ def check_existing_input_files(args): def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): + gene_groups = {} + logger.info("looking for genes of interest in gtf files") if exact: - grep = "grep -w " + grep = "grep -w -i " else: - grep = "grep " + grep = "grep -i " all_genes = {} for gene_name in genes_of_interest: + gene_groups[gene_name] = [] #first look for the genes in the gtf_genes file - small_gtf_genes = subprocess.getoutput(grep + gtf_genes + " -e \"" + gene_name + "\"")#find the first line of the input file + small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes)#find the first line of the input file if len(small_gtf_genes) != 0: @@ -138,12 +142,15 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): exons = [] #all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons} all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons} + gene_groups[gene_name].append(gene_full_name) #find the nubmer and the length of all exons for each gene of interest - small_gtf_exons = subprocess.getoutput("grep " + gtf_exons + " -e \"" + gene_name + "\"") + small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons) for exons_line in re.split(r'\n', small_gtf_exons): + #print(exons_line) genes_array = re.split(r'\t', exons_line) + #print(genes_array) exon_start = int(genes_array[3]) exon_end = int(genes_array[4]) exon_length = exon_end - exon_start @@ -170,7 +177,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): print("None of the given list of genes was found in the .gtf file") sys.exit() else: - return all_genes + return all_genes, gene_groups def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name): @@ -193,7 +200,31 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color plt.show() -def plot_and_output(all_genes, output_directory): +def make_reg_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name): + + the_max = max(max(x_array), max(y_array)) + + #plt.xlim(0, the_max) + #plt.ylim(0, the_max) + + #fig, ax = plt.subplots() + #plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5) + #plt.plot(x_array, y_array, 'o', color = color_name) + sns.set() + sns.regplot(np.array(x_array), np.array(y_array)) + sns.plt.show() + + #plt.xlabel("whole gene length") + #plt.ylabel("sum length of exons") + + #for i,j,name in zip(x_array, y_array, gene_names): + # ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small') + + #fig.savefig(os.path.join(output_directory, figure_name)) + + #plt.show() + +def plot_and_output(all_genes, output_directory, gene_groups): logger.info("preparing for plotting") @@ -206,6 +237,7 @@ def plot_and_output(all_genes, output_directory): for gene in all_genes: #first sort the exons array exons_sorted_after_length = sorted(all_genes[gene]['exons'], key=lambda x: x[0], reverse=True) + #then sort after exon number and transcription support level sorted_exons = sorted(exons_sorted_after_length, key = lambda x: (x[1], x[2])) check_exon = 1 @@ -220,11 +252,30 @@ def plot_and_output(all_genes, output_directory): score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) + # TODO make one nice dictionary instead of all these gene_names.append(gene) gene_lengths.append(all_genes[gene]['gene_length']) exons_lengths.append(sum_length) scores.append(score) + logger.info('extracting the best genes') + + print(gene_names) + + for gene_original in gene_groups.keys(): + #look inside the nice dictionary for the gene with the biggest score and if the scores are the same, look for the biggest length + #if we found one gene to save, than make all these arrays with gene_names, gene_lengths and everything + check_score = 0 + + gene_to_save = '' + for test_gene in gene_groups[gene_original]: + gene_index = gene_names.index(test_gene) + + print(gene_original) + #look for this gene group within all gene_names and + print(gene_groups[gene_original]) + print(gene_names.index('AL589880.1')) + logger.info("writing the output file") #now write an output file containing names of genes, their length, the sum length of exons and the prozent of exons in the gene length as score @@ -268,10 +319,12 @@ def main(): logger.info("find_exons.py was called using these parameters:") logger.info(vars(args)) - all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact) - plot_and_output(all_genes, args.output_directory) - logger.info("find_exons needed %s seconds to generate the output" % (time.time() - start)) + all_genes, gene_groups = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact) + plot_and_output(all_genes, args.output_directory, gene_groups) + + + logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2))) for handler in logger.handlers: handler.close()