changing the idea on how to check for the best gene

anastasiia · Nov 23, 2018 · 8a3ab2e · 8a3ab2e
1 parent 4a31126
commit 8a3ab2e
Showing 1 changed file with 62 additions and 9 deletions.
diff --git a/find_exons.py b/find_exons.py
@@ -23,6 +23,7 @@
 import matplotlib.pyplot as plt
 import random
 import textwrap
+import seaborn as sns
 
 #from gtfparse import read_gtf #makes the logger to look different o_o
 
@@ -109,18 +110,21 @@ def check_existing_input_files(args):
 
 def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 
+	gene_groups = {}
+
 	logger.info("looking for genes of interest in gtf files")
 
 	if exact:
-		grep = "grep -w "
+		grep = "grep -w -i "
 	else:
-		grep = "grep "
+		grep = "grep -i "
 
 	all_genes = {}
 
 	for gene_name in genes_of_interest:
+		gene_groups[gene_name] = []
 		#first look for the genes in the gtf_genes file
-		small_gtf_genes = subprocess.getoutput(grep + gtf_genes + " -e \"" + gene_name + "\"")#find the first line of the input file
+		small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes)#find the first line of the input file
 
 		if len(small_gtf_genes) != 0:
 
@@ -138,12 +142,15 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 				exons = []
 				#all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons}
 				all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons}
+				gene_groups[gene_name].append(gene_full_name)
 
 			#find the nubmer and the length of all exons for each gene of interest
-			small_gtf_exons = subprocess.getoutput("grep " + gtf_exons + " -e \"" + gene_name + "\"")
+			small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons)
 
 			for exons_line in re.split(r'\n', small_gtf_exons):
+				#print(exons_line)
 				genes_array = re.split(r'\t', exons_line)
+				#print(genes_array)
 				exon_start = int(genes_array[3])
 				exon_end = int(genes_array[4])
 				exon_length = exon_end - exon_start
@@ -170,7 +177,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 		print("None of the given list of genes was found in the .gtf file")
 		sys.exit()
 	else:
-		return all_genes
+		return all_genes, gene_groups
 
 def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):
 
@@ -193,7 +200,31 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color
 
 	plt.show()
 
-def plot_and_output(all_genes, output_directory):
+def make_reg_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):
+
+	the_max = max(max(x_array), max(y_array))
+
+	#plt.xlim(0, the_max)
+	#plt.ylim(0, the_max)
+
+	#fig, ax = plt.subplots()
+	#plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5)
+	#plt.plot(x_array, y_array, 'o', color = color_name)
+	sns.set()
+	sns.regplot(np.array(x_array), np.array(y_array))
+	sns.plt.show()
+
+	#plt.xlabel("whole gene length")
+	#plt.ylabel("sum length of exons")
+
+	#for i,j,name in zip(x_array, y_array, gene_names):
+	#	ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small')
+
+	#fig.savefig(os.path.join(output_directory, figure_name))
+
+	#plt.show()
+
+def plot_and_output(all_genes, output_directory, gene_groups):
 
 	logger.info("preparing for plotting")
 
@@ -206,6 +237,7 @@ def plot_and_output(all_genes, output_directory):
 	for gene in all_genes:
 		#first sort the exons array
 		exons_sorted_after_length = sorted(all_genes[gene]['exons'], key=lambda x: x[0], reverse=True)
+		#then sort after exon number and transcription support level
 		sorted_exons = sorted(exons_sorted_after_length, key = lambda x: (x[1], x[2]))
 
 		check_exon = 1
@@ -220,11 +252,30 @@ def plot_and_output(all_genes, output_directory):
 
 		score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)
 
+		# TODO make one nice dictionary instead of all these
 		gene_names.append(gene)
 		gene_lengths.append(all_genes[gene]['gene_length'])
 		exons_lengths.append(sum_length)
 		scores.append(score)
 
+	logger.info('extracting the best genes')
+
+	print(gene_names)
+
+	for gene_original in gene_groups.keys():
+		#look inside the nice dictionary for the gene with the biggest score and if the scores are the same, look for the biggest length
+		#if we found one gene to save, than make all these arrays with gene_names, gene_lengths and everything
+		check_score = 0
+
+		gene_to_save = ''
+		for test_gene in gene_groups[gene_original]:
+			gene_index = gene_names.index(test_gene)
+
+		print(gene_original)
+		#look for this gene group within all gene_names and 
+		print(gene_groups[gene_original])
+		print(gene_names.index('AL589880.1'))
+
 	logger.info("writing the output file")
 
 	#now write an output file containing names of genes, their length, the sum length of exons and the prozent of exons in the gene length as score
@@ -268,10 +319,12 @@ def main():
 	logger.info("find_exons.py was called using these parameters:")
 	logger.info(vars(args))
 
-	all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact)
-	plot_and_output(all_genes, args.output_directory)
 
-	logger.info("find_exons needed %s seconds to generate the output" % (time.time() - start))
+	all_genes, gene_groups = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact)
+	plot_and_output(all_genes, args.output_directory, gene_groups)
+
+
+	logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2)))
 
 	for handler in logger.handlers:
 		handler.close()