excluding pseudogenes while searching and saving only the best gene t…

…o the output file
anastasiia · Nov 26, 2018 · 0b7318e · 0b7318e
1 parent 8a3ab2e
commit 0b7318e
Showing 1 changed file with 82 additions and 15 deletions.
diff --git a/find_exons.py b/find_exons.py
@@ -122,11 +122,12 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 	all_genes = {}
 
 	for gene_name in genes_of_interest:
-		gene_groups[gene_name] = []
+
 		#first look for the genes in the gtf_genes file
-		small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes)#find the first line of the input file
+		small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes + " | grep -v \"pseudogene\"")#find the first line of the input file
 
 		if len(small_gtf_genes) != 0:
+			gene_groups[gene_name] = []
 
 			for genes_line in re.split(r'\n', small_gtf_genes):
 				genes_array = re.split(r'\t', genes_line)
@@ -145,7 +146,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 				gene_groups[gene_name].append(gene_full_name)
 
 			#find the nubmer and the length of all exons for each gene of interest
-			small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons)
+			small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons + " | grep -v \"pseudogene\"")
 
 			for exons_line in re.split(r'\n', small_gtf_exons):
 				#print(exons_line)
@@ -168,7 +169,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 							tsl = int(tsl)
 
 				if gene_full_name in all_genes.keys():
-					all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl])
+					all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end])
 
 		else:
 			logger.info("The gene " + gene_name + " was not found in the .gtf file with genes")
@@ -235,46 +236,112 @@ def plot_and_output(all_genes, output_directory, gene_groups):
 
 	#find the sum length of exons for each gene
 	for gene in all_genes:
+		"""
 		#first sort the exons array
 		exons_sorted_after_length = sorted(all_genes[gene]['exons'], key=lambda x: x[0], reverse=True)
 		#then sort after exon number and transcription support level
 		sorted_exons = sorted(exons_sorted_after_length, key = lambda x: (x[1], x[2]))
+		"""
+		#TRY sort after the start and and positions
+		sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[3], x[4]))
+
+		#print(sorted_exons)
 
 		check_exon = 1
 		sum_length = 0
 		last_exon = sorted_exons[-1][1] 
 
+		"""
 		while check_exon <= last_exon:
-			if sorted_exons[0][1] == check_exon:
+			if sorted_exons[0][1] == check_exon: #check the current exon_number
 				sum_length = sum_length + sorted_exons[0][0]
 				sorted_exons = [x for x in sorted_exons if x[1] != check_exon] #cut the current sorted_exons array so that there are no occurences of the current exon there
 			check_exon = check_exon + 1
+		"""
+
+		#TRY i am trying to work with the positions of exons to reduce overlaps
+		start_pos = int(sorted_exons[0][3]) #save the start position of the first exon
+		end_pos = int(sorted_exons[0][4]) #save the end position of the first exon
+
+		#if gene == "AL589843.2":
+		#	print(sorted_exons)
+		#	print(start_pos)
+		#	print(end_pos)
+		#	print()
+
+		for i in range(1, len(sorted_exons)): #start with the second exon, as we have already saved the first one
+			if int(sorted_exons[i][3]) < end_pos: #the current exon is overlapping the previous one
+				if not int(sorted_exons[i][4]) <= end_pos:
+					end_pos = int(sorted_exons[i][4]) #the new end position is the one of the current exon
+					if i == len(sorted_exons) - 1:
+						#this is the last exon and an overlap, so save it
+						previous_len = end_pos - start_pos
+						sum_length = sum_length + previous_len
+						#if gene == "AL589843.2":
+						#	print("set new end pos ", end_pos)
+						#	print(sum_length)
+						#	print()
+				#else do nothing
+			else: #the current exon does not overlap the previous one
+				#save the length of the previous exon
+				previous_len = end_pos - start_pos
+				sum_length = sum_length + previous_len
+				#and save the new start and end positions to check for overlaps
+				start_pos = int(sorted_exons[i][3])
+				end_pos = int(sorted_exons[i][4])
+				#if gene == "AL589843.2":
+				#	print(sum_length)
+				#	print(start_pos)
+				#	print(end_pos)
+				#	print()
 
 		score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)
 
-		# TODO make one nice dictionary instead of all these
+		all_genes[gene]['sum_length'] = sum_length
+		all_genes[gene]['score'] = score
+
+		"""
+		#for test purposes
 		gene_names.append(gene)
 		gene_lengths.append(all_genes[gene]['gene_length'])
 		exons_lengths.append(sum_length)
 		scores.append(score)
+		"""
 
+	#"""
 	logger.info('extracting the best genes')
 
-	print(gene_names)
+	#print(gene_names)
 
 	for gene_original in gene_groups.keys():
 		#look inside the nice dictionary for the gene with the biggest score and if the scores are the same, look for the biggest length
 		#if we found one gene to save, than make all these arrays with gene_names, gene_lengths and everything
-		check_score = 0
-
+		check_score = 0.0
 		gene_to_save = ''
-		for test_gene in gene_groups[gene_original]:
-			gene_index = gene_names.index(test_gene)
 
-		print(gene_original)
-		#look for this gene group within all gene_names and 
-		print(gene_groups[gene_original])
-		print(gene_names.index('AL589880.1'))
+		for test_gene in gene_groups[gene_original]:
+			#print(all_genes[test_gene])
+			if float(all_genes[test_gene]['score']) > check_score:
+				#print("score is bigger ", check_score, all_genes[test_gene]['score'])
+				check_score = float(all_genes[test_gene]['score'])
+				gene_to_save = test_gene
+			elif float(all_genes[test_gene]['score']) == check_score:
+				#print("score is the same ", check_score, all_genes[test_gene]['score'])
+				#check for the gene length
+				if gene_to_save != '' and int(all_genes[test_gene]['gene_length']) > int(all_genes[gene_to_save]['gene_length']):
+					gene_to_save = test_gene
+			#else do nothing
+
+		if gene_to_save == '': #the score for this gene is 0.0, so no best gene occurence could be found
+			gene_to_save = gene_original
+
+		gene_names.append(gene_to_save)
+		gene_lengths.append(all_genes[gene_to_save]['gene_length'])
+		exons_lengths.append(all_genes[gene_to_save]['sum_length'])
+		scores.append(all_genes[gene_to_save]['score'])
+
+		#print("i am saving this gene ", all_genes[gene_to_save])
+	#"""
 
 	logger.info("writing the output file")