adding transcript name to the output file and do not chose the best t…

…ranscript, just save everything
anastasiia · Nov 26, 2018 · 3549b97 · 3549b97
1 parent 50efe41
commit 3549b97
Showing 1 changed file with 46 additions and 12 deletions.
diff --git a/find_exons.py b/find_exons.py
@@ -124,9 +124,10 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 	for gene_name in genes_of_interest:
 
 		#first look for the genes in the gtf_genes file
-		small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes + " | grep -v \"pseudogene\"")#find the first line of the input file
+		small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes + " | grep \"protein_coding\"") #" | grep -v \"pseudogene\"")#find the first line of the input file
 
 		if len(small_gtf_genes) != 0:
+			logger.info("working with the gene " + gene_name)
 			gene_groups[gene_name] = []
 
 			for genes_line in re.split(r'\n', small_gtf_genes):
@@ -146,7 +147,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 				gene_groups[gene_name].append(gene_full_name)
 
 			#find the nubmer and the length of all exons for each gene of interest
-			small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons + " | grep -v \"pseudogene\"")
+			small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons + " | grep \"protein_coding\"")
 
 			for exons_line in re.split(r'\n', small_gtf_exons):
 				#print(exons_line)
@@ -167,9 +168,11 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 							tsl = 6
 						else:
 							tsl = int(tsl)
+					elif attribut.startswith("transcript_name"):
+						transcript_name = re.split(r'\s', attribut)[1].replace('"', '')
 
 				if gene_full_name in all_genes.keys():
-					all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end])
+					all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name])
 
 		else:
 			logger.info("The gene " + gene_name + " was not found in the .gtf file with genes")
@@ -234,6 +237,9 @@ def plot_and_output(all_genes, output_directory, gene_groups):
 	exons_lengths = []
 	scores = []
 
+	#TRY
+	genes_with_transcript_names = {}
+
 	#find the sum length of exons for each gene
 	for gene in all_genes:
 		"""
@@ -242,23 +248,26 @@ def plot_and_output(all_genes, output_directory, gene_groups):
 		#then sort after exon number and transcription support level
 		sorted_exons = sorted(exons_sorted_after_length, key = lambda x: (x[1], x[2]))
 		"""
+		"""
 		#TRY sort after the start and and positions
 		sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[3], x[4]))
-
-		#print(sorted_exons)
+		"""
+		#TRY sort after transcript name and then after the exon number, and then after transcription support level
+		sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[5], x[1], x[2]))
 
 		check_exon = 1
 		sum_length = 0
-		last_exon = sorted_exons[-1][1] 
 
 		"""
+		last_exon = sorted_exons[-1][1] 
+
 		while check_exon <= last_exon:
 			if sorted_exons[0][1] == check_exon: #check the current exon_number
 				sum_length = sum_length + sorted_exons[0][0]
 				sorted_exons = [x for x in sorted_exons if x[1] != check_exon] #cut the current sorted_exons array so that there are no occurences of the current exon there
 			check_exon = check_exon + 1
 		"""
-
+		"""
 		#TRY i am trying to work with the positions of exons to reduce overlaps
 		start_pos = int(sorted_exons[0][3]) #save the start position of the first exon
 		end_pos = int(sorted_exons[0][4]) #save the end position of the first exon
@@ -294,11 +303,28 @@ def plot_and_output(all_genes, output_directory, gene_groups):
 				#	print(start_pos)
 				#	print(end_pos)
 				#	print()
+		"""
+		transcript_name = sorted_exons[0][5] #save the transcript name of the first exon
+		sum_length = sorted_exons[0][0] #save the length of the best first exon
+		#genes_with_transcript_names[transcript_name] = genes_with_transcript_names.get(transcript_name, {})
+
+		for i in range(1, len(sorted_exons)):
+			if sorted_exons[i][5] != transcript_name or i == len(sorted_exons) - 1: #this is a new transcript or the last one
+				#save the previous transcript_name
+				score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)
+				genes_with_transcript_names[transcript_name] = genes_with_transcript_names.get(transcript_name, {})
+				genes_with_transcript_names[transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score}
+				transcript_name = sorted_exons[i][5]
+				sum_length = sorted_exons[i][0]
+			else: #this is still the same transcript name
+				#save the length of this exon
+				sum_length = sum_length + sorted_exons[i][0]
 
-		score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)
-
+
+		"""
 		all_genes[gene]['sum_length'] = sum_length
 		all_genes[gene]['score'] = score
+		"""
 
 		"""
 		#for test purposes
@@ -308,7 +334,7 @@ def plot_and_output(all_genes, output_directory, gene_groups):
 		scores.append(score)
 		"""
 
-	#"""
+	"""
 	logger.info('extracting the best genes')
 
 	#print(gene_names)
@@ -341,17 +367,25 @@ def plot_and_output(all_genes, output_directory, gene_groups):
 		scores.append(all_genes[gene_to_save]['score'])
 	
 		#print("i am saving this gene ", all_genes[gene_to_save])
-	#"""
+	"""
+	#sorted_dict = sorted(dict_motifs_p_values.items(), key = lambda x : (x[1]['adjusted_p_value']), reverse = False)
+
+	genes_with_transcript_names = sorted(genes_with_transcript_names.items(), key = lambda x: (x[1]['gene'], x[0]))
 
 	logger.info("writing the output file")
 
 	#now write an output file containing names of genes, their length, the sum length of exons and the prozent of exons in the gene length as score
 	output_file = open(os.path.join(output_directory, "genes_exons_correlation.txt"), 'w')
-	header = ["#gene_name", "exons_len_percentage", "gene_length", "exons_sum_length"]
+	header = ["#gene_name", "transcript_name", "exons_len_percentage", "gene_length", "exons_sum_length"]
 	output_file.write('\t'.join(header) + '\n') #write the header
 
+	"""
 	for i in range(len(gene_names)):
 		output_file.write('\t'.join([gene_names[i], str(scores[i]), str(gene_lengths[i]), str(exons_lengths[i])]) + '\n')
+	"""
+
+	for transcript in genes_with_transcript_names:
+		output_file.write('\t'.join([transcript[1]['gene'], transcript[0], str(transcript[1]['score']),str(transcript[1]['gene_length']), str(transcript[1]['sum_length'])]) + '\n')
 
 	output_file.close()