From b10fba0c1afb76bd5089440979d3eceeb64683d0 Mon Sep 17 00:00:00 2001
From: anastasiia <anastasiia.petrova@mpi-bn.mpg.de>
Date: Thu, 27 Dec 2018 23:26:36 +0100
Subject: [PATCH] adding two new columns gene_id and transcript_id to the
 output file

---
 find_exons.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/find_exons.py b/find_exons.py
index 9bd28dd..5487a29 100644
--- a/find_exons.py
+++ b/find_exons.py
@@ -140,9 +140,14 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 						tsl = int(tsl)
 				elif attribut.startswith("transcript_name"):
 					transcript_name = re.split(r'\s', attribut)[1].replace('"', '')
+				elif attribut.startswith("transcript_id"):
+					transcript_id = re.split(r'\s', attribut)[1].replace('"', '')
+				elif attribut.startswith("gene_id"):
+					gene_id = re.split(r'\s', attribut)[1].replace('"', '')
 
 			if gene_full_name in all_genes.keys():
-				all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name])
+				all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name, transcript_id])
+				all_genes[gene_full_name]['gene_id'] = gene_id
 	else:
 		logger.info("none of the requested genes were found in the .gtf file")
 		sys.exit()
@@ -191,6 +196,7 @@ def write_outputfile(all_genes, output_directory):
 			tsls = []
 
 			transcript_name = sorted_exons[0][5] #save the transcript name of the first exon
+			transcript_id = sorted_exons[0][6] #save the transcript_id of the first exon
 			sum_length = sorted_exons[0][0] #save the length of the first exon
 			tsls.append(sorted_exons[0][2]) #transcript support level of the first exon
 
@@ -208,11 +214,12 @@ def write_outputfile(all_genes, output_directory):
 
 					gene_transcript_name = gene + "_" + transcript_name
 					genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {})
-					genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number}
+					genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number, 'gene_id': all_genes[gene]['gene_id'], 'transcript_id': transcript_id}
 
 					tsls = []
 					tsls.append(sorted_exons[i][2])
 					transcript_name = sorted_exons[i][5]
+					transcript_id = sorted_exons[i][6]
 					sum_length = sorted_exons[i][0]
 					
 				elif i == len(sorted_exons) - 1: #this is the last transcript
@@ -228,7 +235,7 @@ def write_outputfile(all_genes, output_directory):
 
 					gene_transcript_name = gene + "_" + transcript_name
 					genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {})
-					genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number}
+					genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number, 'gene_id': all_genes[gene]['gene_id'], 'transcript_id': transcript_id}
 
 				else: #this is still the same transcript name
 					tsls.append(sorted_exons[i][2]) #save the tsl to find the average tsl later on
@@ -242,11 +249,11 @@ def write_outputfile(all_genes, output_directory):
 
 	#now write an output file containing names of genes, their length, the sum length of exons and the prozent of exons in the gene length as score
 	output_file = open(os.path.join(output_directory, "genes_exons_correlation.txt"), 'w')
-	header = ["#gene_name", "transcript_name", "mean_tsl", "exons_len_percentage", "gene_length", "exons_number", "exons_sum_length"]
+	header = ["#gene_name", "transcript_name", "mean_tsl", "exons_len_percentage", "gene_length", "exons_number", "exons_sum_length", "gene_id", "transcript_id"]
 	output_file.write('\t'.join(header) + '\n') #write the header
 
 	for transcript in genes_with_transcript_names:
-		output_file.write('\t'.join([transcript[1]['gene'], transcript[0].replace(transcript[1]['gene'] + '_', ''), str(transcript[1]['mean_tsl']), str(transcript[1]['score']), str(transcript[1]['gene_length']), str(transcript[1]['exons_number']), str(transcript[1]['sum_length'])]) + '\n')
+		output_file.write('\t'.join([transcript[1]['gene'], transcript[0].replace(transcript[1]['gene'] + '_', ''), str(transcript[1]['mean_tsl']), str(transcript[1]['score']), str(transcript[1]['gene_length']), str(transcript[1]['exons_number']), str(transcript[1]['sum_length']), str(transcript[1]['gene_id']), str(transcript[1]['transcript_id'])]) + '\n')
 
 	output_file.close()