From 1c199b0f806a1e4601f19fcd872d297cf90d3fe4 Mon Sep 17 00:00:00 2001
From: anastasiia <anastasiia.petrova@mpi-bn.mpg.de>
Date: Mon, 10 Dec 2018 15:26:45 +0100
Subject: [PATCH] adding the mean_tsl to the output file

---
 find_exons.py | 62 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 45 insertions(+), 17 deletions(-)

diff --git a/find_exons.py b/find_exons.py
index a36e9fd..297607d 100644
--- a/find_exons.py
+++ b/find_exons.py
@@ -105,7 +105,7 @@ def check_existing_input_files(args):
 	return genes_of_interest
 
 def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
-	logger.info("working with the gtf files")
+	logger.info("working with gtf files")
 
 	all_genes = {}
 
@@ -115,10 +115,10 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 	for gene in genes_of_interest:
 		grep = grep + "-e " + gene + ". " #. makes grep to look for genes starting with the name from the list of genes of interest
 
-	print(grep)
+	print(grep + "gtf_genes" + "| grep \"protein_coding\" | grep \"transcript_type \\\"protein_coding\\\"\"")
 
 	#finally add to the grep command so that we only look for protein coding
-	small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"")
+	small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"") #| grep \"transcript_type \"protein_coding\"\"" i dont know how to handle it :(
 	if len(small_gtf_genes) != 0:
 		#save the genes to the dictionary
 		for genes_line in re.split(r'\n', small_gtf_genes):
@@ -160,16 +160,16 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 					transcript_name = re.split(r'\s', attribut)[1].replace('"', '')
 
 			if gene_full_name in all_genes.keys():
-				print(gene_full_name)
+				#print(gene_full_name)
 				#print(all_genes[gene_full_name])
 				all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name])
 	else:
 		logger.info("none of the requested genes were found in the .gtf file")
 		sys.exit()
 
-	logger.info(len(re.split(r'\n', small_gtf_genes)))
-	logger.info(len(re.split(r'\n', small_gtf_exons)))
-	logger.info(len(all_genes))
+	#logger.info(len(re.split(r'\n', small_gtf_genes)))
+	#logger.info(len(re.split(r'\n', small_gtf_exons)))
+	#logger.info(len(all_genes))
 	#sys.exit()
 
 	return all_genes
@@ -195,7 +195,7 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color
 	
 	plt.show()
 
-def plot_and_output(all_genes, output_directory):
+def write_outputfile(all_genes, output_directory):
 
 	#logger.info("preparing for plotting")
 
@@ -214,22 +214,50 @@ def plot_and_output(all_genes, output_directory):
 			check_exon = 1
 			sum_length = 0
 
-			print(sorted_exons[0])
+			tsls = []
+
 			transcript_name = sorted_exons[0][5] #save the transcript name of the first exon
 			sum_length = sorted_exons[0][0] #save the length of the best first exon
+			tsls.append(sorted_exons[0][2]) #transcript support level of the first exon
 
 			for i in range(1, len(sorted_exons)):
-				if sorted_exons[i][5] != transcript_name or i == len(sorted_exons) - 1: #this is a new transcript or the last one
+				if sorted_exons[i][5] != transcript_name: #this is a new transcript
 					#save the previous transcript_name
-					score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)
+					score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) #which is actually the percentage of the sum length of the exons and the whole length of the gene
+
+					mean_tsl = "%.2f" % round(np.mean(tsls), 2) #find the average tsl for this transcript
+					if mean_tsl > 5:
+						mean_tsl = "NA"
+
+					exons_number = len(tsls)
+
 					gene_transcript_name = gene + "_" + transcript_name
 					genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {})
-					genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score} #anzahl von exons
+					genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number} #anzahl von exons
+
+					tsls = []
+					tsls.append(sorted_exons[i][2])
 					transcript_name = sorted_exons[i][5]
 					sum_length = sorted_exons[i][0]
-				else: #this is still the same transcript name
-					#save the length of this exon
+					
+				elif i == len(sorted_exons) - 1: #this is the last transcript
 					sum_length = sum_length + sorted_exons[i][0]
+					score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)
+
+					mean_tsl = "%.2f" % round(np.mean(tsls), 2) #find the average tsl for this transcript
+					if mean_tsl > 5:
+						mean_tsl = "NA"
+
+					exons_number = len(tsls)
+
+					gene_transcript_name = gene + "_" + transcript_name
+					genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {})
+					genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number}
+
+				else: #this is still the same transcript name
+					tsls.append(sorted_exons[i][2]) #save the tsl to find the average tsl later on
+					sum_length = sum_length + sorted_exons[i][0] #save the length of this exon
+					
 		#else do nothing, there are no exons found corresponding to this gene
 
 	genes_with_transcript_names = sorted(genes_with_transcript_names.items(), key = lambda x: (x[1]['gene'], x[0]))
@@ -238,11 +266,11 @@ def plot_and_output(all_genes, output_directory):
 
 	#now write an output file containing names of genes, their length, the sum length of exons and the prozent of exons in the gene length as score
 	output_file = open(os.path.join(output_directory, "genes_exons_correlation.txt"), 'w')
-	header = ["#gene_name", "transcript_name", "exons_len_percentage", "gene_length", "exons_sum_length"]
+	header = ["#gene_name", "transcript_name", "mean_tsl", "exons_len_percentage", "gene_length", "exons_number", "exons_sum_length"]
 	output_file.write('\t'.join(header) + '\n') #write the header
 
 	for transcript in genes_with_transcript_names:
-		output_file.write('\t'.join([transcript[1]['gene'], transcript[0].replace(transcript[1]['gene'] + '_', ''), str(transcript[1]['score']),str(transcript[1]['gene_length']), str(transcript[1]['sum_length'])]) + '\n')
+		output_file.write('\t'.join([transcript[1]['gene'], transcript[0].replace(transcript[1]['gene'] + '_', ''), str(transcript[1]['mean_tsl']), str(transcript[1]['score']), str(transcript[1]['gene_length']), str(transcript[1]['exons_number']), str(transcript[1]['sum_length'])]) + '\n')
 
 	output_file.close()
 
@@ -276,7 +304,7 @@ def main():
 
 
 	all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact)
-	plot_and_output(all_genes, args.output_directory)
+	write_outputfile(all_genes, args.output_directory)
 
 
 	logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2)))