final changes in comments and visual presentation

anastasiia · Dec 11, 2018 · d236c95 · d236c95
1 parent 1133d0f
commit d236c95
Showing 1 changed file with 5 additions and 8 deletions.
diff --git a/find_exons.py b/find_exons.py
@@ -16,8 +16,6 @@
 from collections import defaultdict
 import textwrap
 
-#from gtfparse import read_gtf #makes the logger to look different o_o
-
 logger = logging.getLogger('find_exons')
 logger.setLevel(logging.INFO)
 
@@ -108,8 +106,6 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 	for gene in genes_of_interest:
 		grep = grep + "-e " + gene + ". " #. makes grep to look for genes starting with the name from the list of genes of interest
 
-	#print(grep + "gtf_genes" + "| grep \"protein_coding\" | grep \"transcript_type \\\"protein_coding\\\"\"")
-
 	#finally add to the grep command so that we only look for protein coding
 	small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"") #extract only the rows containing protein_coding
 	if len(small_gtf_genes) != 0:
@@ -193,6 +189,7 @@ def write_outputfile(all_genes, output_directory):
 	#find the sum length of exons for each gene
 	for gene in all_genes:
 		if all_genes[gene]['exons'] != []:
+			#sort after transcript_name, exon_number and tsl
 			sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[5], x[1], x[2]))
 
 			check_exon = 1
@@ -201,10 +198,11 @@ def write_outputfile(all_genes, output_directory):
 			tsls = []
 
 			transcript_name = sorted_exons[0][5] #save the transcript name of the first exon
-			sum_length = sorted_exons[0][0] #save the length of the best first exon
+			sum_length = sorted_exons[0][0] #save the length of the first exon
 			tsls.append(sorted_exons[0][2]) #transcript support level of the first exon
 
 			for i in range(1, len(sorted_exons)):
+
 				if sorted_exons[i][5] != transcript_name: #this is a new transcript
 					#save the previous transcript_name
 					score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) #which is actually the percentage of the sum length of the exons and the whole length of the gene
@@ -226,8 +224,9 @@ def write_outputfile(all_genes, output_directory):
 
 				elif i == len(sorted_exons) - 1: #this is the last transcript
 					sum_length = sum_length + sorted_exons[i][0]
-					score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)
+					tsls.append(sorted_exons[i][2])
 
+					score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)
 					mean_tsl = int(np.mean(tsls)) #find the average tsl for this transcript
 					if mean_tsl > 5:
 						mean_tsl = "NA"
@@ -289,11 +288,9 @@ def main():
 	#logger.info("find_exons.py was called using these parameters:")
 	#logger.info(vars(args))
 
-
 	all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact)
 	write_outputfile(all_genes, args.output_directory)
 
-
 	logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2)))
 
 	for handler in logger.handlers: