diff --git a/find_exons.py b/find_exons.py index a086e3e..6fa2a80 100644 --- a/find_exons.py +++ b/find_exons.py @@ -16,8 +16,6 @@ from collections import defaultdict import textwrap -#from gtfparse import read_gtf #makes the logger to look different o_o - logger = logging.getLogger('find_exons') logger.setLevel(logging.INFO) @@ -108,8 +106,6 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): for gene in genes_of_interest: grep = grep + "-e " + gene + ". " #. makes grep to look for genes starting with the name from the list of genes of interest - #print(grep + "gtf_genes" + "| grep \"protein_coding\" | grep \"transcript_type \\\"protein_coding\\\"\"") - #finally add to the grep command so that we only look for protein coding small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"") #extract only the rows containing protein_coding if len(small_gtf_genes) != 0: @@ -193,6 +189,7 @@ def write_outputfile(all_genes, output_directory): #find the sum length of exons for each gene for gene in all_genes: if all_genes[gene]['exons'] != []: + #sort after transcript_name, exon_number and tsl sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[5], x[1], x[2])) check_exon = 1 @@ -201,10 +198,11 @@ def write_outputfile(all_genes, output_directory): tsls = [] transcript_name = sorted_exons[0][5] #save the transcript name of the first exon - sum_length = sorted_exons[0][0] #save the length of the best first exon + sum_length = sorted_exons[0][0] #save the length of the first exon tsls.append(sorted_exons[0][2]) #transcript support level of the first exon for i in range(1, len(sorted_exons)): + if sorted_exons[i][5] != transcript_name: #this is a new transcript #save the previous transcript_name score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) #which is actually the percentage of the sum length of the exons and the whole length of the gene @@ -226,8 +224,9 @@ def write_outputfile(all_genes, output_directory): elif i == len(sorted_exons) - 1: #this is the last transcript sum_length = sum_length + sorted_exons[i][0] - score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) + tsls.append(sorted_exons[i][2]) + score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) mean_tsl = int(np.mean(tsls)) #find the average tsl for this transcript if mean_tsl > 5: mean_tsl = "NA" @@ -289,11 +288,9 @@ def main(): #logger.info("find_exons.py was called using these parameters:") #logger.info(vars(args)) - all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact) write_outputfile(all_genes, args.output_directory) - logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2))) for handler in logger.handlers: