From 0b7318e3a3c404d250a8a3dcad2124dcc33377f0 Mon Sep 17 00:00:00 2001 From: anastasiia Date: Mon, 26 Nov 2018 01:13:51 +0100 Subject: [PATCH] excluding pseudogenes while searching and saving only the best gene to the output file --- find_exons.py | 97 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 82 insertions(+), 15 deletions(-) diff --git a/find_exons.py b/find_exons.py index f9ecd38..265ca3b 100644 --- a/find_exons.py +++ b/find_exons.py @@ -122,11 +122,12 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): all_genes = {} for gene_name in genes_of_interest: - gene_groups[gene_name] = [] + #first look for the genes in the gtf_genes file - small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes)#find the first line of the input file + small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes + " | grep -v \"pseudogene\"")#find the first line of the input file if len(small_gtf_genes) != 0: + gene_groups[gene_name] = [] for genes_line in re.split(r'\n', small_gtf_genes): genes_array = re.split(r'\t', genes_line) @@ -145,7 +146,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): gene_groups[gene_name].append(gene_full_name) #find the nubmer and the length of all exons for each gene of interest - small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons) + small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons + " | grep -v \"pseudogene\"") for exons_line in re.split(r'\n', small_gtf_exons): #print(exons_line) @@ -168,7 +169,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): tsl = int(tsl) if gene_full_name in all_genes.keys(): - all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl]) + all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end]) else: logger.info("The gene " + gene_name + " was not found in the .gtf file with genes") @@ -235,46 +236,112 @@ def plot_and_output(all_genes, output_directory, gene_groups): #find the sum length of exons for each gene for gene in all_genes: + """ #first sort the exons array exons_sorted_after_length = sorted(all_genes[gene]['exons'], key=lambda x: x[0], reverse=True) #then sort after exon number and transcription support level sorted_exons = sorted(exons_sorted_after_length, key = lambda x: (x[1], x[2])) + """ + #TRY sort after the start and and positions + sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[3], x[4])) + + #print(sorted_exons) check_exon = 1 sum_length = 0 last_exon = sorted_exons[-1][1] + """ while check_exon <= last_exon: - if sorted_exons[0][1] == check_exon: + if sorted_exons[0][1] == check_exon: #check the current exon_number sum_length = sum_length + sorted_exons[0][0] sorted_exons = [x for x in sorted_exons if x[1] != check_exon] #cut the current sorted_exons array so that there are no occurences of the current exon there check_exon = check_exon + 1 + """ + + #TRY i am trying to work with the positions of exons to reduce overlaps + start_pos = int(sorted_exons[0][3]) #save the start position of the first exon + end_pos = int(sorted_exons[0][4]) #save the end position of the first exon + + #if gene == "AL589843.2": + # print(sorted_exons) + # print(start_pos) + # print(end_pos) + # print() + + for i in range(1, len(sorted_exons)): #start with the second exon, as we have already saved the first one + if int(sorted_exons[i][3]) < end_pos: #the current exon is overlapping the previous one + if not int(sorted_exons[i][4]) <= end_pos: + end_pos = int(sorted_exons[i][4]) #the new end position is the one of the current exon + if i == len(sorted_exons) - 1: + #this is the last exon and an overlap, so save it + previous_len = end_pos - start_pos + sum_length = sum_length + previous_len + #if gene == "AL589843.2": + # print("set new end pos ", end_pos) + # print(sum_length) + # print() + #else do nothing + else: #the current exon does not overlap the previous one + #save the length of the previous exon + previous_len = end_pos - start_pos + sum_length = sum_length + previous_len + #and save the new start and end positions to check for overlaps + start_pos = int(sorted_exons[i][3]) + end_pos = int(sorted_exons[i][4]) + #if gene == "AL589843.2": + # print(sum_length) + # print(start_pos) + # print(end_pos) + # print() score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) - # TODO make one nice dictionary instead of all these + all_genes[gene]['sum_length'] = sum_length + all_genes[gene]['score'] = score + + """ + #for test purposes gene_names.append(gene) gene_lengths.append(all_genes[gene]['gene_length']) exons_lengths.append(sum_length) scores.append(score) + """ + #""" logger.info('extracting the best genes') - print(gene_names) + #print(gene_names) for gene_original in gene_groups.keys(): #look inside the nice dictionary for the gene with the biggest score and if the scores are the same, look for the biggest length #if we found one gene to save, than make all these arrays with gene_names, gene_lengths and everything - check_score = 0 - + check_score = 0.0 gene_to_save = '' - for test_gene in gene_groups[gene_original]: - gene_index = gene_names.index(test_gene) - print(gene_original) - #look for this gene group within all gene_names and - print(gene_groups[gene_original]) - print(gene_names.index('AL589880.1')) + for test_gene in gene_groups[gene_original]: + #print(all_genes[test_gene]) + if float(all_genes[test_gene]['score']) > check_score: + #print("score is bigger ", check_score, all_genes[test_gene]['score']) + check_score = float(all_genes[test_gene]['score']) + gene_to_save = test_gene + elif float(all_genes[test_gene]['score']) == check_score: + #print("score is the same ", check_score, all_genes[test_gene]['score']) + #check for the gene length + if gene_to_save != '' and int(all_genes[test_gene]['gene_length']) > int(all_genes[gene_to_save]['gene_length']): + gene_to_save = test_gene + #else do nothing + + if gene_to_save == '': #the score for this gene is 0.0, so no best gene occurence could be found + gene_to_save = gene_original + + gene_names.append(gene_to_save) + gene_lengths.append(all_genes[gene_to_save]['gene_length']) + exons_lengths.append(all_genes[gene_to_save]['sum_length']) + scores.append(all_genes[gene_to_save]['score']) + + #print("i am saving this gene ", all_genes[gene_to_save]) + #""" logger.info("writing the output file")