diff --git a/find_exons.py b/find_exons.py index 44dc5ee..4c5da26 100644 --- a/find_exons.py +++ b/find_exons.py @@ -124,9 +124,10 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): for gene_name in genes_of_interest: #first look for the genes in the gtf_genes file - small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes + " | grep -v \"pseudogene\"")#find the first line of the input file + small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes + " | grep \"protein_coding\"") #" | grep -v \"pseudogene\"")#find the first line of the input file if len(small_gtf_genes) != 0: + logger.info("working with the gene " + gene_name) gene_groups[gene_name] = [] for genes_line in re.split(r'\n', small_gtf_genes): @@ -146,7 +147,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): gene_groups[gene_name].append(gene_full_name) #find the nubmer and the length of all exons for each gene of interest - small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons + " | grep -v \"pseudogene\"") + small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons + " | grep \"protein_coding\"") for exons_line in re.split(r'\n', small_gtf_exons): #print(exons_line) @@ -167,9 +168,11 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): tsl = 6 else: tsl = int(tsl) + elif attribut.startswith("transcript_name"): + transcript_name = re.split(r'\s', attribut)[1].replace('"', '') if gene_full_name in all_genes.keys(): - all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end]) + all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name]) else: logger.info("The gene " + gene_name + " was not found in the .gtf file with genes") @@ -234,6 +237,9 @@ def plot_and_output(all_genes, output_directory, gene_groups): exons_lengths = [] scores = [] + #TRY + genes_with_transcript_names = {} + #find the sum length of exons for each gene for gene in all_genes: """ @@ -242,23 +248,26 @@ def plot_and_output(all_genes, output_directory, gene_groups): #then sort after exon number and transcription support level sorted_exons = sorted(exons_sorted_after_length, key = lambda x: (x[1], x[2])) """ + """ #TRY sort after the start and and positions sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[3], x[4])) - - #print(sorted_exons) + """ + #TRY sort after transcript name and then after the exon number, and then after transcription support level + sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[5], x[1], x[2])) check_exon = 1 sum_length = 0 - last_exon = sorted_exons[-1][1] """ + last_exon = sorted_exons[-1][1] + while check_exon <= last_exon: if sorted_exons[0][1] == check_exon: #check the current exon_number sum_length = sum_length + sorted_exons[0][0] sorted_exons = [x for x in sorted_exons if x[1] != check_exon] #cut the current sorted_exons array so that there are no occurences of the current exon there check_exon = check_exon + 1 """ - + """ #TRY i am trying to work with the positions of exons to reduce overlaps start_pos = int(sorted_exons[0][3]) #save the start position of the first exon end_pos = int(sorted_exons[0][4]) #save the end position of the first exon @@ -294,11 +303,28 @@ def plot_and_output(all_genes, output_directory, gene_groups): # print(start_pos) # print(end_pos) # print() + """ + transcript_name = sorted_exons[0][5] #save the transcript name of the first exon + sum_length = sorted_exons[0][0] #save the length of the best first exon + #genes_with_transcript_names[transcript_name] = genes_with_transcript_names.get(transcript_name, {}) + + for i in range(1, len(sorted_exons)): + if sorted_exons[i][5] != transcript_name or i == len(sorted_exons) - 1: #this is a new transcript or the last one + #save the previous transcript_name + score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) + genes_with_transcript_names[transcript_name] = genes_with_transcript_names.get(transcript_name, {}) + genes_with_transcript_names[transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score} + transcript_name = sorted_exons[i][5] + sum_length = sorted_exons[i][0] + else: #this is still the same transcript name + #save the length of this exon + sum_length = sum_length + sorted_exons[i][0] - score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) - + + """ all_genes[gene]['sum_length'] = sum_length all_genes[gene]['score'] = score + """ """ #for test purposes @@ -308,7 +334,7 @@ def plot_and_output(all_genes, output_directory, gene_groups): scores.append(score) """ - #""" + """ logger.info('extracting the best genes') #print(gene_names) @@ -341,17 +367,25 @@ def plot_and_output(all_genes, output_directory, gene_groups): scores.append(all_genes[gene_to_save]['score']) #print("i am saving this gene ", all_genes[gene_to_save]) - #""" + """ + #sorted_dict = sorted(dict_motifs_p_values.items(), key = lambda x : (x[1]['adjusted_p_value']), reverse = False) + + genes_with_transcript_names = sorted(genes_with_transcript_names.items(), key = lambda x: (x[1]['gene'], x[0])) logger.info("writing the output file") #now write an output file containing names of genes, their length, the sum length of exons and the prozent of exons in the gene length as score output_file = open(os.path.join(output_directory, "genes_exons_correlation.txt"), 'w') - header = ["#gene_name", "exons_len_percentage", "gene_length", "exons_sum_length"] + header = ["#gene_name", "transcript_name", "exons_len_percentage", "gene_length", "exons_sum_length"] output_file.write('\t'.join(header) + '\n') #write the header + """ for i in range(len(gene_names)): output_file.write('\t'.join([gene_names[i], str(scores[i]), str(gene_lengths[i]), str(exons_lengths[i])]) + '\n') + """ + + for transcript in genes_with_transcript_names: + output_file.write('\t'.join([transcript[1]['gene'], transcript[0], str(transcript[1]['score']),str(transcript[1]['gene_length']), str(transcript[1]['sum_length'])]) + '\n') output_file.close()