diff --git a/find_exons.py b/find_exons.py index a75ca20..a36e9fd 100644 --- a/find_exons.py +++ b/find_exons.py @@ -105,6 +105,7 @@ def check_existing_input_files(args): return genes_of_interest def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): + logger.info("working with the gtf files") all_genes = {} @@ -112,7 +113,9 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): #make a command for grep with or function for gene in genes_of_interest: - grep = grep + "-e " + gene + " " + grep = grep + "-e " + gene + ". " #. makes grep to look for genes starting with the name from the list of genes of interest + + print(grep) #finally add to the grep command so that we only look for protein coding small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"") @@ -134,7 +137,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons} #now save the information about the exons to the diictionary - small_gtf_exons = subprocess.getoutput(grep + gtf_exons + "| grep \"protein_coding\"") + small_gtf_exons = subprocess.getoutput(grep + gtf_exons + "| grep \"protein_coding\" | grep \"transcript_support_level\"") for exons_line in re.split(r'\n', small_gtf_exons): genes_array = re.split(r'\t', exons_line) @@ -157,14 +160,17 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): transcript_name = re.split(r'\s', attribut)[1].replace('"', '') if gene_full_name in all_genes.keys(): + print(gene_full_name) + #print(all_genes[gene_full_name]) all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name]) else: logger.info("none of the requested genes were found in the .gtf file") sys.exit() - logger.info(len(small_gtf_genes)) - logger.info(len(small_gtf_exons)) - sys.exit() + logger.info(len(re.split(r'\n', small_gtf_genes))) + logger.info(len(re.split(r'\n', small_gtf_exons))) + logger.info(len(all_genes)) + #sys.exit() return all_genes @@ -191,7 +197,7 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color def plot_and_output(all_genes, output_directory): - logger.info("preparing for plotting") + #logger.info("preparing for plotting") gene_names = [] gene_lengths = [] @@ -202,27 +208,29 @@ def plot_and_output(all_genes, output_directory): #find the sum length of exons for each gene for gene in all_genes: - - sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[5], x[1], x[2])) - - check_exon = 1 - sum_length = 0 - - transcript_name = sorted_exons[0][5] #save the transcript name of the first exon - sum_length = sorted_exons[0][0] #save the length of the best first exon - - for i in range(1, len(sorted_exons)): - if sorted_exons[i][5] != transcript_name or i == len(sorted_exons) - 1: #this is a new transcript or the last one - #save the previous transcript_name - score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) - gene_transcript_name = gene + "_" + transcript_name - genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {}) - genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score} #anzahl von exons - transcript_name = sorted_exons[i][5] - sum_length = sorted_exons[i][0] - else: #this is still the same transcript name - #save the length of this exon - sum_length = sum_length + sorted_exons[i][0] + if all_genes[gene]['exons'] != []: + sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[5], x[1], x[2])) + + check_exon = 1 + sum_length = 0 + + print(sorted_exons[0]) + transcript_name = sorted_exons[0][5] #save the transcript name of the first exon + sum_length = sorted_exons[0][0] #save the length of the best first exon + + for i in range(1, len(sorted_exons)): + if sorted_exons[i][5] != transcript_name or i == len(sorted_exons) - 1: #this is a new transcript or the last one + #save the previous transcript_name + score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) + gene_transcript_name = gene + "_" + transcript_name + genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {}) + genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score} #anzahl von exons + transcript_name = sorted_exons[i][5] + sum_length = sorted_exons[i][0] + else: #this is still the same transcript name + #save the length of this exon + sum_length = sum_length + sorted_exons[i][0] + #else do nothing, there are no exons found corresponding to this gene genes_with_transcript_names = sorted(genes_with_transcript_names.items(), key = lambda x: (x[1]['gene'], x[0]))