From 1c199b0f806a1e4601f19fcd872d297cf90d3fe4 Mon Sep 17 00:00:00 2001 From: anastasiia Date: Mon, 10 Dec 2018 15:26:45 +0100 Subject: [PATCH] adding the mean_tsl to the output file --- find_exons.py | 62 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/find_exons.py b/find_exons.py index a36e9fd..297607d 100644 --- a/find_exons.py +++ b/find_exons.py @@ -105,7 +105,7 @@ def check_existing_input_files(args): return genes_of_interest def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): - logger.info("working with the gtf files") + logger.info("working with gtf files") all_genes = {} @@ -115,10 +115,10 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): for gene in genes_of_interest: grep = grep + "-e " + gene + ". " #. makes grep to look for genes starting with the name from the list of genes of interest - print(grep) + print(grep + "gtf_genes" + "| grep \"protein_coding\" | grep \"transcript_type \\\"protein_coding\\\"\"") #finally add to the grep command so that we only look for protein coding - small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"") + small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"") #| grep \"transcript_type \"protein_coding\"\"" i dont know how to handle it :( if len(small_gtf_genes) != 0: #save the genes to the dictionary for genes_line in re.split(r'\n', small_gtf_genes): @@ -160,16 +160,16 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): transcript_name = re.split(r'\s', attribut)[1].replace('"', '') if gene_full_name in all_genes.keys(): - print(gene_full_name) + #print(gene_full_name) #print(all_genes[gene_full_name]) all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name]) else: logger.info("none of the requested genes were found in the .gtf file") sys.exit() - logger.info(len(re.split(r'\n', small_gtf_genes))) - logger.info(len(re.split(r'\n', small_gtf_exons))) - logger.info(len(all_genes)) + #logger.info(len(re.split(r'\n', small_gtf_genes))) + #logger.info(len(re.split(r'\n', small_gtf_exons))) + #logger.info(len(all_genes)) #sys.exit() return all_genes @@ -195,7 +195,7 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color plt.show() -def plot_and_output(all_genes, output_directory): +def write_outputfile(all_genes, output_directory): #logger.info("preparing for plotting") @@ -214,22 +214,50 @@ def plot_and_output(all_genes, output_directory): check_exon = 1 sum_length = 0 - print(sorted_exons[0]) + tsls = [] + transcript_name = sorted_exons[0][5] #save the transcript name of the first exon sum_length = sorted_exons[0][0] #save the length of the best first exon + tsls.append(sorted_exons[0][2]) #transcript support level of the first exon for i in range(1, len(sorted_exons)): - if sorted_exons[i][5] != transcript_name or i == len(sorted_exons) - 1: #this is a new transcript or the last one + if sorted_exons[i][5] != transcript_name: #this is a new transcript #save the previous transcript_name - score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) + score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) #which is actually the percentage of the sum length of the exons and the whole length of the gene + + mean_tsl = "%.2f" % round(np.mean(tsls), 2) #find the average tsl for this transcript + if mean_tsl > 5: + mean_tsl = "NA" + + exons_number = len(tsls) + gene_transcript_name = gene + "_" + transcript_name genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {}) - genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score} #anzahl von exons + genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number} #anzahl von exons + + tsls = [] + tsls.append(sorted_exons[i][2]) transcript_name = sorted_exons[i][5] sum_length = sorted_exons[i][0] - else: #this is still the same transcript name - #save the length of this exon + + elif i == len(sorted_exons) - 1: #this is the last transcript sum_length = sum_length + sorted_exons[i][0] + score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) + + mean_tsl = "%.2f" % round(np.mean(tsls), 2) #find the average tsl for this transcript + if mean_tsl > 5: + mean_tsl = "NA" + + exons_number = len(tsls) + + gene_transcript_name = gene + "_" + transcript_name + genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {}) + genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number} + + else: #this is still the same transcript name + tsls.append(sorted_exons[i][2]) #save the tsl to find the average tsl later on + sum_length = sum_length + sorted_exons[i][0] #save the length of this exon + #else do nothing, there are no exons found corresponding to this gene genes_with_transcript_names = sorted(genes_with_transcript_names.items(), key = lambda x: (x[1]['gene'], x[0])) @@ -238,11 +266,11 @@ def plot_and_output(all_genes, output_directory): #now write an output file containing names of genes, their length, the sum length of exons and the prozent of exons in the gene length as score output_file = open(os.path.join(output_directory, "genes_exons_correlation.txt"), 'w') - header = ["#gene_name", "transcript_name", "exons_len_percentage", "gene_length", "exons_sum_length"] + header = ["#gene_name", "transcript_name", "mean_tsl", "exons_len_percentage", "gene_length", "exons_number", "exons_sum_length"] output_file.write('\t'.join(header) + '\n') #write the header for transcript in genes_with_transcript_names: - output_file.write('\t'.join([transcript[1]['gene'], transcript[0].replace(transcript[1]['gene'] + '_', ''), str(transcript[1]['score']),str(transcript[1]['gene_length']), str(transcript[1]['sum_length'])]) + '\n') + output_file.write('\t'.join([transcript[1]['gene'], transcript[0].replace(transcript[1]['gene'] + '_', ''), str(transcript[1]['mean_tsl']), str(transcript[1]['score']), str(transcript[1]['gene_length']), str(transcript[1]['exons_number']), str(transcript[1]['sum_length'])]) + '\n') output_file.close() @@ -276,7 +304,7 @@ def main(): all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact) - plot_and_output(all_genes, args.output_directory) + write_outputfile(all_genes, args.output_directory) logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2)))