From bd040a279b80b1f68e9bfe5401dd2a0889b7cb25 Mon Sep 17 00:00:00 2001 From: anastasiia Date: Thu, 15 Nov 2018 14:13:08 +0100 Subject: [PATCH] writing the output file --- find_exons.py | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/find_exons.py b/find_exons.py index 3aec43f..43874b1 100644 --- a/find_exons.py +++ b/find_exons.py @@ -69,6 +69,8 @@ def get_name_from_path(full_path): def check_existing_input_files(args): + print('checking the input files') + if not os.path.isfile(args.gtf_genes): print('please make sure the .gtf file with genes exists') sys.exit() @@ -106,6 +108,8 @@ def check_existing_input_files(args): def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest): + logger.info("looking for genes of interest in gtf files") + all_genes = {} for gene_name in genes_of_interest: @@ -151,11 +155,6 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest): if gene_full_name in all_genes.keys(): all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl]) - """ - for gene in all_genes: - print(gene, all_genes[gene]) - print() - """ return all_genes def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name): @@ -179,6 +178,13 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color def plot_and_output(all_genes, output_directory): + logger.info("preparing for plotting") + + gene_names = [] + gene_lengths = [] + exons_lengths = [] + scores = [] + #find the sum length of exons for each gene for gene in all_genes: #first sort the exons array @@ -187,17 +193,35 @@ def plot_and_output(all_genes, output_directory): check_exon = 1 sum_length = 0 - last_exon = sorted_exons[-1][1] + last_exon = sorted_exons[-1][1] while check_exon <= last_exon: if sorted_exons[0][1] == check_exon: - print(check_exon) sum_length = sum_length + sorted_exons[0][0] - print(sum_length) - sorted_exons = [x for x in sorted_exons if x[1] != 
check_exon] - print(sorted_exons) + sorted_exons = [x for x in sorted_exons if x[1] != check_exon] #cut the current sorted_exons array so that there are no occurences of the current exon there check_exon = check_exon + 1 + score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) + + gene_names.append(gene) + gene_lengths.append(all_genes[gene]['gene_length']) + exons_lengths.append(sum_length) + scores.append(score) + + logger.info("writing the output file") + + #now write an output file containing names of genes, their length, the sum length of exons and the percentage of exons in the gene length as score + output_file = open(os.path.join(output_directory, "genes_exons_correlation.txt"), 'w') + header = ["gene_name", "exons_len_percentage", "gene_length", "exons_sum_length"] + output_file.write('\t'.join(header) + '\n') #write the header + + for i in range(len(gene_names)): + output_file.write('\t'.join([gene_names[i], str(scores[i]), str(gene_lengths[i]), str(exons_lengths[i])]) + '\n') + + output_file.close() + + + def main():