From 4a31126849c8774c4831c77161c8ad994eeedfc4 Mon Sep 17 00:00:00 2001 From: anastasiia Date: Thu, 15 Nov 2018 14:44:38 +0100 Subject: [PATCH] the plotting is done --- find_exons.py | 113 +++++++++++++++++++++++++++++--------------------- 1 file changed, 65 insertions(+), 48 deletions(-) diff --git a/find_exons.py b/find_exons.py index 43874b1..4505180 100644 --- a/find_exons.py +++ b/find_exons.py @@ -47,6 +47,7 @@ def parse_args(): parser.add_argument('--genes_file', help='names of genes of interest') parser.add_argument('--output_directory', default='output', const='output', nargs='?', help='output directory, by default ./output/') parser.add_argument('--silent', action='store_true', help='while working with data write the information only into ./call_peaks_log.txt') + parser.add_argument('--exact', action='store_true', help='choose this parameter if you want to look for exact gene matches') args = parser.parse_args() return args @@ -106,64 +107,80 @@ def check_existing_input_files(args): return args.genes -def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest): +def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): logger.info("looking for genes of interest in gtf files") + if exact: + grep = "grep -w " + else: + grep = "grep " + all_genes = {} for gene_name in genes_of_interest: #first look for the genes in the gtf_genes file - small_gtf_genes = subprocess.getoutput("grep " + gtf_genes + " -e \"" + gene_name + "\"")#find the first line of the input file - - for genes_line in re.split(r'\n', small_gtf_genes): - genes_array = re.split(r'\t', genes_line) - gene_start = int(genes_array[3]) - gene_end = int(genes_array[4]) - gene_length = gene_end - gene_start - attributes = re.split(r'; ', genes_array[8]) - for attribut in attributes: - if attribut.startswith("gene_name"): - gene_full_name = re.split(r'\s', attribut)[1].replace('"', '') + small_gtf_genes = subprocess.getoutput(grep + gtf_genes + " -e \"" + gene_name + "\"")#find the first line of the input file + + if len(small_gtf_genes) != 0: + + for genes_line in re.split(r'\n', small_gtf_genes): + genes_array = re.split(r'\t', genes_line) + gene_start = int(genes_array[3]) + gene_end = int(genes_array[4]) + gene_length = gene_end - gene_start + attributes = re.split(r'; ', genes_array[8]) + for attribut in attributes: + if attribut.startswith("gene_name"): + gene_full_name = re.split(r'\s', attribut)[1].replace('"', '') + + all_genes[gene_full_name] = all_genes.get(gene_full_name, {}) + exons = [] + #all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons} + all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons} + + #find the nubmer and the length of all exons for each gene of interest + small_gtf_exons = subprocess.getoutput("grep " + gtf_exons + " -e \"" + gene_name + "\"") - all_genes[gene_full_name] = all_genes.get(gene_full_name, {}) - exons = [] - #all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons} - all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons} + for exons_line in re.split(r'\n', small_gtf_exons): + genes_array = re.split(r'\t', exons_line) + exon_start = int(genes_array[3]) + exon_end = int(genes_array[4]) + exon_length = exon_end - exon_start + attributes = re.split(r'; ', genes_array[8]) + for attribut in attributes: + if attribut.startswith("gene_name"): + gene_full_name = re.split(r'\s', attribut)[1].replace('"', '') + elif attribut.startswith("exon_number"): + exon_number = int(re.split(r'\s', attribut)[1]) + elif attribut.startswith("transcript_support_level"): + tsl = re.split(r'\s', attribut)[1].replace('"', '') #do not cast on int, because the tsl can also be NA + if tsl == "NA": + tsl = 6 + else: + tsl = int(tsl) + + if gene_full_name in all_genes.keys(): + all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl]) - #find the nubmer and the length of all exons for each gene of interest - small_gtf_exons = subprocess.getoutput("grep " + gtf_exons + " -e \"" + gene_name + "\"") - - for exons_line in re.split(r'\n', small_gtf_exons): - genes_array = re.split(r'\t', exons_line) - exon_start = int(genes_array[3]) - exon_end = int(genes_array[4]) - exon_length = exon_end - exon_start - attributes = re.split(r'; ', genes_array[8]) - for attribut in attributes: - if attribut.startswith("gene_name"): - gene_full_name = re.split(r'\s', attribut)[1].replace('"', '') - elif attribut.startswith("exon_number"): - exon_number = int(re.split(r'\s', attribut)[1]) - elif attribut.startswith("transcript_support_level"): - tsl = re.split(r'\s', attribut)[1].replace('"', '') #do not cast on int, because the tsl can also be NA - if tsl == "NA": - tsl = 6 - else: - tsl = int(tsl) - - if gene_full_name in all_genes.keys(): - all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl]) - - return all_genes + else: + logger.info("The gene " + gene_name + " was not found in the .gtf file with genes") + + if len(all_genes) == 0: + print("None of the given list of genes was found in the .gtf file") + sys.exit() + else: + return all_genes def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name): + + the_max = max(max(x_array), max(y_array)) - #plt.xlim(0,1) - #plt.ylim(0,1) + plt.xlim(0, the_max) + plt.ylim(0, the_max) fig, ax = plt.subplots() - plt.plot([0,1], '--', color = "black", linewidth = 0.5) + plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5) plt.plot(x_array, y_array, 'o', color = color_name) plt.xlabel("whole gene length") @@ -209,7 +226,7 @@ def plot_and_output(all_genes, output_directory): scores.append(score) logger.info("writing the output file") - + #now write an output file containing names of genes, their length, the sum length of exons and the prozent of exons in the gene length as score output_file = open(os.path.join(output_directory, "genes_exons_correlation.txt"), 'w') header = ["gene_name", "exons_len_percentage", "gene_length", "exons_sum_length"] @@ -220,8 +237,8 @@ def plot_and_output(all_genes, output_directory): output_file.close() - - + logger.info("making a plot") + make_plot(gene_lengths, exons_lengths, gene_names, output_directory, "genes_exons_correlation.png", 'gold') def main(): @@ -251,7 +268,7 @@ def main(): logger.info("find_exons.py was called using these parameters:") logger.info(vars(args)) - all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes) + all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact) plot_and_output(all_genes, args.output_directory) logger.info("find_exons needed %s seconds to generate the output" % (time.time() - start))