From 6f21dc312662912cf35ca7d84e81f3c19b30cdaa Mon Sep 17 00:00:00 2001 From: anastasiia Date: Tue, 27 Nov 2018 16:09:02 +0100 Subject: [PATCH] changing the command of grep to call all genes at once with help of grep -e --- find_exons.py | 73 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/find_exons.py b/find_exons.py index 2c33246..6eebaf1 100644 --- a/find_exons.py +++ b/find_exons.py @@ -45,7 +45,6 @@ def parse_args(): required_arguments.add_argument('--genes_of_interest', nargs='*', dest='genes_of_interest', help='a .txt file or a list with genes one is interested in', required=True) #all other arguments are optional - parser.add_argument('--genes_file', help='names of genes of interest') parser.add_argument('--output_directory', default='output', const='output', nargs='?', help='output directory, by default ./output/') parser.add_argument('--silent', action='store_true', help='while working with data write the information only into ./call_peaks_log.txt') parser.add_argument('--exact', action='store_true', help='choose this parameter if you want to look for exact gene matches') @@ -110,6 +109,70 @@ def check_existing_input_files(args): def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): + all_genes = {} + + grep = "grep -i " + + #make a command for grep with or function + for gene in genes_of_interest: + grep = grep + "-e " + gene + " " + + #finally add tp the grep command so that we only look for protein coding + + small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"") + if len(small_gtf_genes) != 0: + #save the genes to the dictionary + for genes_line in re.split(r'\n', small_gtf_genes): + genes_array = re.split(r'\t', genes_line) + gene_start = int(genes_array[3]) + gene_end = int(genes_array[4]) + gene_length = gene_end - gene_start + attributes = re.split(r'; ', genes_array[8]) + for attribut in attributes: + if attribut.startswith("gene_name"): + gene_full_name = re.split(r'\s', attribut)[1].replace('"', '') + + all_genes[gene_full_name] = all_genes.get(gene_full_name, {}) + exons = [] + #all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons} + all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons} + #gene_groups[gene_name].append(gene_full_name) + + #now save the information about the exons to the diictionary + small_gtf_exons = subprocess.getoutput(grep + gtf_exons + "| grep \"protein_coding\"") + + for exons_line in re.split(r'\n', small_gtf_exons): + #print(exons_line) + genes_array = re.split(r'\t', exons_line) + #print(genes_array) + exon_start = int(genes_array[3]) + exon_end = int(genes_array[4]) + exon_length = exon_end - exon_start + attributes = re.split(r'; ', genes_array[8]) + for attribut in attributes: + if attribut.startswith("gene_name"): + gene_full_name = re.split(r'\s', attribut)[1].replace('"', '') + elif attribut.startswith("exon_number"): + exon_number = int(re.split(r'\s', attribut)[1]) + elif attribut.startswith("transcript_support_level"): + tsl = re.split(r'\s', attribut)[1].replace('"', '') #do not cast on int, because the tsl can also be NA + if tsl == "NA": + tsl = 6 + else: + tsl = int(tsl) + elif attribut.startswith("transcript_name"): + transcript_name = re.split(r'\s', attribut)[1].replace('"', '') + + if gene_full_name in all_genes.keys(): + all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name]) + else: + logger.info("none of the requested genes were found in the .gtf file") + sys.exit() + + logger.info(len(small_gtf_genes)) + logger.info(len(small_gtf_exons)) + sys.exit() + """ gene_groups = {} logger.info("looking for genes of interest in gtf files") @@ -182,6 +245,8 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): sys.exit() else: return all_genes, gene_groups + """ + return all_genes def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name): @@ -228,7 +293,7 @@ def make_reg_plot(x_array, y_array, gene_names, output_directory, figure_name, c #plt.show() -def plot_and_output(all_genes, output_directory, gene_groups): +def plot_and_output(all_genes, output_directory): logger.info("preparing for plotting") @@ -422,8 +487,8 @@ def main(): logger.info(vars(args)) - all_genes, gene_groups = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact) - plot_and_output(all_genes, args.output_directory, gene_groups) + all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact) + plot_and_output(all_genes, args.output_directory) logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2)))