From 4e34a884c151dd3089e4c0c407c689f0ce1965d4 Mon Sep 17 00:00:00 2001 From: anastasiia Date: Wed, 14 Nov 2018 16:05:01 +0100 Subject: [PATCH] looking for genes of interest in the genes gtf file --- find_exons.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/find_exons.py b/find_exons.py index dd87554..c09bec6 100644 --- a/find_exons.py +++ b/find_exons.py @@ -24,6 +24,8 @@ import random import textwrap +#from gtfparse import read_gtf #makes the logger to look different o_o + logger = logging.getLogger('find_exons') logger.setLevel(logging.INFO) @@ -102,6 +104,40 @@ def check_existing_input_files(args): return args.genes +def procede_gtf(gtf_genes, gtf_exons, genes_of_interest, output_directory): + #df = read_gtf(gtf_genes) + #df_genes = df[df["feature"] == "gene"][df["gene_name"] == genes_of_interest[0]] + #print(df_genes) + #print(df_genes["seqname"], df_genes["source"], df_genes["feature"], df_genes["start"], df_genes["end"]) + + all_genes = {} + + #first look for the genew in the gtf_genes file + for i in range(len(genes_of_interest)): + gene_name = genes_of_interest[i] + with open(gtf_genes) as gtf_genes_file: + for line in gtf_genes_file: + if gene_name in line: + line_array = re.split(r'\t', line.rstrip('\n')) + gene_start = int(line_array[3]) + gene_end = int(line_array[4]) + attributes = re.split(r'; ', line_array[8]) + for attribut in attributes: + if attribut.startswith("gene_id"): + gene_id = re.split(r'\s', attribut)[1].replace('"', '') + elif attribut.startswith("gene_name"): + gene_full_name = re.split(r'\s', attribut)[1].replace('"', '') + gene_length = gene_end - gene_start + + all_genes[gene_full_name] = all_genes.get(gene_full_name, {}) + all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'gene_id': gene_id} + + gtf_genes_file.close() + + #print() + for gene in all_genes: + print(gene) + def main(): start = time.time() @@ -130,6 +166,8 @@ def main(): logger.info("find_exons.py was called using these parameters:") logger.info(vars(args)) + procede_gtf(args.gtf_genes, args.gtf_exons, args.genes, args.output_directory) + logger.info("find_exons needed %s seconds to generate the output" % (time.time() - start)) for handler in logger.handlers: