Skip to content

Commit

Permalink
looking for genes of interest in the genes gtf file
Browse files Browse the repository at this point in the history
  • Loading branch information
anastasiia committed Nov 14, 2018
1 parent 67a80fb commit 4e34a88
Showing 1 changed file with 38 additions and 0 deletions.
38 changes: 38 additions & 0 deletions find_exons.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
import random
import textwrap

# NOTE: the gtfparse import is left disabled; per the original author's remark,
# importing it changes how this script's log output looks.
#from gtfparse import read_gtf #makes the logger to look different o_o

# Logger used throughout this script; its threshold is set to INFO.
logger = logging.getLogger('find_exons')
logger.setLevel(logging.INFO)

Expand Down Expand Up @@ -102,6 +104,40 @@ def check_existing_input_files(args):

return args.genes

def procede_gtf(gtf_genes, gtf_exons, genes_of_interest, output_directory):
    """Collect coordinates of the genes of interest from a genes GTF file.

    The file is read once. A record is selected when its ``gene_name`` or
    ``gene_id`` attribute is exactly equal to one of ``genes_of_interest``
    (a plain substring test on the raw line would also select e.g. "TP53BP1"
    when looking for "TP53", and would crash on comment lines that merely
    mention a gene).

    Args:
        gtf_genes: Path to the GTF file holding the gene records.
        gtf_exons: Path to the exons GTF file (not used yet).
        genes_of_interest: Iterable of gene names (or gene ids) to look up.
        output_directory: Output directory (not used yet).

    Returns:
        A dict keyed by gene name (falling back to the gene id when the
        record has no ``gene_name`` attribute). Each value is a dict with the
        keys ``gene_start``, ``gene_end``, ``gene_length`` and ``gene_id``.
        If several records share a key, the last one in the file wins.

    Raises:
        ValueError: If a selected record has non-integer coordinates.
    """
    wanted = set(genes_of_interest)
    all_genes = {}

    with open(gtf_genes) as gtf_genes_file:
        for line in gtf_genes_file:
            # Header/comment lines carry no record.
            if line.startswith('#'):
                continue

            fields = line.rstrip('\n').split('\t')
            # A GTF record has 9 tab-separated columns; ignore anything else.
            if len(fields) < 9:
                continue

            # Column 9 looks like: key "value"; key "value";
            # Splitting on ';' (not '; ') keeps the final attribute clean of
            # its trailing semicolon.
            attributes = {}
            for attribute in fields[8].split(';'):
                key_value = attribute.strip().split(None, 1)
                if len(key_value) == 2:
                    attributes[key_value[0]] = key_value[1].strip().strip('"')

            gene_id = attributes.get('gene_id')
            gene_full_name = attributes.get('gene_name')
            if gene_full_name not in wanted and gene_id not in wanted:
                continue
            if gene_full_name is None:
                gene_full_name = gene_id

            gene_start = int(fields[3])
            gene_end = int(fields[4])
            # NOTE(review): GTF coordinates are 1-based and inclusive, so the
            # number of bases would be end - start + 1. The original
            # end - start is kept here on purpose; confirm what the
            # downstream code expects before changing it.
            gene_length = gene_end - gene_start

            all_genes[gene_full_name] = {
                'gene_start': gene_start,
                'gene_end': gene_end,
                'gene_length': gene_length,
                'gene_id': gene_id,
            }

    for gene in all_genes:
        print(gene)

    return all_genes

def main():

start = time.time()
Expand Down Expand Up @@ -130,6 +166,8 @@ def main():
logger.info("find_exons.py was called using these parameters:")
logger.info(vars(args))

procede_gtf(args.gtf_genes, args.gtf_exons, args.genes, args.output_directory)

logger.info("find_exons needed %s seconds to generate the output" % (time.time() - start))

for handler in logger.handlers:
Expand Down

0 comments on commit 4e34a88

Please sign in to comment.