Skip to content

Commit

Permalink
the extracting of information from the gtf files is done
Browse files Browse the repository at this point in the history
  • Loading branch information
anastasiia committed Nov 15, 2018
1 parent 92673e2 commit 01dd313
Showing 1 changed file with 45 additions and 37 deletions.
82 changes: 45 additions & 37 deletions find_exons.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,49 +104,56 @@ def check_existing_input_files(args):

return args.genes

def procede_gtf(gtf_genes, gtf_exons, genes_of_interest, output_directory):
#df = read_gtf(gtf_genes)
#df_genes = df[df["feature"] == "gene"][df["gene_name"] == genes_of_interest[0]]
#print(df_genes)
#print(df_genes["seqname"], df_genes["source"], df_genes["feature"], df_genes["start"], df_genes["end"])
def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest):

all_genes = {}

#first look for the genew in the gtf_genes file
for gene_name in genes_of_interest:
print(gene_name)
#first look for the genes in the gtf_genes file
small_gtf_genes = subprocess.getoutput("grep " + gtf_genes + " -e \"" + gene_name + "\"")#find the first line of the input file
#small_gtf_genes_lines = re.split(r'\n', small_gtf_genes)
for genes_line in re.split(r'\n', small_gtf_genes):
print(line)
print()



"""
with open(gtf_genes) as gtf_genes_file:
for line in gtf_genes_file:
if gene_name in line:
line_array = re.split(r'\t', line.rstrip('\n'))
gene_start = int(line_array[3])
gene_end = int(line_array[4])
attributes = re.split(r'; ', line_array[8])
for attribut in attributes:
if attribut.startswith("gene_id"):
gene_id = re.split(r'\s', attribut)[1].replace('"', '')
elif attribut.startswith("gene_name"):
gene_full_name = re.split(r'\s', attribut)[1].replace('"', '')
gene_length = gene_end - gene_start
all_genes[gene_full_name] = all_genes.get(gene_full_name, {})
all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'gene_id': gene_id}

gtf_genes_file.close()
"""
for genes_line in re.split(r'\n', small_gtf_genes):
genes_array = re.split(r'\t', genes_line)
gene_start = int(genes_array[3])
gene_end = int(genes_array[4])
gene_length = gene_end - gene_start
attributes = re.split(r'; ', genes_array[8])
for attribut in attributes:
if attribut.startswith("gene_name"):
gene_full_name = re.split(r'\s', attribut)[1].replace('"', '')

all_genes[gene_full_name] = all_genes.get(gene_full_name, {})
exons = []
all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons}

#find the nubmer and the length of all exons for each gene of interest
small_gtf_exons = subprocess.getoutput("grep " + gtf_exons + " -e \"" + gene_name + "\"")

for exons_line in re.split(r'\n', small_gtf_exons):
genes_array = re.split(r'\t', exons_line)
exon_start = int(genes_array[3])
exon_end = int(genes_array[4])
attributes = re.split(r'; ', genes_array[8])
for attribut in attributes:
if attribut.startswith("gene_name"):
gene_full_name = re.split(r'\s', attribut)[1].replace('"', '')
elif attribut.startswith("exon_number"):
exon_number = re.split(r'\s', attribut)[1]
elif attribut.startswith("transcript_support_level"):
tsl = re.split(r'\s', attribut)[1].replace('"', '') #do not cast on int, because the tsl can also be NA

if gene_full_name in all_genes.keys():
all_genes[gene_full_name]['exons'].append([exon_start, exon_end, exon_number, tsl])

"""
for gene in all_genes:
print(gene, all_genes[gene])
print()
"""
return all_genes

def make_plot(all_genes, output_directory):

#print()
#for gene in all_genes:
# print(gene)

def main():

Expand Down Expand Up @@ -176,7 +183,8 @@ def main():
logger.info("find_exons.py was called using these parameters:")
logger.info(vars(args))

procede_gtf(args.gtf_genes, args.gtf_exons, args.genes, args.output_directory)
all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes)
make_plot(all_genes, args.output_directory)

logger.info("find_exons needed %s seconds to generate the output" % (time.time() - start))

Expand Down

0 comments on commit 01dd313

Please sign in to comment.