Skip to content

Commit

Permalink
excluding pseudogenes while searching and saving only the best gene t…
Browse files Browse the repository at this point in the history
…o the output file
  • Loading branch information
anastasiia committed Nov 26, 2018
1 parent 8a3ab2e commit 0b7318e
Showing 1 changed file with 82 additions and 15 deletions.
97 changes: 82 additions & 15 deletions find_exons.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,11 +122,12 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
all_genes = {}

for gene_name in genes_of_interest:
gene_groups[gene_name] = []

#first look for the genes in the gtf_genes file
small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes)#find the first line of the input file
small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes + " | grep -v \"pseudogene\"")#find the first line of the input file

if len(small_gtf_genes) != 0:
gene_groups[gene_name] = []

for genes_line in re.split(r'\n', small_gtf_genes):
genes_array = re.split(r'\t', genes_line)
Expand All @@ -145,7 +146,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
gene_groups[gene_name].append(gene_full_name)

#find the nubmer and the length of all exons for each gene of interest
small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons)
small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons + " | grep -v \"pseudogene\"")

for exons_line in re.split(r'\n', small_gtf_exons):
#print(exons_line)
Expand All @@ -168,7 +169,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
tsl = int(tsl)

if gene_full_name in all_genes.keys():
all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl])
all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end])

else:
logger.info("The gene " + gene_name + " was not found in the .gtf file with genes")
Expand Down Expand Up @@ -235,46 +236,112 @@ def plot_and_output(all_genes, output_directory, gene_groups):

#find the sum length of exons for each gene
for gene in all_genes:
"""
#first sort the exons array
exons_sorted_after_length = sorted(all_genes[gene]['exons'], key=lambda x: x[0], reverse=True)
#then sort after exon number and transcription support level
sorted_exons = sorted(exons_sorted_after_length, key = lambda x: (x[1], x[2]))
"""
#TRY sort after the start and and positions
sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[3], x[4]))

#print(sorted_exons)

check_exon = 1
sum_length = 0
last_exon = sorted_exons[-1][1]

"""
while check_exon <= last_exon:
if sorted_exons[0][1] == check_exon:
if sorted_exons[0][1] == check_exon: #check the current exon_number
sum_length = sum_length + sorted_exons[0][0]
sorted_exons = [x for x in sorted_exons if x[1] != check_exon] #cut the current sorted_exons array so that there are no occurences of the current exon there
check_exon = check_exon + 1
"""

#TRY i am trying to work with the positions of exons to reduce overlaps
start_pos = int(sorted_exons[0][3]) #save the start position of the first exon
end_pos = int(sorted_exons[0][4]) #save the end position of the first exon

#if gene == "AL589843.2":
# print(sorted_exons)
# print(start_pos)
# print(end_pos)
# print()

for i in range(1, len(sorted_exons)): #start with the second exon, as we have already saved the first one
if int(sorted_exons[i][3]) < end_pos: #the current exon is overlapping the previous one
if not int(sorted_exons[i][4]) <= end_pos:
end_pos = int(sorted_exons[i][4]) #the new end position is the one of the current exon
if i == len(sorted_exons) - 1:
#this is the last exon and an overlap, so save it
previous_len = end_pos - start_pos
sum_length = sum_length + previous_len
#if gene == "AL589843.2":
# print("set new end pos ", end_pos)
# print(sum_length)
# print()
#else do nothing
else: #the current exon does not overlap the previous one
#save the length of the previous exon
previous_len = end_pos - start_pos
sum_length = sum_length + previous_len
#and save the new start and end positions to check for overlaps
start_pos = int(sorted_exons[i][3])
end_pos = int(sorted_exons[i][4])
#if gene == "AL589843.2":
# print(sum_length)
# print(start_pos)
# print(end_pos)
# print()

score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)

# TODO make one nice dictionary instead of all these
all_genes[gene]['sum_length'] = sum_length
all_genes[gene]['score'] = score

"""
#for test purposes
gene_names.append(gene)
gene_lengths.append(all_genes[gene]['gene_length'])
exons_lengths.append(sum_length)
scores.append(score)
"""

#"""
logger.info('extracting the best genes')

print(gene_names)
#print(gene_names)

for gene_original in gene_groups.keys():
#look inside the nice dictionary for the gene with the biggest score and if the scores are the same, look for the biggest length
#if we found one gene to save, than make all these arrays with gene_names, gene_lengths and everything
check_score = 0

check_score = 0.0
gene_to_save = ''
for test_gene in gene_groups[gene_original]:
gene_index = gene_names.index(test_gene)

print(gene_original)
#look for this gene group within all gene_names and
print(gene_groups[gene_original])
print(gene_names.index('AL589880.1'))
for test_gene in gene_groups[gene_original]:
#print(all_genes[test_gene])
if float(all_genes[test_gene]['score']) > check_score:
#print("score is bigger ", check_score, all_genes[test_gene]['score'])
check_score = float(all_genes[test_gene]['score'])
gene_to_save = test_gene
elif float(all_genes[test_gene]['score']) == check_score:
#print("score is the same ", check_score, all_genes[test_gene]['score'])
#check for the gene length
if gene_to_save != '' and int(all_genes[test_gene]['gene_length']) > int(all_genes[gene_to_save]['gene_length']):
gene_to_save = test_gene
#else do nothing

if gene_to_save == '': #the score for this gene is 0.0, so no best gene occurence could be found
gene_to_save = gene_original

gene_names.append(gene_to_save)
gene_lengths.append(all_genes[gene_to_save]['gene_length'])
exons_lengths.append(all_genes[gene_to_save]['sum_length'])
scores.append(all_genes[gene_to_save]['score'])

#print("i am saving this gene ", all_genes[gene_to_save])
#"""

logger.info("writing the output file")

Expand Down

0 comments on commit 0b7318e

Please sign in to comment.