Skip to content

Commit

Permalink
changing the idea on how to check for the best gene
Browse files Browse the repository at this point in the history
  • Loading branch information
anastasiia committed Nov 23, 2018
1 parent 4a31126 commit 8a3ab2e
Showing 1 changed file with 62 additions and 9 deletions.
71 changes: 62 additions & 9 deletions find_exons.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import matplotlib.pyplot as plt
import random
import textwrap
import seaborn as sns

#from gtfparse import read_gtf #makes the logger to look different o_o

Expand Down Expand Up @@ -109,18 +110,21 @@ def check_existing_input_files(args):

def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):

gene_groups = {}

logger.info("looking for genes of interest in gtf files")

if exact:
grep = "grep -w "
grep = "grep -w -i "
else:
grep = "grep "
grep = "grep -i "

all_genes = {}

for gene_name in genes_of_interest:
gene_groups[gene_name] = []
#first look for the genes in the gtf_genes file
small_gtf_genes = subprocess.getoutput(grep + gtf_genes + " -e \"" + gene_name + "\"")#find the first line of the input file
small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes)#find the first line of the input file

if len(small_gtf_genes) != 0:

Expand All @@ -138,12 +142,15 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
exons = []
#all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons}
all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons}
gene_groups[gene_name].append(gene_full_name)

#find the nubmer and the length of all exons for each gene of interest
small_gtf_exons = subprocess.getoutput("grep " + gtf_exons + " -e \"" + gene_name + "\"")
small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons)

for exons_line in re.split(r'\n', small_gtf_exons):
#print(exons_line)
genes_array = re.split(r'\t', exons_line)
#print(genes_array)
exon_start = int(genes_array[3])
exon_end = int(genes_array[4])
exon_length = exon_end - exon_start
Expand All @@ -170,7 +177,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
print("None of the given list of genes was found in the .gtf file")
sys.exit()
else:
return all_genes
return all_genes, gene_groups

def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):

Expand All @@ -193,7 +200,31 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color

plt.show()

def plot_and_output(all_genes, output_directory):
def make_reg_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):

the_max = max(max(x_array), max(y_array))

#plt.xlim(0, the_max)
#plt.ylim(0, the_max)

#fig, ax = plt.subplots()
#plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5)
#plt.plot(x_array, y_array, 'o', color = color_name)
sns.set()
sns.regplot(np.array(x_array), np.array(y_array))
sns.plt.show()

#plt.xlabel("whole gene length")
#plt.ylabel("sum length of exons")

#for i,j,name in zip(x_array, y_array, gene_names):
# ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small')

#fig.savefig(os.path.join(output_directory, figure_name))

#plt.show()

def plot_and_output(all_genes, output_directory, gene_groups):

logger.info("preparing for plotting")

Expand All @@ -206,6 +237,7 @@ def plot_and_output(all_genes, output_directory):
for gene in all_genes:
#first sort the exons array
exons_sorted_after_length = sorted(all_genes[gene]['exons'], key=lambda x: x[0], reverse=True)
#then sort after exon number and transcription support level
sorted_exons = sorted(exons_sorted_after_length, key = lambda x: (x[1], x[2]))

check_exon = 1
Expand All @@ -220,11 +252,30 @@ def plot_and_output(all_genes, output_directory):

score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)

# TODO make one nice dictionary instead of all these
gene_names.append(gene)
gene_lengths.append(all_genes[gene]['gene_length'])
exons_lengths.append(sum_length)
scores.append(score)

logger.info('extracting the best genes')

print(gene_names)

for gene_original in gene_groups.keys():
#look inside the nice dictionary for the gene with the biggest score and if the scores are the same, look for the biggest length
#if we found one gene to save, than make all these arrays with gene_names, gene_lengths and everything
check_score = 0

gene_to_save = ''
for test_gene in gene_groups[gene_original]:
gene_index = gene_names.index(test_gene)

print(gene_original)
#look for this gene group within all gene_names and
print(gene_groups[gene_original])
print(gene_names.index('AL589880.1'))

logger.info("writing the output file")

#now write an output file containing names of genes, their length, the sum length of exons and the prozent of exons in the gene length as score
Expand Down Expand Up @@ -268,10 +319,12 @@ def main():
logger.info("find_exons.py was called using these parameters:")
logger.info(vars(args))

all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact)
plot_and_output(all_genes, args.output_directory)

logger.info("find_exons needed %s seconds to generate the output" % (time.time() - start))
all_genes, gene_groups = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact)
plot_and_output(all_genes, args.output_directory, gene_groups)


logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2)))

for handler in logger.handlers:
handler.close()
Expand Down

0 comments on commit 8a3ab2e

Please sign in to comment.