Skip to content

Commit

Permalink
the plotting is done
Browse files Browse the repository at this point in the history
  • Loading branch information
anastasiia committed Nov 15, 2018
1 parent bd040a2 commit 4a31126
Showing 1 changed file with 65 additions and 48 deletions.
113 changes: 65 additions & 48 deletions find_exons.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def parse_args():
parser.add_argument('--genes_file', help='names of genes of interest')
parser.add_argument('--output_directory', default='output', const='output', nargs='?', help='output directory, by default ./output/')
parser.add_argument('--silent', action='store_true', help='while working with data write the information only into ./call_peaks_log.txt')
parser.add_argument('--exact', action='store_true', help='choose this parameter if you want to look for exact gene matches')
args = parser.parse_args()

return args
Expand Down Expand Up @@ -106,64 +107,80 @@ def check_existing_input_files(args):

return args.genes

def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest):
def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):

logger.info("looking for genes of interest in gtf files")

if exact:
grep = "grep -w "
else:
grep = "grep "

all_genes = {}

for gene_name in genes_of_interest:
#first look for the genes in the gtf_genes file
small_gtf_genes = subprocess.getoutput("grep " + gtf_genes + " -e \"" + gene_name + "\"")#find the first line of the input file

for genes_line in re.split(r'\n', small_gtf_genes):
genes_array = re.split(r'\t', genes_line)
gene_start = int(genes_array[3])
gene_end = int(genes_array[4])
gene_length = gene_end - gene_start
attributes = re.split(r'; ', genes_array[8])
for attribut in attributes:
if attribut.startswith("gene_name"):
gene_full_name = re.split(r'\s', attribut)[1].replace('"', '')
small_gtf_genes = subprocess.getoutput(grep + gtf_genes + " -e \"" + gene_name + "\"")#find the first line of the input file

if len(small_gtf_genes) != 0:

for genes_line in re.split(r'\n', small_gtf_genes):
genes_array = re.split(r'\t', genes_line)
gene_start = int(genes_array[3])
gene_end = int(genes_array[4])
gene_length = gene_end - gene_start
attributes = re.split(r'; ', genes_array[8])
for attribut in attributes:
if attribut.startswith("gene_name"):
gene_full_name = re.split(r'\s', attribut)[1].replace('"', '')

all_genes[gene_full_name] = all_genes.get(gene_full_name, {})
exons = []
#all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons}
all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons}

#find the nubmer and the length of all exons for each gene of interest
small_gtf_exons = subprocess.getoutput("grep " + gtf_exons + " -e \"" + gene_name + "\"")

all_genes[gene_full_name] = all_genes.get(gene_full_name, {})
exons = []
#all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons}
all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons}
for exons_line in re.split(r'\n', small_gtf_exons):
genes_array = re.split(r'\t', exons_line)
exon_start = int(genes_array[3])
exon_end = int(genes_array[4])
exon_length = exon_end - exon_start
attributes = re.split(r'; ', genes_array[8])
for attribut in attributes:
if attribut.startswith("gene_name"):
gene_full_name = re.split(r'\s', attribut)[1].replace('"', '')
elif attribut.startswith("exon_number"):
exon_number = int(re.split(r'\s', attribut)[1])
elif attribut.startswith("transcript_support_level"):
tsl = re.split(r'\s', attribut)[1].replace('"', '') #do not cast on int, because the tsl can also be NA
if tsl == "NA":
tsl = 6
else:
tsl = int(tsl)

if gene_full_name in all_genes.keys():
all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl])

#find the nubmer and the length of all exons for each gene of interest
small_gtf_exons = subprocess.getoutput("grep " + gtf_exons + " -e \"" + gene_name + "\"")

for exons_line in re.split(r'\n', small_gtf_exons):
genes_array = re.split(r'\t', exons_line)
exon_start = int(genes_array[3])
exon_end = int(genes_array[4])
exon_length = exon_end - exon_start
attributes = re.split(r'; ', genes_array[8])
for attribut in attributes:
if attribut.startswith("gene_name"):
gene_full_name = re.split(r'\s', attribut)[1].replace('"', '')
elif attribut.startswith("exon_number"):
exon_number = int(re.split(r'\s', attribut)[1])
elif attribut.startswith("transcript_support_level"):
tsl = re.split(r'\s', attribut)[1].replace('"', '') #do not cast on int, because the tsl can also be NA
if tsl == "NA":
tsl = 6
else:
tsl = int(tsl)

if gene_full_name in all_genes.keys():
all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl])

return all_genes
else:
logger.info("The gene " + gene_name + " was not found in the .gtf file with genes")

if len(all_genes) == 0:
print("None of the given list of genes was found in the .gtf file")
sys.exit()
else:
return all_genes

def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):

the_max = max(max(x_array), max(y_array))

#plt.xlim(0,1)
#plt.ylim(0,1)
plt.xlim(0, the_max)
plt.ylim(0, the_max)

fig, ax = plt.subplots()
plt.plot([0,1], '--', color = "black", linewidth = 0.5)
plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5)
plt.plot(x_array, y_array, 'o', color = color_name)

plt.xlabel("whole gene length")
Expand Down Expand Up @@ -209,7 +226,7 @@ def plot_and_output(all_genes, output_directory):
scores.append(score)

logger.info("writing the output file")

#now write an output file containing names of genes, their length, the sum length of exons and the prozent of exons in the gene length as score
output_file = open(os.path.join(output_directory, "genes_exons_correlation.txt"), 'w')
header = ["gene_name", "exons_len_percentage", "gene_length", "exons_sum_length"]
Expand All @@ -220,8 +237,8 @@ def plot_and_output(all_genes, output_directory):

output_file.close()



logger.info("making a plot")
make_plot(gene_lengths, exons_lengths, gene_names, output_directory, "genes_exons_correlation.png", 'gold')

def main():

Expand Down Expand Up @@ -251,7 +268,7 @@ def main():
logger.info("find_exons.py was called using these parameters:")
logger.info(vars(args))

all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes)
all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact)
plot_and_output(all_genes, args.output_directory)

logger.info("find_exons needed %s seconds to generate the output" % (time.time() - start))
Expand Down

0 comments on commit 4a31126

Please sign in to comment.