Skip to content

Commit

Permalink
deleting unused code and creating an environment for the find_exons
Browse files Browse the repository at this point in the history
  • Loading branch information
anastasiia committed Dec 4, 2018
1 parent eb62101 commit 932097f
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 253 deletions.
234 changes: 7 additions & 227 deletions find_exons.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,15 @@
import multiprocessing
import logging
import subprocess
from Bio import SeqIO
import Bio.SeqIO.FastaIO as bio
import numpy as np
from collections import defaultdict
from scipy import stats
import pyBigWig
from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni
#import pyBigWig
#from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni
import matplotlib.pyplot as plt
import random
import textwrap
import seaborn as sns
#import seaborn as sns

#from gtfparse import read_gtf #makes the logger to look different o_o

Expand Down Expand Up @@ -55,7 +53,6 @@ def parse_args():
def check_directory(directory):
if not os.path.exists(directory):
os.makedirs(directory)
#logger.info('a new directory ' + directory + ' was created')
print('a new directory ' + directory + ' was created')

def remove_file(file):
Expand Down Expand Up @@ -117,8 +114,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
for gene in genes_of_interest:
grep = grep + "-e " + gene + " "

#finally add tp the grep command so that we only look for protein coding

#finally add to the grep command so that we only look for protein coding
small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"")
if len(small_gtf_genes) != 0:
#save the genes to the dictionary
Expand All @@ -136,15 +132,12 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
exons = []
#all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons}
all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons}
#gene_groups[gene_name].append(gene_full_name)

#now save the information about the exons to the diictionary
small_gtf_exons = subprocess.getoutput(grep + gtf_exons + "| grep \"protein_coding\"")

for exons_line in re.split(r'\n', small_gtf_exons):
#print(exons_line)
genes_array = re.split(r'\t', exons_line)
#print(genes_array)
exon_start = int(genes_array[3])
exon_end = int(genes_array[4])
exon_length = exon_end - exon_start
Expand All @@ -157,7 +150,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
elif attribut.startswith("transcript_support_level"):
tsl = re.split(r'\s', attribut)[1].replace('"', '') #do not cast on int, because the tsl can also be NA
if tsl == "NA":
tsl = 6
tsl = 6 #"convert" the tsl to int, so that we can easily compare it afterwards
else:
tsl = int(tsl)
elif attribut.startswith("transcript_name"):
Expand All @@ -172,80 +165,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
logger.info(len(small_gtf_genes))
logger.info(len(small_gtf_exons))
sys.exit()
"""
gene_groups = {}
logger.info("looking for genes of interest in gtf files")
if exact:
grep = "grep -w -i "
else:
grep = "grep -i "
all_genes = {}
for gene_name in genes_of_interest:
#first look for the genes in the gtf_genes file
small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes + " | grep \"protein_coding\"") #" | grep -v \"pseudogene\"")#find the first line of the input file
if len(small_gtf_genes) != 0:
logger.info("working with the gene " + gene_name)
gene_groups[gene_name] = []
for genes_line in re.split(r'\n', small_gtf_genes):
genes_array = re.split(r'\t', genes_line)
gene_start = int(genes_array[3])
gene_end = int(genes_array[4])
gene_length = gene_end - gene_start
attributes = re.split(r'; ', genes_array[8])
for attribut in attributes:
if attribut.startswith("gene_name"):
gene_full_name = re.split(r'\s', attribut)[1].replace('"', '')
all_genes[gene_full_name] = all_genes.get(gene_full_name, {})
exons = []
#all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons}
all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons}
gene_groups[gene_name].append(gene_full_name)
#find the nubmer and the length of all exons for each gene of interest
small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons + " | grep \"protein_coding\"")
for exons_line in re.split(r'\n', small_gtf_exons):
#print(exons_line)
genes_array = re.split(r'\t', exons_line)
#print(genes_array)
exon_start = int(genes_array[3])
exon_end = int(genes_array[4])
exon_length = exon_end - exon_start
attributes = re.split(r'; ', genes_array[8])
for attribut in attributes:
if attribut.startswith("gene_name"):
gene_full_name = re.split(r'\s', attribut)[1].replace('"', '')
elif attribut.startswith("exon_number"):
exon_number = int(re.split(r'\s', attribut)[1])
elif attribut.startswith("transcript_support_level"):
tsl = re.split(r'\s', attribut)[1].replace('"', '') #do not cast on int, because the tsl can also be NA
if tsl == "NA":
tsl = 6
else:
tsl = int(tsl)
elif attribut.startswith("transcript_name"):
transcript_name = re.split(r'\s', attribut)[1].replace('"', '')
if gene_full_name in all_genes.keys():
all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name])
else:
logger.info("The gene " + gene_name + " was not found in the .gtf file with genes")

if len(all_genes) == 0:
print("None of the given list of genes was found in the .gtf file")
sys.exit()
else:
return all_genes, gene_groups
"""
return all_genes

def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):
Expand All @@ -269,30 +189,6 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color

plt.show()

def make_reg_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):

the_max = max(max(x_array), max(y_array))

#plt.xlim(0, the_max)
#plt.ylim(0, the_max)

#fig, ax = plt.subplots()
#plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5)
#plt.plot(x_array, y_array, 'o', color = color_name)
sns.set()
sns.regplot(np.array(x_array), np.array(y_array))
sns.plt.show()

#plt.xlabel("whole gene length")
#plt.ylabel("sum length of exons")

#for i,j,name in zip(x_array, y_array, gene_names):
# ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small')

#fig.savefig(os.path.join(output_directory, figure_name))

#plt.show()

def plot_and_output(all_genes, output_directory):

logger.info("preparing for plotting")
Expand All @@ -302,77 +198,19 @@ def plot_and_output(all_genes, output_directory):
exons_lengths = []
scores = []

#TRY
genes_with_transcript_names = {}

#find the sum length of exons for each gene
for gene in all_genes:
"""
#first sort the exons array
exons_sorted_after_length = sorted(all_genes[gene]['exons'], key=lambda x: x[0], reverse=True)
#then sort after exon number and transcription support level
sorted_exons = sorted(exons_sorted_after_length, key = lambda x: (x[1], x[2]))
"""
"""
#TRY sort after the start and and positions
sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[3], x[4]))
"""
#TRY sort after transcript name and then after the exon number, and then after transcription support level

sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[5], x[1], x[2]))

check_exon = 1
sum_length = 0

"""
last_exon = sorted_exons[-1][1]
while check_exon <= last_exon:
if sorted_exons[0][1] == check_exon: #check the current exon_number
sum_length = sum_length + sorted_exons[0][0]
sorted_exons = [x for x in sorted_exons if x[1] != check_exon] #cut the current sorted_exons array so that there are no occurences of the current exon there
check_exon = check_exon + 1
"""
"""
#TRY i am trying to work with the positions of exons to reduce overlaps
start_pos = int(sorted_exons[0][3]) #save the start position of the first exon
end_pos = int(sorted_exons[0][4]) #save the end position of the first exon
#if gene == "AL589843.2":
# print(sorted_exons)
# print(start_pos)
# print(end_pos)
# print()
for i in range(1, len(sorted_exons)): #start with the second exon, as we have already saved the first one
if int(sorted_exons[i][3]) < end_pos: #the current exon is overlapping the previous one
if not int(sorted_exons[i][4]) <= end_pos:
end_pos = int(sorted_exons[i][4]) #the new end position is the one of the current exon
if i == len(sorted_exons) - 1:
#this is the last exon and an overlap, so save it
previous_len = end_pos - start_pos
sum_length = sum_length + previous_len
#if gene == "AL589843.2":
# print("set new end pos ", end_pos)
# print(sum_length)
# print()
#else do nothing
else: #the current exon does not overlap the previous one
#save the length of the previous exon
previous_len = end_pos - start_pos
sum_length = sum_length + previous_len
#and save the new start and end positions to check for overlaps
start_pos = int(sorted_exons[i][3])
end_pos = int(sorted_exons[i][4])
#if gene == "AL589843.2":
# print(sum_length)
# print(start_pos)
# print(end_pos)
# print()
"""
transcript_name = sorted_exons[0][5] #save the transcript name of the first exon
sum_length = sorted_exons[0][0] #save the length of the best first exon
#genes_with_transcript_names[transcript_name] = genes_with_transcript_names.get(transcript_name, {})


for i in range(1, len(sorted_exons)):
if sorted_exons[i][5] != transcript_name or i == len(sorted_exons) - 1: #this is a new transcript or the last one
#save the previous transcript_name
Expand All @@ -386,56 +224,6 @@ def plot_and_output(all_genes, output_directory):
#save the length of this exon
sum_length = sum_length + sorted_exons[i][0]


"""
all_genes[gene]['sum_length'] = sum_length
all_genes[gene]['score'] = score
"""

"""
#for test purposes
gene_names.append(gene)
gene_lengths.append(all_genes[gene]['gene_length'])
exons_lengths.append(sum_length)
scores.append(score)
"""

"""
logger.info('extracting the best genes')
#print(gene_names)
for gene_original in gene_groups.keys():
#look inside the nice dictionary for the gene with the biggest score and if the scores are the same, look for the biggest length
#if we found one gene to save, than make all these arrays with gene_names, gene_lengths and everything
check_score = 0.0
gene_to_save = ''
for test_gene in gene_groups[gene_original]:
#print(all_genes[test_gene])
if float(all_genes[test_gene]['score']) > check_score:
#print("score is bigger ", check_score, all_genes[test_gene]['score'])
check_score = float(all_genes[test_gene]['score'])
gene_to_save = test_gene
elif float(all_genes[test_gene]['score']) == check_score:
#print("score is the same ", check_score, all_genes[test_gene]['score'])
#check for the gene length
if gene_to_save != '' and int(all_genes[test_gene]['gene_length']) > int(all_genes[gene_to_save]['gene_length']):
gene_to_save = test_gene
#else do nothing
if gene_to_save == '': #the score for this gene is 0.0, so no best gene occurence could be found
gene_to_save = gene_original
gene_names.append(gene_to_save)
gene_lengths.append(all_genes[gene_to_save]['gene_length'])
exons_lengths.append(all_genes[gene_to_save]['sum_length'])
scores.append(all_genes[gene_to_save]['score'])
#print("i am saving this gene ", all_genes[gene_to_save])
"""
#sorted_dict = sorted(dict_motifs_p_values.items(), key = lambda x : (x[1]['adjusted_p_value']), reverse = False)

genes_with_transcript_names = sorted(genes_with_transcript_names.items(), key = lambda x: (x[1]['gene'], x[0]))

logger.info("writing the output file")
Expand All @@ -445,19 +233,11 @@ def plot_and_output(all_genes, output_directory):
header = ["#gene_name", "transcript_name", "exons_len_percentage", "gene_length", "exons_sum_length"]
output_file.write('\t'.join(header) + '\n') #write the header

"""
for i in range(len(gene_names)):
output_file.write('\t'.join([gene_names[i], str(scores[i]), str(gene_lengths[i]), str(exons_lengths[i])]) + '\n')
"""

for transcript in genes_with_transcript_names:
output_file.write('\t'.join([transcript[1]['gene'], transcript[0].replace(transcript[1]['gene'] + '_', ''), str(transcript[1]['score']),str(transcript[1]['gene_length']), str(transcript[1]['sum_length'])]) + '\n')

output_file.close()

#logger.info("making a plot")
#make_plot(gene_lengths, exons_lengths, gene_names, output_directory, "genes_exons_correlation.png", 'gold')

def main():

start = time.time()
Expand Down
13 changes: 13 additions & 0 deletions find_exons.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
name: find_exons

channels:
- bioconda
- conda-forge

dependencies:
- python
- snakemake
- biopython
- matplotlib
- scipy
- statsmodels
27 changes: 1 addition & 26 deletions visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,10 @@
import numpy as np
from collections import defaultdict
from scipy import stats
from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni
#from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni
import matplotlib.pyplot as plt
import random
import textwrap
import seaborn as sns

#from gtfparse import read_gtf #makes the logger to look different o_o

Expand Down Expand Up @@ -135,30 +134,6 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color

#plt.show()

def make_reg_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):

the_max = max(max(x_array), max(y_array))

#plt.xlim(0, the_max)
#plt.ylim(0, the_max)

#fig, ax = plt.subplots()
#plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5)
#plt.plot(x_array, y_array, 'o', color = color_name)
sns.set()
sns.regplot(np.array(x_array), np.array(y_array))
sns.plt.show()

#plt.xlabel("whole gene length")
#plt.ylabel("sum length of exons")

#for i,j,name in zip(x_array, y_array, gene_names):
# ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small')

#fig.savefig(os.path.join(output_directory, figure_name))

#plt.show()

def read_correlation(correlation_file, genes_of_interest, variable):
#read the text file and make a dictionary containing genes with array of their scores

Expand Down

0 comments on commit 932097f

Please sign in to comment.