deleting unused code and creating an environment for the find_exons

anastasiia · Dec 4, 2018 · 932097f · 932097f
1 parent eb62101
commit 932097f
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 253 deletions.
diff --git a/find_exons.py b/find_exons.py
@@ -13,17 +13,15 @@
 import multiprocessing
 import logging
 import subprocess
-from Bio import SeqIO
-import Bio.SeqIO.FastaIO as bio
 import numpy as np
 from collections import defaultdict
 from scipy import stats
-import pyBigWig
-from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni
+#import pyBigWig
+#from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni
 import matplotlib.pyplot as plt
 import random
 import textwrap
-import seaborn as sns
+#import seaborn as sns
 
 #from gtfparse import read_gtf #makes the logger to look different o_o
 
@@ -55,7 +53,6 @@ def parse_args():
 def check_directory(directory):
 	if not os.path.exists(directory):
 		os.makedirs(directory)
-		#logger.info('a new directory ' + directory + ' was created')
 		print('a new directory ' + directory + ' was created')
 
 def remove_file(file):
@@ -117,8 +114,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 	for gene in genes_of_interest:
 		grep = grep + "-e " + gene + " "
 
-	#finally add tp the grep command so that we only look for protein coding
-
+	#finally add to the grep command so that we only look for protein coding
 	small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"")
 	if len(small_gtf_genes) != 0:
 		#save the genes to the dictionary
@@ -136,15 +132,12 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 			exons = []
 			#all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons}
 			all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons}
-			#gene_groups[gene_name].append(gene_full_name)
 
 		#now save the information about the exons to the diictionary
 		small_gtf_exons = subprocess.getoutput(grep + gtf_exons + "| grep \"protein_coding\"")
 
 		for exons_line in re.split(r'\n', small_gtf_exons):
-			#print(exons_line)
 			genes_array = re.split(r'\t', exons_line)
-			#print(genes_array)
 			exon_start = int(genes_array[3])
 			exon_end = int(genes_array[4])
 			exon_length = exon_end - exon_start
@@ -157,7 +150,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 				elif attribut.startswith("transcript_support_level"):
 					tsl = re.split(r'\s', attribut)[1].replace('"', '') #do not cast on int, because the tsl can also be NA
 					if tsl == "NA":
-						tsl = 6
+						tsl = 6 #"convert" the tsl to int, so that we can easily compare it afterwards
 					else:
 						tsl = int(tsl)
 				elif attribut.startswith("transcript_name"):
@@ -172,80 +165,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 	logger.info(len(small_gtf_genes))
 	logger.info(len(small_gtf_exons))
 	sys.exit()
-	"""
-	gene_groups = {}
-
-	logger.info("looking for genes of interest in gtf files")
-
-	if exact:
-		grep = "grep -w -i "
-	else:
-		grep = "grep -i "
-
-	all_genes = {}
-
-	for gene_name in genes_of_interest:
-		
-		#first look for the genes in the gtf_genes file
-		small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes + " | grep \"protein_coding\"") #" | grep -v \"pseudogene\"")#find the first line of the input file
-
-		if len(small_gtf_genes) != 0:
-			logger.info("working with the gene " + gene_name)
-			gene_groups[gene_name] = []
-
-			for genes_line in re.split(r'\n', small_gtf_genes):
-				genes_array = re.split(r'\t', genes_line)
-				gene_start = int(genes_array[3])
-				gene_end = int(genes_array[4])
-				gene_length = gene_end - gene_start
-				attributes = re.split(r'; ', genes_array[8])
-				for attribut in attributes:
-					if attribut.startswith("gene_name"):
-						gene_full_name = re.split(r'\s', attribut)[1].replace('"', '')
-				
-				all_genes[gene_full_name] = all_genes.get(gene_full_name, {})
-				exons = []
-				#all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons}
-				all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons}
-				gene_groups[gene_name].append(gene_full_name)
-
-			#find the nubmer and the length of all exons for each gene of interest
-			small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons + " | grep \"protein_coding\"")
-			
-			for exons_line in re.split(r'\n', small_gtf_exons):
-				#print(exons_line)
-				genes_array = re.split(r'\t', exons_line)
-				#print(genes_array)
-				exon_start = int(genes_array[3])
-				exon_end = int(genes_array[4])
-				exon_length = exon_end - exon_start
-				attributes = re.split(r'; ', genes_array[8])
-				for attribut in attributes:
-					if attribut.startswith("gene_name"):
-						gene_full_name = re.split(r'\s', attribut)[1].replace('"', '')
-					elif attribut.startswith("exon_number"):
-						exon_number = int(re.split(r'\s', attribut)[1])
-					elif attribut.startswith("transcript_support_level"):
-						tsl = re.split(r'\s', attribut)[1].replace('"', '') #do not cast on int, because the tsl can also be NA
-						if tsl == "NA":
-							tsl = 6
-						else:
-							tsl = int(tsl)
-					elif attribut.startswith("transcript_name"):
-						transcript_name = re.split(r'\s', attribut)[1].replace('"', '')
-
-				if gene_full_name in all_genes.keys():
-					all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name])
-
-		else:
-			logger.info("The gene " + gene_name + " was not found in the .gtf file with genes")
 
-	if len(all_genes) == 0:
-		print("None of the given list of genes was found in the .gtf file")
-		sys.exit()
-	else:
-		return all_genes, gene_groups
-	"""
 	return all_genes
 
 def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):
@@ -269,30 +189,6 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color
 
 	plt.show()
 
-def make_reg_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):
-
-	the_max = max(max(x_array), max(y_array))
-
-	#plt.xlim(0, the_max)
-	#plt.ylim(0, the_max)
-
-	#fig, ax = plt.subplots()
-	#plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5)
-	#plt.plot(x_array, y_array, 'o', color = color_name)
-	sns.set()
-	sns.regplot(np.array(x_array), np.array(y_array))
-	sns.plt.show()
-
-	#plt.xlabel("whole gene length")
-	#plt.ylabel("sum length of exons")
-
-	#for i,j,name in zip(x_array, y_array, gene_names):
-	#	ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small')
-
-	#fig.savefig(os.path.join(output_directory, figure_name))
-
-	#plt.show()
-
 def plot_and_output(all_genes, output_directory):
 
 	logger.info("preparing for plotting")
@@ -302,77 +198,19 @@ def plot_and_output(all_genes, output_directory):
 	exons_lengths = []
 	scores = []
 
-	#TRY
 	genes_with_transcript_names = {}
 
 	#find the sum length of exons for each gene
 	for gene in all_genes:
-		"""
-		#first sort the exons array
-		exons_sorted_after_length = sorted(all_genes[gene]['exons'], key=lambda x: x[0], reverse=True)
-		#then sort after exon number and transcription support level
-		sorted_exons = sorted(exons_sorted_after_length, key = lambda x: (x[1], x[2]))
-		"""
-		"""
-		#TRY sort after the start and and positions
-		sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[3], x[4]))
-		"""
-		#TRY sort after transcript name and then after the exon number, and then after transcription support level
+
 		sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[5], x[1], x[2]))
 
 		check_exon = 1
 		sum_length = 0
 
-		"""
-		last_exon = sorted_exons[-1][1] 
-
-		while check_exon <= last_exon:
-			if sorted_exons[0][1] == check_exon: #check the current exon_number
-				sum_length = sum_length + sorted_exons[0][0]
-				sorted_exons = [x for x in sorted_exons if x[1] != check_exon] #cut the current sorted_exons array so that there are no occurences of the current exon there
-			check_exon = check_exon + 1
-		"""
-		"""
-		#TRY i am trying to work with the positions of exons to reduce overlaps
-		start_pos = int(sorted_exons[0][3]) #save the start position of the first exon
-		end_pos = int(sorted_exons[0][4]) #save the end position of the first exon
-
-		#if gene == "AL589843.2":
-		#	print(sorted_exons)
-		#	print(start_pos)
-		#	print(end_pos)
-		#	print()
-
-		for i in range(1, len(sorted_exons)): #start with the second exon, as we have already saved the first one
-			if int(sorted_exons[i][3]) < end_pos: #the current exon is overlapping the previous one
-				if not int(sorted_exons[i][4]) <= end_pos:
-					end_pos = int(sorted_exons[i][4]) #the new end position is the one of the current exon
-					if i == len(sorted_exons) - 1:
-						#this is the last exon and an overlap, so save it
-						previous_len = end_pos - start_pos
-						sum_length = sum_length + previous_len
-						#if gene == "AL589843.2":
-						#	print("set new end pos ", end_pos)
-						#	print(sum_length)
-						#	print()
-				#else do nothing
-			else: #the current exon does not overlap the previous one
-				#save the length of the previous exon
-				previous_len = end_pos - start_pos
-				sum_length = sum_length + previous_len
-				#and save the new start and end positions to check for overlaps
-				start_pos = int(sorted_exons[i][3])
-				end_pos = int(sorted_exons[i][4])
-				#if gene == "AL589843.2":
-				#	print(sum_length)
-				#	print(start_pos)
-				#	print(end_pos)
-				#	print()
-		"""
 		transcript_name = sorted_exons[0][5] #save the transcript name of the first exon
 		sum_length = sorted_exons[0][0] #save the length of the best first exon
-		#genes_with_transcript_names[transcript_name] = genes_with_transcript_names.get(transcript_name, {})
-
+
 		for i in range(1, len(sorted_exons)):
 			if sorted_exons[i][5] != transcript_name or i == len(sorted_exons) - 1: #this is a new transcript or the last one
 				#save the previous transcript_name
@@ -386,56 +224,6 @@ def plot_and_output(all_genes, output_directory):
 				#save the length of this exon
 				sum_length = sum_length + sorted_exons[i][0]
 
-
-		"""
-		all_genes[gene]['sum_length'] = sum_length
-		all_genes[gene]['score'] = score
-		"""
-
-		"""
-		#for test purposes
-		gene_names.append(gene)
-		gene_lengths.append(all_genes[gene]['gene_length'])
-		exons_lengths.append(sum_length)
-		scores.append(score)
-		"""
-
-	"""
-	logger.info('extracting the best genes')
-
-	#print(gene_names)
-
-	for gene_original in gene_groups.keys():
-		#look inside the nice dictionary for the gene with the biggest score and if the scores are the same, look for the biggest length
-		#if we found one gene to save, than make all these arrays with gene_names, gene_lengths and everything
-		check_score = 0.0
-		gene_to_save = ''
-
-		for test_gene in gene_groups[gene_original]:
-			#print(all_genes[test_gene])
-			if float(all_genes[test_gene]['score']) > check_score:
-				#print("score is bigger ", check_score, all_genes[test_gene]['score'])
-				check_score = float(all_genes[test_gene]['score'])
-				gene_to_save = test_gene
-			elif float(all_genes[test_gene]['score']) == check_score:
-				#print("score is the same ", check_score, all_genes[test_gene]['score'])
-				#check for the gene length
-				if gene_to_save != '' and int(all_genes[test_gene]['gene_length']) > int(all_genes[gene_to_save]['gene_length']):
-					gene_to_save = test_gene
-			#else do nothing
-
-		if gene_to_save == '': #the score for this gene is 0.0, so no best gene occurence could be found
-			gene_to_save = gene_original
-
-		gene_names.append(gene_to_save)
-		gene_lengths.append(all_genes[gene_to_save]['gene_length'])
-		exons_lengths.append(all_genes[gene_to_save]['sum_length'])
-		scores.append(all_genes[gene_to_save]['score'])
-	
-		#print("i am saving this gene ", all_genes[gene_to_save])
-	"""
-	#sorted_dict = sorted(dict_motifs_p_values.items(), key = lambda x : (x[1]['adjusted_p_value']), reverse = False)
-
 	genes_with_transcript_names = sorted(genes_with_transcript_names.items(), key = lambda x: (x[1]['gene'], x[0]))
 
 	logger.info("writing the output file")
@@ -445,19 +233,11 @@ def plot_and_output(all_genes, output_directory):
 	header = ["#gene_name", "transcript_name", "exons_len_percentage", "gene_length", "exons_sum_length"]
 	output_file.write('\t'.join(header) + '\n') #write the header
 
-	"""
-	for i in range(len(gene_names)):
-		output_file.write('\t'.join([gene_names[i], str(scores[i]), str(gene_lengths[i]), str(exons_lengths[i])]) + '\n')
-	"""
-
 	for transcript in genes_with_transcript_names:
 		output_file.write('\t'.join([transcript[1]['gene'], transcript[0].replace(transcript[1]['gene'] + '_', ''), str(transcript[1]['score']),str(transcript[1]['gene_length']), str(transcript[1]['sum_length'])]) + '\n')
 
 	output_file.close()
 
-	#logger.info("making a plot")
-	#make_plot(gene_lengths, exons_lengths, gene_names, output_directory, "genes_exons_correlation.png", 'gold')
-
 def main():
 
 	start = time.time()

diff --git a/find_exons.yaml b/find_exons.yaml
@@ -0,0 +1,13 @@
+name: find_exons
+
+channels:
+  - bioconda
+  - conda-forge
+
+dependencies:
+  - python
+  - snakemake
+  - biopython
+  - matplotlib
+  - scipy
+  - statsmodels
diff --git a/visualize.py b/visualize.py
@@ -16,11 +16,10 @@
 import numpy as np
 from collections import defaultdict
 from scipy import stats
-from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni
+#from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni
 import matplotlib.pyplot as plt
 import random
 import textwrap
-import seaborn as sns
 
 #from gtfparse import read_gtf #makes the logger to look different o_o
 
@@ -135,30 +134,6 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color
 
 	#plt.show()
 
-def make_reg_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):
-
-	the_max = max(max(x_array), max(y_array))
-
-	#plt.xlim(0, the_max)
-	#plt.ylim(0, the_max)
-
-	#fig, ax = plt.subplots()
-	#plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5)
-	#plt.plot(x_array, y_array, 'o', color = color_name)
-	sns.set()
-	sns.regplot(np.array(x_array), np.array(y_array))
-	sns.plt.show()
-
-	#plt.xlabel("whole gene length")
-	#plt.ylabel("sum length of exons")
-
-	#for i,j,name in zip(x_array, y_array, gene_names):
-	#	ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small')
-
-	#fig.savefig(os.path.join(output_directory, figure_name))
-
-	#plt.show()
-
 def read_correlation(correlation_file, genes_of_interest, variable):
 	#read the text file and make a dictionary containing genes with array of their scores