diff --git a/find_exons.py b/find_exons.py index 6eebaf1..a75ca20 100644 --- a/find_exons.py +++ b/find_exons.py @@ -13,17 +13,15 @@ import multiprocessing import logging import subprocess -from Bio import SeqIO -import Bio.SeqIO.FastaIO as bio import numpy as np from collections import defaultdict from scipy import stats -import pyBigWig -from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni +#import pyBigWig +#from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni import matplotlib.pyplot as plt import random import textwrap -import seaborn as sns +#import seaborn as sns #from gtfparse import read_gtf #makes the logger to look different o_o @@ -55,7 +53,6 @@ def parse_args(): def check_directory(directory): if not os.path.exists(directory): os.makedirs(directory) - #logger.info('a new directory ' + directory + ' was created') print('a new directory ' + directory + ' was created') def remove_file(file): @@ -117,8 +114,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): for gene in genes_of_interest: grep = grep + "-e " + gene + " " - #finally add tp the grep command so that we only look for protein coding - + #finally add to the grep command so that we only look for protein coding small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"") if len(small_gtf_genes) != 0: #save the genes to the dictionary @@ -136,15 +132,12 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): exons = [] #all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons} all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons} - #gene_groups[gene_name].append(gene_full_name) #now save the information about the exons to the diictionary small_gtf_exons = subprocess.getoutput(grep + gtf_exons + "| grep \"protein_coding\"") for exons_line in re.split(r'\n', small_gtf_exons): - #print(exons_line) genes_array = re.split(r'\t', exons_line) - #print(genes_array) exon_start = int(genes_array[3]) exon_end = int(genes_array[4]) exon_length = exon_end - exon_start @@ -157,7 +150,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): elif attribut.startswith("transcript_support_level"): tsl = re.split(r'\s', attribut)[1].replace('"', '') #do not cast on int, because the tsl can also be NA if tsl == "NA": - tsl = 6 + tsl = 6 #"convert" the tsl to int, so that we can easily compare it afterwards else: tsl = int(tsl) elif attribut.startswith("transcript_name"): @@ -172,80 +165,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): logger.info(len(small_gtf_genes)) logger.info(len(small_gtf_exons)) sys.exit() - """ - gene_groups = {} - - logger.info("looking for genes of interest in gtf files") - - if exact: - grep = "grep -w -i " - else: - grep = "grep -i " - - all_genes = {} - - for gene_name in genes_of_interest: - - #first look for the genes in the gtf_genes file - small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes + " | grep \"protein_coding\"") #" | grep -v \"pseudogene\"")#find the first line of the input file - - if len(small_gtf_genes) != 0: - logger.info("working with the gene " + gene_name) - gene_groups[gene_name] = [] - - for genes_line in re.split(r'\n', small_gtf_genes): - genes_array = re.split(r'\t', genes_line) - gene_start = int(genes_array[3]) - gene_end = int(genes_array[4]) - gene_length = gene_end - gene_start - attributes = re.split(r'; ', genes_array[8]) - for attribut in attributes: - if attribut.startswith("gene_name"): - gene_full_name = re.split(r'\s', attribut)[1].replace('"', '') - - all_genes[gene_full_name] = all_genes.get(gene_full_name, {}) - exons = [] - #all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons} - all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons} - gene_groups[gene_name].append(gene_full_name) - - #find the nubmer and the length of all exons for each gene of interest - small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons + " | grep \"protein_coding\"") - - for exons_line in re.split(r'\n', small_gtf_exons): - #print(exons_line) - genes_array = re.split(r'\t', exons_line) - #print(genes_array) - exon_start = int(genes_array[3]) - exon_end = int(genes_array[4]) - exon_length = exon_end - exon_start - attributes = re.split(r'; ', genes_array[8]) - for attribut in attributes: - if attribut.startswith("gene_name"): - gene_full_name = re.split(r'\s', attribut)[1].replace('"', '') - elif attribut.startswith("exon_number"): - exon_number = int(re.split(r'\s', attribut)[1]) - elif attribut.startswith("transcript_support_level"): - tsl = re.split(r'\s', attribut)[1].replace('"', '') #do not cast on int, because the tsl can also be NA - if tsl == "NA": - tsl = 6 - else: - tsl = int(tsl) - elif attribut.startswith("transcript_name"): - transcript_name = re.split(r'\s', attribut)[1].replace('"', '') - - if gene_full_name in all_genes.keys(): - all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name]) - - else: - logger.info("The gene " + gene_name + " was not found in the .gtf file with genes") - if len(all_genes) == 0: - print("None of the given list of genes was found in the .gtf file") - sys.exit() - else: - return all_genes, gene_groups - """ return all_genes def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name): @@ -269,30 +189,6 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color plt.show() -def make_reg_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name): - - the_max = max(max(x_array), max(y_array)) - - #plt.xlim(0, the_max) - #plt.ylim(0, the_max) - - #fig, ax = plt.subplots() - #plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5) - #plt.plot(x_array, y_array, 'o', color = color_name) - sns.set() - sns.regplot(np.array(x_array), np.array(y_array)) - sns.plt.show() - - #plt.xlabel("whole gene length") - #plt.ylabel("sum length of exons") - - #for i,j,name in zip(x_array, y_array, gene_names): - # ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small') - - #fig.savefig(os.path.join(output_directory, figure_name)) - - #plt.show() - def plot_and_output(all_genes, output_directory): logger.info("preparing for plotting") @@ -302,77 +198,19 @@ def plot_and_output(all_genes, output_directory): exons_lengths = [] scores = [] - #TRY genes_with_transcript_names = {} #find the sum length of exons for each gene for gene in all_genes: - """ - #first sort the exons array - exons_sorted_after_length = sorted(all_genes[gene]['exons'], key=lambda x: x[0], reverse=True) - #then sort after exon number and transcription support level - sorted_exons = sorted(exons_sorted_after_length, key = lambda x: (x[1], x[2])) - """ - """ - #TRY sort after the start and and positions - sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[3], x[4])) - """ - #TRY sort after transcript name and then after the exon number, and then after transcription support level + sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[5], x[1], x[2])) check_exon = 1 sum_length = 0 - """ - last_exon = sorted_exons[-1][1] - - while check_exon <= last_exon: - if sorted_exons[0][1] == check_exon: #check the current exon_number - sum_length = sum_length + sorted_exons[0][0] - sorted_exons = [x for x in sorted_exons if x[1] != check_exon] #cut the current sorted_exons array so that there are no occurences of the current exon there - check_exon = check_exon + 1 - """ - """ - #TRY i am trying to work with the positions of exons to reduce overlaps - start_pos = int(sorted_exons[0][3]) #save the start position of the first exon - end_pos = int(sorted_exons[0][4]) #save the end position of the first exon - - #if gene == "AL589843.2": - # print(sorted_exons) - # print(start_pos) - # print(end_pos) - # print() - - for i in range(1, len(sorted_exons)): #start with the second exon, as we have already saved the first one - if int(sorted_exons[i][3]) < end_pos: #the current exon is overlapping the previous one - if not int(sorted_exons[i][4]) <= end_pos: - end_pos = int(sorted_exons[i][4]) #the new end position is the one of the current exon - if i == len(sorted_exons) - 1: - #this is the last exon and an overlap, so save it - previous_len = end_pos - start_pos - sum_length = sum_length + previous_len - #if gene == "AL589843.2": - # print("set new end pos ", end_pos) - # print(sum_length) - # print() - #else do nothing - else: #the current exon does not overlap the previous one - #save the length of the previous exon - previous_len = end_pos - start_pos - sum_length = sum_length + previous_len - #and save the new start and end positions to check for overlaps - start_pos = int(sorted_exons[i][3]) - end_pos = int(sorted_exons[i][4]) - #if gene == "AL589843.2": - # print(sum_length) - # print(start_pos) - # print(end_pos) - # print() - """ transcript_name = sorted_exons[0][5] #save the transcript name of the first exon sum_length = sorted_exons[0][0] #save the length of the best first exon - #genes_with_transcript_names[transcript_name] = genes_with_transcript_names.get(transcript_name, {}) - + for i in range(1, len(sorted_exons)): if sorted_exons[i][5] != transcript_name or i == len(sorted_exons) - 1: #this is a new transcript or the last one #save the previous transcript_name @@ -386,56 +224,6 @@ def plot_and_output(all_genes, output_directory): #save the length of this exon sum_length = sum_length + sorted_exons[i][0] - - """ - all_genes[gene]['sum_length'] = sum_length - all_genes[gene]['score'] = score - """ - - """ - #for test purposes - gene_names.append(gene) - gene_lengths.append(all_genes[gene]['gene_length']) - exons_lengths.append(sum_length) - scores.append(score) - """ - - """ - logger.info('extracting the best genes') - - #print(gene_names) - - for gene_original in gene_groups.keys(): - #look inside the nice dictionary for the gene with the biggest score and if the scores are the same, look for the biggest length - #if we found one gene to save, than make all these arrays with gene_names, gene_lengths and everything - check_score = 0.0 - gene_to_save = '' - - for test_gene in gene_groups[gene_original]: - #print(all_genes[test_gene]) - if float(all_genes[test_gene]['score']) > check_score: - #print("score is bigger ", check_score, all_genes[test_gene]['score']) - check_score = float(all_genes[test_gene]['score']) - gene_to_save = test_gene - elif float(all_genes[test_gene]['score']) == check_score: - #print("score is the same ", check_score, all_genes[test_gene]['score']) - #check for the gene length - if gene_to_save != '' and int(all_genes[test_gene]['gene_length']) > int(all_genes[gene_to_save]['gene_length']): - gene_to_save = test_gene - #else do nothing - - if gene_to_save == '': #the score for this gene is 0.0, so no best gene occurence could be found - gene_to_save = gene_original - - gene_names.append(gene_to_save) - gene_lengths.append(all_genes[gene_to_save]['gene_length']) - exons_lengths.append(all_genes[gene_to_save]['sum_length']) - scores.append(all_genes[gene_to_save]['score']) - - #print("i am saving this gene ", all_genes[gene_to_save]) - """ - #sorted_dict = sorted(dict_motifs_p_values.items(), key = lambda x : (x[1]['adjusted_p_value']), reverse = False) - genes_with_transcript_names = sorted(genes_with_transcript_names.items(), key = lambda x: (x[1]['gene'], x[0])) logger.info("writing the output file") @@ -445,19 +233,11 @@ def plot_and_output(all_genes, output_directory): header = ["#gene_name", "transcript_name", "exons_len_percentage", "gene_length", "exons_sum_length"] output_file.write('\t'.join(header) + '\n') #write the header - """ - for i in range(len(gene_names)): - output_file.write('\t'.join([gene_names[i], str(scores[i]), str(gene_lengths[i]), str(exons_lengths[i])]) + '\n') - """ - for transcript in genes_with_transcript_names: output_file.write('\t'.join([transcript[1]['gene'], transcript[0].replace(transcript[1]['gene'] + '_', ''), str(transcript[1]['score']),str(transcript[1]['gene_length']), str(transcript[1]['sum_length'])]) + '\n') output_file.close() - #logger.info("making a plot") - #make_plot(gene_lengths, exons_lengths, gene_names, output_directory, "genes_exons_correlation.png", 'gold') - def main(): start = time.time() diff --git a/find_exons.yaml b/find_exons.yaml new file mode 100644 index 0000000..be3c8ea --- /dev/null +++ b/find_exons.yaml @@ -0,0 +1,13 @@ +name: find_exons + +channels: + - bioconda + - conda-forge + +dependencies: + - python + - snakemake + - biopython + - matplotlib + - scipy + - statsmodels diff --git a/visualize.py b/visualize.py index ab28a68..036b071 100644 --- a/visualize.py +++ b/visualize.py @@ -16,11 +16,10 @@ import numpy as np from collections import defaultdict from scipy import stats -from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni +#from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni import matplotlib.pyplot as plt import random import textwrap -import seaborn as sns #from gtfparse import read_gtf #makes the logger to look different o_o @@ -135,30 +134,6 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color #plt.show() -def make_reg_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name): - - the_max = max(max(x_array), max(y_array)) - - #plt.xlim(0, the_max) - #plt.ylim(0, the_max) - - #fig, ax = plt.subplots() - #plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5) - #plt.plot(x_array, y_array, 'o', color = color_name) - sns.set() - sns.regplot(np.array(x_array), np.array(y_array)) - sns.plt.show() - - #plt.xlabel("whole gene length") - #plt.ylabel("sum length of exons") - - #for i,j,name in zip(x_array, y_array, gene_names): - # ax.annotate('%s' %name, xy=(i,j), xytext=(1,0), textcoords='offset points', size='xx-small') - - #fig.savefig(os.path.join(output_directory, figure_name)) - - #plt.show() - def read_correlation(correlation_file, genes_of_interest, variable): #read the text file and make a dictionary containing genes with array of their scores