Skip to content

Commit

Permalink
adding transcript name to the output file and do not chose the best t…
Browse files Browse the repository at this point in the history
…ranscript, just save everything
  • Loading branch information
anastasiia committed Nov 26, 2018
1 parent 50efe41 commit 3549b97
Showing 1 changed file with 46 additions and 12 deletions.
58 changes: 46 additions & 12 deletions find_exons.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,10 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
for gene_name in genes_of_interest:

#first look for the genes in the gtf_genes file
small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes + " | grep -v \"pseudogene\"")#find the first line of the input file
small_gtf_genes = subprocess.getoutput(grep + gene_name + " " + gtf_genes + " | grep \"protein_coding\"") #" | grep -v \"pseudogene\"")#find the first line of the input file

if len(small_gtf_genes) != 0:
logger.info("working with the gene " + gene_name)
gene_groups[gene_name] = []

for genes_line in re.split(r'\n', small_gtf_genes):
Expand All @@ -146,7 +147,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
gene_groups[gene_name].append(gene_full_name)

#find the nubmer and the length of all exons for each gene of interest
small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons + " | grep -v \"pseudogene\"")
small_gtf_exons = subprocess.getoutput(grep + gene_name + " " + gtf_exons + " | grep \"protein_coding\"")

for exons_line in re.split(r'\n', small_gtf_exons):
#print(exons_line)
Expand All @@ -167,9 +168,11 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
tsl = 6
else:
tsl = int(tsl)
elif attribut.startswith("transcript_name"):
transcript_name = re.split(r'\s', attribut)[1].replace('"', '')

if gene_full_name in all_genes.keys():
all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end])
all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name])

else:
logger.info("The gene " + gene_name + " was not found in the .gtf file with genes")
Expand Down Expand Up @@ -234,6 +237,9 @@ def plot_and_output(all_genes, output_directory, gene_groups):
exons_lengths = []
scores = []

#TRY
genes_with_transcript_names = {}

#find the sum length of exons for each gene
for gene in all_genes:
"""
Expand All @@ -242,23 +248,26 @@ def plot_and_output(all_genes, output_directory, gene_groups):
#then sort after exon number and transcription support level
sorted_exons = sorted(exons_sorted_after_length, key = lambda x: (x[1], x[2]))
"""
"""
#TRY sort after the start and and positions
sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[3], x[4]))

#print(sorted_exons)
"""
#TRY sort after transcript name and then after the exon number, and then after transcription support level
sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[5], x[1], x[2]))

check_exon = 1
sum_length = 0
last_exon = sorted_exons[-1][1]

"""
last_exon = sorted_exons[-1][1]
while check_exon <= last_exon:
if sorted_exons[0][1] == check_exon: #check the current exon_number
sum_length = sum_length + sorted_exons[0][0]
sorted_exons = [x for x in sorted_exons if x[1] != check_exon] #cut the current sorted_exons array so that there are no occurences of the current exon there
check_exon = check_exon + 1
"""

"""
#TRY i am trying to work with the positions of exons to reduce overlaps
start_pos = int(sorted_exons[0][3]) #save the start position of the first exon
end_pos = int(sorted_exons[0][4]) #save the end position of the first exon
Expand Down Expand Up @@ -294,11 +303,28 @@ def plot_and_output(all_genes, output_directory, gene_groups):
# print(start_pos)
# print(end_pos)
# print()
"""
transcript_name = sorted_exons[0][5] #save the transcript name of the first exon
sum_length = sorted_exons[0][0] #save the length of the best first exon
#genes_with_transcript_names[transcript_name] = genes_with_transcript_names.get(transcript_name, {})

for i in range(1, len(sorted_exons)):
if sorted_exons[i][5] != transcript_name or i == len(sorted_exons) - 1: #this is a new transcript or the last one
#save the previous transcript_name
score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)
genes_with_transcript_names[transcript_name] = genes_with_transcript_names.get(transcript_name, {})
genes_with_transcript_names[transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score}
transcript_name = sorted_exons[i][5]
sum_length = sorted_exons[i][0]
else: #this is still the same transcript name
#save the length of this exon
sum_length = sum_length + sorted_exons[i][0]

score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)


"""
all_genes[gene]['sum_length'] = sum_length
all_genes[gene]['score'] = score
"""

"""
#for test purposes
Expand All @@ -308,7 +334,7 @@ def plot_and_output(all_genes, output_directory, gene_groups):
scores.append(score)
"""

#"""
"""
logger.info('extracting the best genes')
#print(gene_names)
Expand Down Expand Up @@ -341,17 +367,25 @@ def plot_and_output(all_genes, output_directory, gene_groups):
scores.append(all_genes[gene_to_save]['score'])
#print("i am saving this gene ", all_genes[gene_to_save])
#"""
"""
#sorted_dict = sorted(dict_motifs_p_values.items(), key = lambda x : (x[1]['adjusted_p_value']), reverse = False)

genes_with_transcript_names = sorted(genes_with_transcript_names.items(), key = lambda x: (x[1]['gene'], x[0]))

logger.info("writing the output file")

#now write an output file containing names of genes, their length, the sum length of exons and the prozent of exons in the gene length as score
output_file = open(os.path.join(output_directory, "genes_exons_correlation.txt"), 'w')
header = ["#gene_name", "exons_len_percentage", "gene_length", "exons_sum_length"]
header = ["#gene_name", "transcript_name", "exons_len_percentage", "gene_length", "exons_sum_length"]
output_file.write('\t'.join(header) + '\n') #write the header

"""
for i in range(len(gene_names)):
output_file.write('\t'.join([gene_names[i], str(scores[i]), str(gene_lengths[i]), str(exons_lengths[i])]) + '\n')
"""

for transcript in genes_with_transcript_names:
output_file.write('\t'.join([transcript[1]['gene'], transcript[0], str(transcript[1]['score']),str(transcript[1]['gene_length']), str(transcript[1]['sum_length'])]) + '\n')

output_file.close()

Expand Down

0 comments on commit 3549b97

Please sign in to comment.