Skip to content

Commit

Permalink
adding the mean_tsl to the output file
Browse files Browse the repository at this point in the history
  • Loading branch information
anastasiia committed Dec 10, 2018
1 parent f8b5f5e commit 1c199b0
Showing 1 changed file with 45 additions and 17 deletions.
62 changes: 45 additions & 17 deletions find_exons.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def check_existing_input_files(args):
return genes_of_interest

def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
logger.info("working with the gtf files")
logger.info("working with gtf files")

all_genes = {}

Expand All @@ -115,10 +115,10 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
for gene in genes_of_interest:
grep = grep + "-e " + gene + ". " #. makes grep to look for genes starting with the name from the list of genes of interest

print(grep)
print(grep + "gtf_genes" + "| grep \"protein_coding\" | grep \"transcript_type \\\"protein_coding\\\"\"")

#finally add to the grep command so that we only look for protein coding
small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"")
small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"") #| grep \"transcript_type \"protein_coding\"\"" i dont know how to handle it :(
if len(small_gtf_genes) != 0:
#save the genes to the dictionary
for genes_line in re.split(r'\n', small_gtf_genes):
Expand Down Expand Up @@ -160,16 +160,16 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
transcript_name = re.split(r'\s', attribut)[1].replace('"', '')

if gene_full_name in all_genes.keys():
print(gene_full_name)
#print(gene_full_name)
#print(all_genes[gene_full_name])
all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name])
else:
logger.info("none of the requested genes were found in the .gtf file")
sys.exit()

logger.info(len(re.split(r'\n', small_gtf_genes)))
logger.info(len(re.split(r'\n', small_gtf_exons)))
logger.info(len(all_genes))
#logger.info(len(re.split(r'\n', small_gtf_genes)))
#logger.info(len(re.split(r'\n', small_gtf_exons)))
#logger.info(len(all_genes))
#sys.exit()

return all_genes
Expand All @@ -195,7 +195,7 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color

plt.show()

def plot_and_output(all_genes, output_directory):
def write_outputfile(all_genes, output_directory):

#logger.info("preparing for plotting")

Expand All @@ -214,22 +214,50 @@ def plot_and_output(all_genes, output_directory):
check_exon = 1
sum_length = 0

print(sorted_exons[0])
tsls = []

transcript_name = sorted_exons[0][5] #save the transcript name of the first exon
sum_length = sorted_exons[0][0] #save the length of the best first exon
tsls.append(sorted_exons[0][2]) #transcript support level of the first exon

for i in range(1, len(sorted_exons)):
if sorted_exons[i][5] != transcript_name or i == len(sorted_exons) - 1: #this is a new transcript or the last one
if sorted_exons[i][5] != transcript_name: #this is a new transcript
#save the previous transcript_name
score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)
score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) #which is actually the percentage of the sum length of the exons and the whole length of the gene

mean_tsl = "%.2f" % round(np.mean(tsls), 2) #find the average tsl for this transcript
if mean_tsl > 5:
mean_tsl = "NA"

exons_number = len(tsls)

gene_transcript_name = gene + "_" + transcript_name
genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {})
genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score} #anzahl von exons
genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number} #anzahl von exons

tsls = []
tsls.append(sorted_exons[i][2])
transcript_name = sorted_exons[i][5]
sum_length = sorted_exons[i][0]
else: #this is still the same transcript name
#save the length of this exon
elif i == len(sorted_exons) - 1: #this is the last transcript
sum_length = sum_length + sorted_exons[i][0]
score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)

mean_tsl = "%.2f" % round(np.mean(tsls), 2) #find the average tsl for this transcript
if mean_tsl > 5:
mean_tsl = "NA"

exons_number = len(tsls)

gene_transcript_name = gene + "_" + transcript_name
genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {})
genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number}

else: #this is still the same transcript name
tsls.append(sorted_exons[i][2]) #save the tsl to find the average tsl later on
sum_length = sum_length + sorted_exons[i][0] #save the length of this exon

#else do nothing, there are no exons found corresponding to this gene

genes_with_transcript_names = sorted(genes_with_transcript_names.items(), key = lambda x: (x[1]['gene'], x[0]))
Expand All @@ -238,11 +266,11 @@ def plot_and_output(all_genes, output_directory):

#now write an output file containing names of genes, their length, the sum length of exons and the prozent of exons in the gene length as score
output_file = open(os.path.join(output_directory, "genes_exons_correlation.txt"), 'w')
header = ["#gene_name", "transcript_name", "exons_len_percentage", "gene_length", "exons_sum_length"]
header = ["#gene_name", "transcript_name", "mean_tsl", "exons_len_percentage", "gene_length", "exons_number", "exons_sum_length"]
output_file.write('\t'.join(header) + '\n') #write the header

for transcript in genes_with_transcript_names:
output_file.write('\t'.join([transcript[1]['gene'], transcript[0].replace(transcript[1]['gene'] + '_', ''), str(transcript[1]['score']),str(transcript[1]['gene_length']), str(transcript[1]['sum_length'])]) + '\n')
output_file.write('\t'.join([transcript[1]['gene'], transcript[0].replace(transcript[1]['gene'] + '_', ''), str(transcript[1]['mean_tsl']), str(transcript[1]['score']), str(transcript[1]['gene_length']), str(transcript[1]['exons_number']), str(transcript[1]['sum_length'])]) + '\n')

output_file.close()

Expand Down Expand Up @@ -276,7 +304,7 @@ def main():


all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact)
plot_and_output(all_genes, args.output_directory)
write_outputfile(all_genes, args.output_directory)


logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2)))
Expand Down

0 comments on commit 1c199b0

Please sign in to comment.