Skip to content

Commit

Permalink
Add two new columns, gene_id and transcript_id, to the output file
Browse files Browse the repository at this point in the history
  • Loading branch information
anastasiia committed Dec 27, 2018
1 parent 51540f5 commit b10fba0
Showing 1 changed file with 12 additions and 5 deletions.
17 changes: 12 additions & 5 deletions find_exons.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,9 +140,14 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
tsl = int(tsl)
elif attribut.startswith("transcript_name"):
transcript_name = re.split(r'\s', attribut)[1].replace('"', '')
elif attribut.startswith("transcript_id"):
transcript_id = re.split(r'\s', attribut)[1].replace('"', '')
elif attribut.startswith("gene_id"):
gene_id = re.split(r'\s', attribut)[1].replace('"', '')

if gene_full_name in all_genes.keys():
all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name])
all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name, transcript_id])
all_genes[gene_full_name]['gene_id'] = gene_id
else:
logger.info("none of the requested genes were found in the .gtf file")
sys.exit()
Expand Down Expand Up @@ -191,6 +196,7 @@ def write_outputfile(all_genes, output_directory):
tsls = []

transcript_name = sorted_exons[0][5] #save the transcript name of the first exon
transcript_id = sorted_exons[0][6] #save the transcript_id of the first exon
sum_length = sorted_exons[0][0] #save the length of the first exon
tsls.append(sorted_exons[0][2]) #transcript support level of the first exon

Expand All @@ -208,11 +214,12 @@ def write_outputfile(all_genes, output_directory):

gene_transcript_name = gene + "_" + transcript_name
genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {})
genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number}
genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number, 'gene_id': all_genes[gene]['gene_id'], 'transcript_id': transcript_id}

tsls = []
tsls.append(sorted_exons[i][2])
transcript_name = sorted_exons[i][5]
transcript_id = sorted_exons[i][6]
sum_length = sorted_exons[i][0]

elif i == len(sorted_exons) - 1: #this is the last transcript
Expand All @@ -228,7 +235,7 @@ def write_outputfile(all_genes, output_directory):

gene_transcript_name = gene + "_" + transcript_name
genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {})
genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number}
genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number, 'gene_id': all_genes[gene]['gene_id'], 'transcript_id': transcript_id}

else: #this is still the same transcript name
tsls.append(sorted_exons[i][2]) #save the tsl to find the average tsl later on
Expand All @@ -242,11 +249,11 @@ def write_outputfile(all_genes, output_directory):

#now write an output file containing names of genes, their length, the sum length of exons and the percentage of exons in the gene length as score
output_file = open(os.path.join(output_directory, "genes_exons_correlation.txt"), 'w')
header = ["#gene_name", "transcript_name", "mean_tsl", "exons_len_percentage", "gene_length", "exons_number", "exons_sum_length"]
header = ["#gene_name", "transcript_name", "mean_tsl", "exons_len_percentage", "gene_length", "exons_number", "exons_sum_length", "gene_id", "transcript_id"]
output_file.write('\t'.join(header) + '\n') #write the header

for transcript in genes_with_transcript_names:
output_file.write('\t'.join([transcript[1]['gene'], transcript[0].replace(transcript[1]['gene'] + '_', ''), str(transcript[1]['mean_tsl']), str(transcript[1]['score']), str(transcript[1]['gene_length']), str(transcript[1]['exons_number']), str(transcript[1]['sum_length'])]) + '\n')
output_file.write('\t'.join([transcript[1]['gene'], transcript[0].replace(transcript[1]['gene'] + '_', ''), str(transcript[1]['mean_tsl']), str(transcript[1]['score']), str(transcript[1]['gene_length']), str(transcript[1]['exons_number']), str(transcript[1]['sum_length']), str(transcript[1]['gene_id']), str(transcript[1]['transcript_id'])]) + '\n')

output_file.close()

Expand Down

0 comments on commit b10fba0

Please sign in to comment.