From b10fba0c1afb76bd5089440979d3eceeb64683d0 Mon Sep 17 00:00:00 2001 From: anastasiia Date: Thu, 27 Dec 2018 23:26:36 +0100 Subject: [PATCH] adding two new columns gene_id and transcript_id to the output file --- find_exons.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/find_exons.py b/find_exons.py index 9bd28dd..5487a29 100644 --- a/find_exons.py +++ b/find_exons.py @@ -140,9 +140,14 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): tsl = int(tsl) elif attribut.startswith("transcript_name"): transcript_name = re.split(r'\s', attribut)[1].replace('"', '') + elif attribut.startswith("transcript_id"): + transcript_id = re.split(r'\s', attribut)[1].replace('"', '') + elif attribut.startswith("gene_id"): + gene_id = re.split(r'\s', attribut)[1].replace('"', '') if gene_full_name in all_genes.keys(): - all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name]) + all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name, transcript_id]) + all_genes[gene_full_name]['gene_id'] = gene_id else: logger.info("none of the requested genes were found in the .gtf file") sys.exit() @@ -191,6 +196,7 @@ def write_outputfile(all_genes, output_directory): tsls = [] transcript_name = sorted_exons[0][5] #save the transcript name of the first exon + transcript_id = sorted_exons[0][6] #save the transcript_id of the first exon sum_length = sorted_exons[0][0] #save the length of the first exon tsls.append(sorted_exons[0][2]) #transcript support level of the first exon @@ -208,11 +214,12 @@ def write_outputfile(all_genes, output_directory): gene_transcript_name = gene + "_" + transcript_name genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {}) - genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number} + genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number, 'gene_id': all_genes[gene]['gene_id'], 'transcript_id': transcript_id} tsls = [] tsls.append(sorted_exons[i][2]) transcript_name = sorted_exons[i][5] + transcript_id = sorted_exons[i][6] sum_length = sorted_exons[i][0] elif i == len(sorted_exons) - 1: #this is the last transcript @@ -228,7 +235,7 @@ def write_outputfile(all_genes, output_directory): gene_transcript_name = gene + "_" + transcript_name genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {}) - genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number} + genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number, 'gene_id': all_genes[gene]['gene_id'], 'transcript_id': transcript_id} else: #this is still the same transcript name tsls.append(sorted_exons[i][2]) #save the tsl to find the average tsl later on @@ -242,11 +249,11 @@ def write_outputfile(all_genes, output_directory): #now write an output file containing names of genes, their length, the sum length of exons and the prozent of exons in the gene length as score output_file = open(os.path.join(output_directory, "genes_exons_correlation.txt"), 'w') - header = ["#gene_name", "transcript_name", "mean_tsl", "exons_len_percentage", "gene_length", "exons_number", "exons_sum_length"] + header = ["#gene_name", "transcript_name", "mean_tsl", "exons_len_percentage", "gene_length", "exons_number", "exons_sum_length", "gene_id", "transcript_id"] output_file.write('\t'.join(header) + '\n') #write the header for transcript in genes_with_transcript_names: - output_file.write('\t'.join([transcript[1]['gene'], transcript[0].replace(transcript[1]['gene'] + '_', ''), str(transcript[1]['mean_tsl']), str(transcript[1]['score']), str(transcript[1]['gene_length']), str(transcript[1]['exons_number']), str(transcript[1]['sum_length'])]) + '\n') + output_file.write('\t'.join([transcript[1]['gene'], transcript[0].replace(transcript[1]['gene'] + '_', ''), str(transcript[1]['mean_tsl']), str(transcript[1]['score']), str(transcript[1]['gene_length']), str(transcript[1]['exons_number']), str(transcript[1]['sum_length']), str(transcript[1]['gene_id']), str(transcript[1]['transcript_id'])]) + '\n') output_file.close()