Skip to content

Commit

Permalink
final changes in comments and visual presentation
Browse files Browse the repository at this point in the history
  • Loading branch information
anastasiia committed Dec 11, 2018
1 parent 1133d0f commit d236c95
Showing 1 changed file with 5 additions and 8 deletions.
13 changes: 5 additions & 8 deletions find_exons.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@
from collections import defaultdict
import textwrap

#from gtfparse import read_gtf #makes the logger to look different o_o

logger = logging.getLogger('find_exons')
logger.setLevel(logging.INFO)

Expand Down Expand Up @@ -108,8 +106,6 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
for gene in genes_of_interest:
grep = grep + "-e " + gene + ". " #. makes grep to look for genes starting with the name from the list of genes of interest

#print(grep + "gtf_genes" + "| grep \"protein_coding\" | grep \"transcript_type \\\"protein_coding\\\"\"")

#finally add to the grep command so that we only look for protein coding
small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"") #extract only the rows containing protein_coding
if len(small_gtf_genes) != 0:
Expand Down Expand Up @@ -193,6 +189,7 @@ def write_outputfile(all_genes, output_directory):
#find the sum length of exons for each gene
for gene in all_genes:
if all_genes[gene]['exons'] != []:
#sort after transcript_name, exon_number and tsl
sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[5], x[1], x[2]))

check_exon = 1
Expand All @@ -201,10 +198,11 @@ def write_outputfile(all_genes, output_directory):
tsls = []

transcript_name = sorted_exons[0][5] #save the transcript name of the first exon
sum_length = sorted_exons[0][0] #save the length of the best first exon
sum_length = sorted_exons[0][0] #save the length of the first exon
tsls.append(sorted_exons[0][2]) #transcript support level of the first exon

for i in range(1, len(sorted_exons)):

if sorted_exons[i][5] != transcript_name: #this is a new transcript
#save the previous transcript_name
score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) #which is actually the percentage of the sum length of the exons and the whole length of the gene
Expand All @@ -226,8 +224,9 @@ def write_outputfile(all_genes, output_directory):

elif i == len(sorted_exons) - 1: #this is the last transcript
sum_length = sum_length + sorted_exons[i][0]
score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)
tsls.append(sorted_exons[i][2])

score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)
mean_tsl = int(np.mean(tsls)) #find the average tsl for this transcript
if mean_tsl > 5:
mean_tsl = "NA"
Expand Down Expand Up @@ -289,11 +288,9 @@ def main():
#logger.info("find_exons.py was called using these parameters:")
#logger.info(vars(args))


all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact)
write_outputfile(all_genes, args.output_directory)


logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2)))

for handler in logger.handlers:
Expand Down

0 comments on commit d236c95

Please sign in to comment.