From 1133d0f84eae691105a58b1aef798b54fb224bf6 Mon Sep 17 00:00:00 2001
From: anastasiia <anastasiia.petrova@mpi-bn.mpg.de>
Date: Tue, 11 Dec 2018 11:04:27 +0100
Subject: [PATCH] adding the tsl to the output file, as well as changing some
 imports and comments

---
 find_exons.py | 39 +++++++++++++--------------------------
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/find_exons.py b/find_exons.py
index 297607d..a086e3e 100644
--- a/find_exons.py
+++ b/find_exons.py
@@ -10,18 +10,11 @@
 import os
 import re
 import time
-import multiprocessing
 import logging
 import subprocess
 import numpy as np
 from collections import defaultdict
-from scipy import stats
-#import pyBigWig
-#from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni
-import matplotlib.pyplot as plt
-import random
 import textwrap
-#import seaborn as sns
 
 #from gtfparse import read_gtf #makes the logger to look different o_o
 
@@ -44,7 +37,7 @@ def parse_args():
 
 	#all other arguments are optional
 	parser.add_argument('--output_directory',  default='output', const='output', nargs='?', help='output directory, by default ./output/')
-	parser.add_argument('--silent', action='store_true', help='while working with data write the information only into ./call_peaks_log.txt')
+	parser.add_argument('--silent', action='store_true', help='while working with data write the information only into ./find_exons.log')
 	parser.add_argument('--exact', action='store_true', help='choose this parameter if you want to look for exact gene matches')
 	args = parser.parse_args()
 
@@ -82,7 +75,7 @@ def check_existing_input_files(args):
 			print('working with one gene of interest')
 			genes_of_interest = args.genes_of_interest
 		else:
-			print('working with file of genes')
+			print('reading from the file with genes')
 			#there is a file with list of genes. Extract them and return in an array
 			genes = []
 			with open(args.genes_of_interest[0]) as genes_file:
@@ -115,10 +108,10 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 	for gene in genes_of_interest:
 		grep = grep + "-e " + gene + ". " #. makes grep to look for genes starting with the name from the list of genes of interest
 
-	print(grep + "gtf_genes" + "| grep \"protein_coding\" | grep \"transcript_type \\\"protein_coding\\\"\"")
+	#print(grep + "gtf_genes" + "| grep \"protein_coding\" | grep \"transcript_type \\\"protein_coding\\\"\"")
 
 	#finally add to the grep command so that we only look for protein coding
-	small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"") #| grep \"transcript_type \"protein_coding\"\"" i dont know how to handle it :(
+	small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"") #extract only the rows containing protein_coding
 	if len(small_gtf_genes) != 0:
 		#save the genes to the dictionary
 		for genes_line in re.split(r'\n', small_gtf_genes):
@@ -160,18 +153,11 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 					transcript_name = re.split(r'\s', attribut)[1].replace('"', '')
 
 			if gene_full_name in all_genes.keys():
-				#print(gene_full_name)
-				#print(all_genes[gene_full_name])
 				all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name])
 	else:
 		logger.info("none of the requested genes were found in the .gtf file")
 		sys.exit()
 
-	#logger.info(len(re.split(r'\n', small_gtf_genes)))
-	#logger.info(len(re.split(r'\n', small_gtf_exons)))
-	#logger.info(len(all_genes))
-	#sys.exit()
-
 	return all_genes
 
 def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):
@@ -197,8 +183,6 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color
 
 def write_outputfile(all_genes, output_directory):
 
-	#logger.info("preparing for plotting")
-
 	gene_names = []
 	gene_lengths = []
 	exons_lengths = []
@@ -225,7 +209,7 @@ def write_outputfile(all_genes, output_directory):
 					#save the previous transcript_name
 					score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) #which is actually the percentage of the sum length of the exons and the whole length of the gene
 
-					mean_tsl = "%.2f" % round(np.mean(tsls), 2) #find the average tsl for this transcript
+					mean_tsl = int(np.mean(tsls)) #find the average tsl for this transcript
 					if mean_tsl > 5:
 						mean_tsl = "NA"
 
@@ -233,7 +217,7 @@ def write_outputfile(all_genes, output_directory):
 
 					gene_transcript_name = gene + "_" + transcript_name
 					genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {})
-					genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number} #anzahl von exons
+					genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number}
 
 					tsls = []
 					tsls.append(sorted_exons[i][2])
@@ -244,7 +228,7 @@ def write_outputfile(all_genes, output_directory):
 					sum_length = sum_length + sorted_exons[i][0]
 					score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)
 
-					mean_tsl = "%.2f" % round(np.mean(tsls), 2) #find the average tsl for this transcript
+					mean_tsl = int(np.mean(tsls)) #find the average tsl for this transcript
 					if mean_tsl > 5:
 						mean_tsl = "NA"
 
@@ -280,12 +264,15 @@ def main():
 
 	args = parse_args()
 
+	print("find_exons.py was called using these parameters:")
+	print(vars(args))
+
 	args.genes = check_existing_input_files(args)
 
 	#check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user
 	check_directory(args.output_directory)
 
-	fh = logging.FileHandler(os.path.join(args.output_directory, "find_exons_log.txt"))
+	fh = logging.FileHandler(os.path.join(args.output_directory, "find_exons.log"))
 	fh.setLevel(logging.INFO)
 	fh.setFormatter(formatter)
 	logger.addHandler(fh)
@@ -299,8 +286,8 @@ def main():
 	if args.silent:
 		logger.removeHandler(ch)
 
-	logger.info("find_exons.py was called using these parameters:")
-	logger.info(vars(args))
+	#logger.info("find_exons.py was called using these parameters:")
+	#logger.info(vars(args))
 
 
 	all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact)