From 1133d0f84eae691105a58b1aef798b54fb224bf6 Mon Sep 17 00:00:00 2001 From: anastasiia Date: Tue, 11 Dec 2018 11:04:27 +0100 Subject: [PATCH] adding the tsl to the output file, as well as changing some imports and comments --- find_exons.py | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/find_exons.py b/find_exons.py index 297607d..a086e3e 100644 --- a/find_exons.py +++ b/find_exons.py @@ -10,18 +10,11 @@ import os import re import time -import multiprocessing import logging import subprocess import numpy as np from collections import defaultdict -from scipy import stats -#import pyBigWig -#from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni -import matplotlib.pyplot as plt -import random import textwrap -#import seaborn as sns #from gtfparse import read_gtf #makes the logger to look different o_o @@ -44,7 +37,7 @@ def parse_args(): #all other arguments are optional parser.add_argument('--output_directory', default='output', const='output', nargs='?', help='output directory, by default ./output/') - parser.add_argument('--silent', action='store_true', help='while working with data write the information only into ./call_peaks_log.txt') + parser.add_argument('--silent', action='store_true', help='while working with data write the information only into ./find_exons.log') parser.add_argument('--exact', action='store_true', help='choose this parameter if you want to look for exact gene matches') args = parser.parse_args() @@ -82,7 +75,7 @@ def check_existing_input_files(args): print('working with one gene of interest') genes_of_interest = args.genes_of_interest else: - print('working with file of genes') + print('reading from the file with genes') #there is a file with list of genes. Extract them and return in an array genes = [] with open(args.genes_of_interest[0]) as genes_file: @@ -115,10 +108,10 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): for gene in genes_of_interest: grep = grep + "-e " + gene + ". " #. makes grep to look for genes starting with the name from the list of genes of interest - print(grep + "gtf_genes" + "| grep \"protein_coding\" | grep \"transcript_type \\\"protein_coding\\\"\"") + #print(grep + "gtf_genes" + "| grep \"protein_coding\" | grep \"transcript_type \\\"protein_coding\\\"\"") #finally add to the grep command so that we only look for protein coding - small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"") #| grep \"transcript_type \"protein_coding\"\"" i dont know how to handle it :( + small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"") #extract only the rows containing protein_coding if len(small_gtf_genes) != 0: #save the genes to the dictionary for genes_line in re.split(r'\n', small_gtf_genes): @@ -160,18 +153,11 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact): transcript_name = re.split(r'\s', attribut)[1].replace('"', '') if gene_full_name in all_genes.keys(): - #print(gene_full_name) - #print(all_genes[gene_full_name]) all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name]) else: logger.info("none of the requested genes were found in the .gtf file") sys.exit() - #logger.info(len(re.split(r'\n', small_gtf_genes))) - #logger.info(len(re.split(r'\n', small_gtf_exons))) - #logger.info(len(all_genes)) - #sys.exit() - return all_genes def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name): @@ -197,8 +183,6 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color def write_outputfile(all_genes, output_directory): - #logger.info("preparing for plotting") - gene_names = [] gene_lengths = [] exons_lengths = [] @@ -225,7 +209,7 @@ def write_outputfile(all_genes, output_directory): #save the previous transcript_name score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) #which is actually the percentage of the sum length of the exons and the whole length of the gene - mean_tsl = "%.2f" % round(np.mean(tsls), 2) #find the average tsl for this transcript + mean_tsl = int(np.mean(tsls)) #find the average tsl for this transcript if mean_tsl > 5: mean_tsl = "NA" @@ -233,7 +217,7 @@ def write_outputfile(all_genes, output_directory): gene_transcript_name = gene + "_" + transcript_name genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {}) - genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number} #anzahl von exons + genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number} tsls = [] tsls.append(sorted_exons[i][2]) @@ -244,7 +228,7 @@ def write_outputfile(all_genes, output_directory): sum_length = sum_length + sorted_exons[i][0] score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) - mean_tsl = "%.2f" % round(np.mean(tsls), 2) #find the average tsl for this transcript + mean_tsl = int(np.mean(tsls)) #find the average tsl for this transcript if mean_tsl > 5: mean_tsl = "NA" @@ -280,12 +264,15 @@ def main(): args = parse_args() + print("find_exons.py was called using these parameters:") + print(vars(args)) + args.genes = check_existing_input_files(args) #check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user check_directory(args.output_directory) - fh = logging.FileHandler(os.path.join(args.output_directory, "find_exons_log.txt")) + fh = logging.FileHandler(os.path.join(args.output_directory, "find_exons.log")) fh.setLevel(logging.INFO) fh.setFormatter(formatter) logger.addHandler(fh) @@ -299,8 +286,8 @@ def main(): if args.silent: logger.removeHandler(ch) - logger.info("find_exons.py was called using these parameters:") - logger.info(vars(args)) + #logger.info("find_exons.py was called using these parameters:") + #logger.info(vars(args)) all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact)