Skip to content

Commit

Permalink
adding the tsl to the output file, as well as changing some imports a…
Browse files Browse the repository at this point in the history
…nd comments
  • Loading branch information
anastasiia committed Dec 11, 2018
1 parent 1c199b0 commit 1133d0f
Showing 1 changed file with 13 additions and 26 deletions.
39 changes: 13 additions & 26 deletions find_exons.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,11 @@
import os
import re
import time
import multiprocessing
import logging
import subprocess
import numpy as np
from collections import defaultdict
from scipy import stats
#import pyBigWig
#from statsmodels.sandbox.stats.multicomp import multipletests #for bonfferoni
import matplotlib.pyplot as plt
import random
import textwrap
#import seaborn as sns

#from gtfparse import read_gtf #makes the logger to look different o_o

Expand All @@ -44,7 +37,7 @@ def parse_args():

#all other arguments are optional
parser.add_argument('--output_directory', default='output', const='output', nargs='?', help='output directory, by default ./output/')
parser.add_argument('--silent', action='store_true', help='while working with data write the information only into ./call_peaks_log.txt')
parser.add_argument('--silent', action='store_true', help='while working with data write the information only into ./find_exons.log')
parser.add_argument('--exact', action='store_true', help='choose this parameter if you want to look for exact gene matches')
args = parser.parse_args()

Expand Down Expand Up @@ -82,7 +75,7 @@ def check_existing_input_files(args):
print('working with one gene of interest')
genes_of_interest = args.genes_of_interest
else:
print('working with file of genes')
print('reading from the file with genes')
#there is a file with list of genes. Extract them and return in an array
genes = []
with open(args.genes_of_interest[0]) as genes_file:
Expand Down Expand Up @@ -115,10 +108,10 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
for gene in genes_of_interest:
grep = grep + "-e " + gene + ". " #. makes grep to look for genes starting with the name from the list of genes of interest

print(grep + "gtf_genes" + "| grep \"protein_coding\" | grep \"transcript_type \\\"protein_coding\\\"\"")
#print(grep + "gtf_genes" + "| grep \"protein_coding\" | grep \"transcript_type \\\"protein_coding\\\"\"")

#finally add to the grep command so that we only look for protein coding
small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"") #| grep \"transcript_type \"protein_coding\"\"" i dont know how to handle it :(
small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"") #extract only the rows containing protein_coding
if len(small_gtf_genes) != 0:
#save the genes to the dictionary
for genes_line in re.split(r'\n', small_gtf_genes):
Expand Down Expand Up @@ -160,18 +153,11 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
transcript_name = re.split(r'\s', attribut)[1].replace('"', '')

if gene_full_name in all_genes.keys():
#print(gene_full_name)
#print(all_genes[gene_full_name])
all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name])
else:
logger.info("none of the requested genes were found in the .gtf file")
sys.exit()

#logger.info(len(re.split(r'\n', small_gtf_genes)))
#logger.info(len(re.split(r'\n', small_gtf_exons)))
#logger.info(len(all_genes))
#sys.exit()

return all_genes

def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):
Expand All @@ -197,8 +183,6 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color

def write_outputfile(all_genes, output_directory):

#logger.info("preparing for plotting")

gene_names = []
gene_lengths = []
exons_lengths = []
Expand All @@ -225,15 +209,15 @@ def write_outputfile(all_genes, output_directory):
#save the previous transcript_name
score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2) #which is actually the percentage of the sum length of the exons and the whole length of the gene

mean_tsl = "%.2f" % round(np.mean(tsls), 2) #find the average tsl for this transcript
mean_tsl = int(np.mean(tsls)) #find the average tsl for this transcript
if mean_tsl > 5:
mean_tsl = "NA"

exons_number = len(tsls)

gene_transcript_name = gene + "_" + transcript_name
genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {})
genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number} #anzahl von exons
genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score, 'mean_tsl': mean_tsl, 'exons_number': exons_number}

tsls = []
tsls.append(sorted_exons[i][2])
Expand All @@ -244,7 +228,7 @@ def write_outputfile(all_genes, output_directory):
sum_length = sum_length + sorted_exons[i][0]
score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)

mean_tsl = "%.2f" % round(np.mean(tsls), 2) #find the average tsl for this transcript
mean_tsl = int(np.mean(tsls)) #find the average tsl for this transcript
if mean_tsl > 5:
mean_tsl = "NA"

Expand Down Expand Up @@ -280,12 +264,15 @@ def main():

args = parse_args()

print("find_exons.py was called using these parameters:")
print(vars(args))

args.genes = check_existing_input_files(args)

#check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user
check_directory(args.output_directory)

fh = logging.FileHandler(os.path.join(args.output_directory, "find_exons_log.txt"))
fh = logging.FileHandler(os.path.join(args.output_directory, "find_exons.log"))
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)
Expand All @@ -299,8 +286,8 @@ def main():
if args.silent:
logger.removeHandler(ch)

logger.info("find_exons.py was called using these parameters:")
logger.info(vars(args))
#logger.info("find_exons.py was called using these parameters:")
#logger.info(vars(args))


all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact)
Expand Down

0 comments on commit 1133d0f

Please sign in to comment.