Skip to content

Commit

Permalink
using . after the gene name makes grep to look for occurences of word…
Browse files Browse the repository at this point in the history
…s that starts with gene names, and not these containing gene names within
  • Loading branch information
anastasiia committed Dec 4, 2018
1 parent 932097f commit f8b5f5e
Showing 1 changed file with 35 additions and 27 deletions.
62 changes: 35 additions & 27 deletions find_exons.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,17 @@ def check_existing_input_files(args):
return genes_of_interest

def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
logger.info("working with the gtf files")

all_genes = {}

grep = "grep -i "

#make a command for grep with or function
for gene in genes_of_interest:
grep = grep + "-e " + gene + " "
grep = grep + "-e " + gene + ". " #. makes grep to look for genes starting with the name from the list of genes of interest

print(grep)

#finally add to the grep command so that we only look for protein coding
small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"")
Expand All @@ -134,7 +137,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons}

#now save the information about the exons to the diictionary
small_gtf_exons = subprocess.getoutput(grep + gtf_exons + "| grep \"protein_coding\"")
small_gtf_exons = subprocess.getoutput(grep + gtf_exons + "| grep \"protein_coding\" | grep \"transcript_support_level\"")

for exons_line in re.split(r'\n', small_gtf_exons):
genes_array = re.split(r'\t', exons_line)
Expand All @@ -157,14 +160,17 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
transcript_name = re.split(r'\s', attribut)[1].replace('"', '')

if gene_full_name in all_genes.keys():
print(gene_full_name)
#print(all_genes[gene_full_name])
all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name])
else:
logger.info("none of the requested genes were found in the .gtf file")
sys.exit()

logger.info(len(small_gtf_genes))
logger.info(len(small_gtf_exons))
sys.exit()
logger.info(len(re.split(r'\n', small_gtf_genes)))
logger.info(len(re.split(r'\n', small_gtf_exons)))
logger.info(len(all_genes))
#sys.exit()

return all_genes

Expand All @@ -191,7 +197,7 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color

def plot_and_output(all_genes, output_directory):

logger.info("preparing for plotting")
#logger.info("preparing for plotting")

gene_names = []
gene_lengths = []
Expand All @@ -202,27 +208,29 @@ def plot_and_output(all_genes, output_directory):

#find the sum length of exons for each gene
for gene in all_genes:

sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[5], x[1], x[2]))

check_exon = 1
sum_length = 0

transcript_name = sorted_exons[0][5] #save the transcript name of the first exon
sum_length = sorted_exons[0][0] #save the length of the best first exon

for i in range(1, len(sorted_exons)):
if sorted_exons[i][5] != transcript_name or i == len(sorted_exons) - 1: #this is a new transcript or the last one
#save the previous transcript_name
score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)
gene_transcript_name = gene + "_" + transcript_name
genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {})
genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score} #anzahl von exons
transcript_name = sorted_exons[i][5]
sum_length = sorted_exons[i][0]
else: #this is still the same transcript name
#save the length of this exon
sum_length = sum_length + sorted_exons[i][0]
if all_genes[gene]['exons'] != []:
sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[5], x[1], x[2]))

check_exon = 1
sum_length = 0

print(sorted_exons[0])
transcript_name = sorted_exons[0][5] #save the transcript name of the first exon
sum_length = sorted_exons[0][0] #save the length of the best first exon

for i in range(1, len(sorted_exons)):
if sorted_exons[i][5] != transcript_name or i == len(sorted_exons) - 1: #this is a new transcript or the last one
#save the previous transcript_name
score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)
gene_transcript_name = gene + "_" + transcript_name
genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {})
genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score} #anzahl von exons
transcript_name = sorted_exons[i][5]
sum_length = sorted_exons[i][0]
else: #this is still the same transcript name
#save the length of this exon
sum_length = sum_length + sorted_exons[i][0]
#else do nothing, there are no exons found corresponding to this gene

genes_with_transcript_names = sorted(genes_with_transcript_names.items(), key = lambda x: (x[1]['gene'], x[0]))

Expand Down

0 comments on commit f8b5f5e

Please sign in to comment.