Skip to content

Commit

Permalink
changing the command of grep to call all genes at once with help of g…
Browse files Browse the repository at this point in the history
…rep -e
  • Loading branch information
anastasiia committed Nov 27, 2018
1 parent 3cb2320 commit 6f21dc3
Showing 1 changed file with 69 additions and 4 deletions.
73 changes: 69 additions & 4 deletions find_exons.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ def parse_args():
required_arguments.add_argument('--genes_of_interest', nargs='*', dest='genes_of_interest', help='a .txt file or a list with genes one is interested in', required=True)

#all other arguments are optional
parser.add_argument('--genes_file', help='names of genes of interest')
parser.add_argument('--output_directory', default='output', const='output', nargs='?', help='output directory, by default ./output/')
parser.add_argument('--silent', action='store_true', help='while working with data write the information only into ./call_peaks_log.txt')
parser.add_argument('--exact', action='store_true', help='choose this parameter if you want to look for exact gene matches')
Expand Down Expand Up @@ -110,6 +109,70 @@ def check_existing_input_files(args):

def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):

all_genes = {}

grep = "grep -i "

#make a command for grep with or function
for gene in genes_of_interest:
grep = grep + "-e " + gene + " "

#finally add tp the grep command so that we only look for protein coding

small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"")
if len(small_gtf_genes) != 0:
#save the genes to the dictionary
for genes_line in re.split(r'\n', small_gtf_genes):
genes_array = re.split(r'\t', genes_line)
gene_start = int(genes_array[3])
gene_end = int(genes_array[4])
gene_length = gene_end - gene_start
attributes = re.split(r'; ', genes_array[8])
for attribut in attributes:
if attribut.startswith("gene_name"):
gene_full_name = re.split(r'\s', attribut)[1].replace('"', '')

all_genes[gene_full_name] = all_genes.get(gene_full_name, {})
exons = []
#all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons}
all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons}
#gene_groups[gene_name].append(gene_full_name)

#now save the information about the exons to the diictionary
small_gtf_exons = subprocess.getoutput(grep + gtf_exons + "| grep \"protein_coding\"")

for exons_line in re.split(r'\n', small_gtf_exons):
#print(exons_line)
genes_array = re.split(r'\t', exons_line)
#print(genes_array)
exon_start = int(genes_array[3])
exon_end = int(genes_array[4])
exon_length = exon_end - exon_start
attributes = re.split(r'; ', genes_array[8])
for attribut in attributes:
if attribut.startswith("gene_name"):
gene_full_name = re.split(r'\s', attribut)[1].replace('"', '')
elif attribut.startswith("exon_number"):
exon_number = int(re.split(r'\s', attribut)[1])
elif attribut.startswith("transcript_support_level"):
tsl = re.split(r'\s', attribut)[1].replace('"', '') #do not cast on int, because the tsl can also be NA
if tsl == "NA":
tsl = 6
else:
tsl = int(tsl)
elif attribut.startswith("transcript_name"):
transcript_name = re.split(r'\s', attribut)[1].replace('"', '')

if gene_full_name in all_genes.keys():
all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name])
else:
logger.info("none of the requested genes were found in the .gtf file")
sys.exit()

logger.info(len(small_gtf_genes))
logger.info(len(small_gtf_exons))
sys.exit()
"""
gene_groups = {}
logger.info("looking for genes of interest in gtf files")
Expand Down Expand Up @@ -182,6 +245,8 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
sys.exit()
else:
return all_genes, gene_groups
"""
return all_genes

def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):

Expand Down Expand Up @@ -228,7 +293,7 @@ def make_reg_plot(x_array, y_array, gene_names, output_directory, figure_name, c

#plt.show()

def plot_and_output(all_genes, output_directory, gene_groups):
def plot_and_output(all_genes, output_directory):

logger.info("preparing for plotting")

Expand Down Expand Up @@ -422,8 +487,8 @@ def main():
logger.info(vars(args))


all_genes, gene_groups = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact)
plot_and_output(all_genes, args.output_directory, gene_groups)
all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact)
plot_and_output(all_genes, args.output_directory)


logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2)))
Expand Down

0 comments on commit 6f21dc3

Please sign in to comment.