From 01dd313366f21315857f856e3c329109f4b4ffcc Mon Sep 17 00:00:00 2001 From: anastasiia Date: Thu, 15 Nov 2018 11:18:49 +0100 Subject: [PATCH] the extracting of information from the gtf files is done --- find_exons.py | 82 ++++++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 37 deletions(-) diff --git a/find_exons.py b/find_exons.py index 1557642..2536a54 100644 --- a/find_exons.py +++ b/find_exons.py @@ -104,49 +104,56 @@ def check_existing_input_files(args): return args.genes -def procede_gtf(gtf_genes, gtf_exons, genes_of_interest, output_directory): - #df = read_gtf(gtf_genes) - #df_genes = df[df["feature"] == "gene"][df["gene_name"] == genes_of_interest[0]] - #print(df_genes) - #print(df_genes["seqname"], df_genes["source"], df_genes["feature"], df_genes["start"], df_genes["end"]) +def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest): all_genes = {} - #first look for the genew in the gtf_genes file for gene_name in genes_of_interest: - print(gene_name) + #first look for the genes in the gtf_genes file small_gtf_genes = subprocess.getoutput("grep " + gtf_genes + " -e \"" + gene_name + "\"")#find the first line of the input file - #small_gtf_genes_lines = re.split(r'\n', small_gtf_genes) - for genes_line in re.split(r'\n', small_gtf_genes): - print(line) - print() - - - - """ - with open(gtf_genes) as gtf_genes_file: - for line in gtf_genes_file: - if gene_name in line: - line_array = re.split(r'\t', line.rstrip('\n')) - gene_start = int(line_array[3]) - gene_end = int(line_array[4]) - attributes = re.split(r'; ', line_array[8]) - for attribut in attributes: - if attribut.startswith("gene_id"): - gene_id = re.split(r'\s', attribut)[1].replace('"', '') - elif attribut.startswith("gene_name"): - gene_full_name = re.split(r'\s', attribut)[1].replace('"', '') - gene_length = gene_end - gene_start - - all_genes[gene_full_name] = all_genes.get(gene_full_name, {}) - all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'gene_id': gene_id} - gtf_genes_file.close() - """ + for genes_line in re.split(r'\n', small_gtf_genes): + genes_array = re.split(r'\t', genes_line) + gene_start = int(genes_array[3]) + gene_end = int(genes_array[4]) + gene_length = gene_end - gene_start + attributes = re.split(r'; ', genes_array[8]) + for attribut in attributes: + if attribut.startswith("gene_name"): + gene_full_name = re.split(r'\s', attribut)[1].replace('"', '') + + all_genes[gene_full_name] = all_genes.get(gene_full_name, {}) + exons = [] + all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons} + + #find the nubmer and the length of all exons for each gene of interest + small_gtf_exons = subprocess.getoutput("grep " + gtf_exons + " -e \"" + gene_name + "\"") + + for exons_line in re.split(r'\n', small_gtf_exons): + genes_array = re.split(r'\t', exons_line) + exon_start = int(genes_array[3]) + exon_end = int(genes_array[4]) + attributes = re.split(r'; ', genes_array[8]) + for attribut in attributes: + if attribut.startswith("gene_name"): + gene_full_name = re.split(r'\s', attribut)[1].replace('"', '') + elif attribut.startswith("exon_number"): + exon_number = re.split(r'\s', attribut)[1] + elif attribut.startswith("transcript_support_level"): + tsl = re.split(r'\s', attribut)[1].replace('"', '') #do not cast on int, because the tsl can also be NA + + if gene_full_name in all_genes.keys(): + all_genes[gene_full_name]['exons'].append([exon_start, exon_end, exon_number, tsl]) + + """ + for gene in all_genes: + print(gene, all_genes[gene]) + print() + """ + return all_genes + +def make_plot(all_genes, output_directory): - #print() - #for gene in all_genes: - # print(gene) def main(): @@ -176,7 +183,8 @@ def main(): logger.info("find_exons.py was called using these parameters:") logger.info(vars(args)) - procede_gtf(args.gtf_genes, args.gtf_exons, args.genes, args.output_directory) + all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes) + make_plot(all_genes, args.output_directory) logger.info("find_exons needed %s seconds to generate the output" % (time.time() - start))