From bf0d29506ec69191a5b3d2ee94b67445a5daa89c Mon Sep 17 00:00:00 2001 From: anastasiia Date: Wed, 12 Dec 2018 00:49:54 +0100 Subject: [PATCH] working with groups of genes the user is interested in --- visualize.py | 141 ++++++++++++++++++++++++++++----------------------- 1 file changed, 78 insertions(+), 63 deletions(-) diff --git a/visualize.py b/visualize.py index 036b071..9ad4e1c 100644 --- a/visualize.py +++ b/visualize.py @@ -23,7 +23,7 @@ #from gtfparse import read_gtf #makes the logger to look different o_o -logger = logging.getLogger('visualise') +logger = logging.getLogger('visualize') logger.setLevel(logging.INFO) formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M") @@ -37,7 +37,8 @@ def parse_args(): required_arguments = parser.add_argument_group('required arguments') required_arguments.add_argument('--correlation', help='a .txt file with correlation between genes length and the length of exons', required=True) - required_arguments.add_argument('--genes_of_interest', nargs='*', dest='genes_of_interest', help='a .txt file or a list with genes one is interested in', required=True) + #required_arguments.add_argument('--genes_of_interest', nargs='*', dest='genes_of_interest', help='a .txt file or a list with genes one is interested in', required=True) + required_arguments.add_argument('--groups', nargs='*', dest='groups', help='enter one or more files containing groups of genes you are interested in, the genes should be separated with tabs or printed in one column') #all other arguments are optional parser.add_argument('--variable', type=int, default=1, help='enter how many chars in the name of a gene can be different from the name of gene you are searching for') @@ -46,6 +47,12 @@ def parse_args(): return args +def find_directory(args): + return os.path.splitext(os.path.dirname(args.correlation))[0] + +def get_name_from_path(full_path): + return os.path.splitext(os.path.basename(full_path))[0] + def check_existing_input_files(args): print('checking the input file') @@ -54,49 +61,55 @@ def check_existing_input_files(args): print('please make sure the .txt file with genes-exons correlation exists') sys.exit() - if len(args.genes_of_interest) == 1: - if not os.path.isfile(args.genes_of_interest[0]): - print('working with one gene of interest') - genes_of_interest = args.genes_of_interest + groups = [] + for group_file in args.groups: + if not os.path.isfile(group_file): + print('the path for the input file ' + str(group_file) + ' could not be opened') + sys.exit() else: - print('working with file of genes') - #there is a file with list of genes. Extract them and return in an array - genes = [] - with open(args.genes_of_interest[0]) as genes_file: - for line in genes_file: + #group_name = "group_" + str(args.groups.index(group_file) + 1) + one_group = [] + with open(group_file) as opened_group_file: + for line in opened_group_file: line_array = re.split(r'\t', line.rstrip('\n')) for gene in line_array: if ' ' in gene: - print('please make sure that genes in the file with genes of interest are separated with tabs or are written each on a new line') + print('please make sure that genes in the file ' + str(group_file) + ' are separated with tabs or are written each on a new line') sys.exit() else: - genes.append(gene) + one_group.append(gene) + opened_group_file.close() + groups.append(one_group) - genes_file.close() - genes_of_interest = genes + return groups - else: #there is a list of genes of interest - print('working with a list of genes') - genes_of_interest = args.genes_of_interest +def make_box_plot(groups_files_names, groups_genes_of_interest, output_directory, figure_name): - return genes_of_interest + groups_names = [] -def make_box_plot(genes_of_interest, output_directory, figure_name, color_name): + for group_file in groups_files_names: + groups_names.append(get_name_from_path(group_file)) - groups_names = [] data = [] - #first sort the genes_of_interest dictionary - sorted_genes_of_interest = sorted(genes_of_interest, key = lambda x: np.mean(genes_of_interest[x])) + for genes_of_interest in groups_genes_of_interest: + + group_data = [] + + #first sort the genes_of_interest dictionary + sorted_genes_of_interest = sorted(genes_of_interest, key = lambda x: np.mean(genes_of_interest[x])) + + for gene in sorted_genes_of_interest: + print(genes_of_interest[gene]) + group_data.extend(genes_of_interest[gene]) - for gene in sorted_genes_of_interest: - groups_names.append(gene) - data.append(genes_of_interest[gene]) + data.append(group_data) fig, ax = plt.subplots(figsize=(10,10)) bp = ax.boxplot(data, notch=False, sym='o', vert=True, whis=1.5, patch_artist=True) + color_name = 'navy' for patch in bp['boxes']: patch.set_facecolor(color_name) @@ -104,7 +117,7 @@ def make_box_plot(genes_of_interest, output_directory, figure_name, color_name): ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5) ax.set_title('The visualization of correlation between gene and exons length') - ax.set_xlabel('Genes') + ax.set_xlabel('Genes groups') ax.set_ylabel('Percentage of exons len') ax.set_xticklabels(groups_names, rotation=45, fontsize=8) @@ -134,41 +147,45 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color #plt.show() -def read_correlation(correlation_file, genes_of_interest, variable): +def read_correlation(correlation_file, groups, variable): #read the text file and make a dictionary containing genes with array of their scores - genes_to_show = {} + genes_to_show_groups = [] - for gene_name in genes_of_interest: + for genes_of_interest in groups: + genes_to_show = {} - genes_to_show[gene_name] = genes_to_show.get(gene_name, []) #get a place in the dictionary for this gene - - #first look for the genes in the gtf_genes file - genes_family = subprocess.getoutput("grep -i " + gene_name + " " + correlation_file) - - if len(genes_family) != 0: - logger.info("working with the gene " + gene_name) - - for genes_line in re.split(r'\n', genes_family): - genes_array = re.split(r'\t', genes_line) - if genes_array[0] == gene_name: - #this is a perfect match <3 - genes_to_show[gene_name].append(float(genes_array[2])) #append score to the array - elif genes_array[0].startswith(gene_name) and len(genes_array[0]) <= len(gene_name) + variable: #the found gene is by default 1 char longer than the gene name - if genes_array[0] in genes_to_show.keys(): - genes_to_show[genes_array[0]].append(float(genes_array[2])) - else: #this gene is not yet saved - #make a new place in the dictionary for this gene - genes_to_show[genes_array[0]] = genes_to_show.get(genes_array[0], []) - genes_to_show[genes_array[0]].append(float(genes_array[2])) + for gene_name in genes_of_interest: - else: - logger.info("no similar genes to " + gene_name + " were found") + genes_to_show[gene_name] = genes_to_show.get(gene_name, []) #get a place in the dictionary for this gene + + #first look for the genes in the gtf_genes file + genes_family = subprocess.getoutput("grep -i " + gene_name + " " + correlation_file) + + if len(genes_family) != 0: + logger.info("working with the gene " + gene_name) + + for genes_line in re.split(r'\n', genes_family): + genes_array = re.split(r'\t', genes_line) + if genes_array[0] == gene_name: + #this is a perfect match <3 + genes_to_show[gene_name].append(float(genes_array[2])) #append score to the array + elif genes_array[0].startswith(gene_name) and len(genes_array[0]) <= len(gene_name) + variable: #the found gene is by default 1 char longer than the gene name + if genes_array[0] in genes_to_show.keys(): + genes_to_show[genes_array[0]].append(float(genes_array[2])) + else: #this gene is not yet saved + #make a new place in the dictionary for this gene + genes_to_show[genes_array[0]] = genes_to_show.get(genes_array[0], []) + genes_to_show[genes_array[0]].append(float(genes_array[2])) - #delete genes without information from the dictionary - genes_to_show = dict([(k, v) for k, v in genes_to_show.items() if len(v) > 0]) + else: + logger.info("no similar genes to " + gene_name + " were found") - return genes_to_show + #delete genes without information from the dictionary + genes_to_show = dict([(k, v) for k, v in genes_to_show.items() if len(v) > 0]) + genes_to_show_groups.append(genes_to_show) + + return genes_to_show_groups def main(): @@ -176,12 +193,14 @@ def main(): args = parse_args() - genes_of_interest = check_existing_input_files(args) + groups = check_existing_input_files(args) #check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user #check_directory(args.output_directory) - fh = logging.FileHandler("visualize_log.txt") + output_directory = find_directory(args) + + fh = logging.FileHandler(os.path.join(output_directory, "visualize.log")) fh.setLevel(logging.INFO) fh.setFormatter(formatter) logger.addHandler(fh) @@ -191,15 +210,11 @@ def main(): ch.setFormatter(formatter) logger.addHandler(ch) - genes_to_show = read_correlation(args.correlation, genes_of_interest, args.variable) + genes_to_show_groups = read_correlation(args.correlation, groups, args.variable) logger.info("making a plot") - #make_plot(gene_lengths, exons_lengths, gene_names, "./output", "test1.png", 'gold') - - #group1 = ["OTX1", "OTX2", "PITX1", "GSC2", "NFYA", "NFYB", "DUX4"] #important genes in early development - #group2 = ["MYCL", "LTBP3", "ERGIC3", "GCM2", "LHX8", "HIF1A", "FLI1"] #not so significant for early dev - make_box_plot(genes_to_show, "./output", "test2.png", 'navy') + make_box_plot(args.groups, genes_to_show_groups, output_directory, "boxplot.png") logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2)))