working with groups of genes the user is interested in

anastasiia · Dec 11, 2018 · bf0d295 · bf0d295
1 parent 9bc6edd
commit bf0d295
Showing 1 changed file with 78 additions and 63 deletions.
diff --git a/visualize.py b/visualize.py
@@ -23,7 +23,7 @@
 
 #from gtfparse import read_gtf #makes the logger to look different o_o
 
-logger = logging.getLogger('visualise')
+logger = logging.getLogger('visualize')
 logger.setLevel(logging.INFO)
 
 formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M")
@@ -37,7 +37,8 @@ def parse_args():
 
 	required_arguments = parser.add_argument_group('required arguments')
 	required_arguments.add_argument('--correlation', help='a .txt file with correlation between genes length and the length of exons', required=True)
-	required_arguments.add_argument('--genes_of_interest', nargs='*', dest='genes_of_interest', help='a .txt file or a list with genes one is interested in', required=True)
+	#required_arguments.add_argument('--genes_of_interest', nargs='*', dest='genes_of_interest', help='a .txt file or a list with genes one is interested in', required=True)
+	required_arguments.add_argument('--groups', nargs='*', dest='groups', help='enter one or more files containing groups of genes you are interested in, the genes should be separated with tabs or printed in one column')
 
 	#all other arguments are optional
 	parser.add_argument('--variable', type=int, default=1, help='enter how many chars in the name of a gene can be different from the name of gene you are searching for')
@@ -46,6 +47,12 @@ def parse_args():
 
 	return args
 
+def find_directory(args):
+	return os.path.splitext(os.path.dirname(args.correlation))[0]
+
+def get_name_from_path(full_path):
+	return os.path.splitext(os.path.basename(full_path))[0]
+
 def check_existing_input_files(args):
 
 	print('checking the input file')
@@ -54,57 +61,63 @@ def check_existing_input_files(args):
 		print('please make sure the .txt file with genes-exons correlation exists')
 		sys.exit()
 
-	if len(args.genes_of_interest) == 1:
-		if not os.path.isfile(args.genes_of_interest[0]):
-			print('working with one gene of interest')
-			genes_of_interest = args.genes_of_interest
+	groups = []
+	for group_file in args.groups:
+		if not os.path.isfile(group_file):
+			print('the path for the input file ' + str(group_file) + ' could not be opened')
+			sys.exit()
 		else:
-			print('working with file of genes')
-			#there is a file with list of genes. Extract them and return in an array
-			genes = []
-			with open(args.genes_of_interest[0]) as genes_file:
-				for line in genes_file:
+			#group_name = "group_" + str(args.groups.index(group_file) + 1)
+			one_group = []
+			with open(group_file) as opened_group_file:
+				for line in opened_group_file:
 					line_array = re.split(r'\t', line.rstrip('\n'))
 					for gene in line_array:
 						if ' ' in gene:
-							print('please make sure that genes in the file with genes of interest are separated with tabs or are written each on a new line')
+							print('please make sure that genes in the file ' + str(group_file) + ' are separated with tabs or are written each on a new line')
 							sys.exit()
 						else:
-							genes.append(gene)
+							one_group.append(gene)
+			opened_group_file.close()
+			groups.append(one_group)
 
-			genes_file.close()
-			genes_of_interest = genes
+	return groups
 
-	else: #there is a list of genes of interest
-		print('working with a list of genes')
-		genes_of_interest = args.genes_of_interest
+def make_box_plot(groups_files_names, groups_genes_of_interest, output_directory, figure_name):
 
-	return genes_of_interest
+	groups_names = []
 
-def make_box_plot(genes_of_interest, output_directory, figure_name, color_name):
+	for group_file in groups_files_names:
+		groups_names.append(get_name_from_path(group_file))
 
-	groups_names = []
 	data = []
 
-	#first sort the genes_of_interest dictionary
-	sorted_genes_of_interest = sorted(genes_of_interest, key = lambda x: np.mean(genes_of_interest[x]))
+	for genes_of_interest in groups_genes_of_interest:
+
+		group_data = []
+
+		#first sort the genes_of_interest dictionary
+		sorted_genes_of_interest = sorted(genes_of_interest, key = lambda x: np.mean(genes_of_interest[x]))
+
+		for gene in sorted_genes_of_interest:
+			print(genes_of_interest[gene])
+			group_data.extend(genes_of_interest[gene])
 
-	for gene in sorted_genes_of_interest:
-		groups_names.append(gene)
-		data.append(genes_of_interest[gene])
+		data.append(group_data)
 
 	fig, ax = plt.subplots(figsize=(10,10))
 
 	bp = ax.boxplot(data, notch=False, sym='o', vert=True, whis=1.5, patch_artist=True)
 
+	color_name = 'navy'
 	for patch in bp['boxes']:
 		patch.set_facecolor(color_name)
 
 	# Add a horizontal grid to the plot, but make it very light in color
 	ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
 
 	ax.set_title('The visualization of correlation between gene and exons length')
-	ax.set_xlabel('Genes')
+	ax.set_xlabel('Genes groups')
 	ax.set_ylabel('Percentage of exons len')
 
 	ax.set_xticklabels(groups_names, rotation=45, fontsize=8)
@@ -134,54 +147,60 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color
 
 	#plt.show()
 
-def read_correlation(correlation_file, genes_of_interest, variable):
+def read_correlation(correlation_file, groups, variable):
 	#read the text file and make a dictionary containing genes with array of their scores
 
-	genes_to_show = {}
+	genes_to_show_groups = []
 
-	for gene_name in genes_of_interest:
+	for genes_of_interest in groups:
+		genes_to_show = {}
 
-		genes_to_show[gene_name] = genes_to_show.get(gene_name, []) #get a place in the dictionary for this gene
-
-		#first look for the genes in the gtf_genes file
-		genes_family = subprocess.getoutput("grep -i " + gene_name + " " + correlation_file)
-
-		if len(genes_family) != 0:
-			logger.info("working with the gene " + gene_name)
-
-			for genes_line in re.split(r'\n', genes_family):
-				genes_array = re.split(r'\t', genes_line)
-				if genes_array[0] == gene_name:
-					#this is a perfect match <3
-					genes_to_show[gene_name].append(float(genes_array[2])) #append score to the array
-				elif genes_array[0].startswith(gene_name) and len(genes_array[0]) <= len(gene_name) + variable: #the found gene is by default 1 char longer than the gene name
-					if genes_array[0] in genes_to_show.keys():
-						genes_to_show[genes_array[0]].append(float(genes_array[2]))
-					else: #this gene is not yet saved
-						#make a new place in the dictionary for this gene
-						genes_to_show[genes_array[0]] = genes_to_show.get(genes_array[0], [])
-						genes_to_show[genes_array[0]].append(float(genes_array[2]))
+		for gene_name in genes_of_interest:
 
-		else:
-			logger.info("no similar genes to " + gene_name + " were found")
+			genes_to_show[gene_name] = genes_to_show.get(gene_name, []) #get a place in the dictionary for this gene
+
+			#first look for the genes in the gtf_genes file
+			genes_family = subprocess.getoutput("grep -i " + gene_name + " " + correlation_file)
+
+			if len(genes_family) != 0:
+				logger.info("working with the gene " + gene_name)
+
+				for genes_line in re.split(r'\n', genes_family):
+					genes_array = re.split(r'\t', genes_line)
+					if genes_array[0] == gene_name:
+						#this is a perfect match <3
+						genes_to_show[gene_name].append(float(genes_array[2])) #append score to the array
+					elif genes_array[0].startswith(gene_name) and len(genes_array[0]) <= len(gene_name) + variable: #the found gene is by default 1 char longer than the gene name
+						if genes_array[0] in genes_to_show.keys():
+							genes_to_show[genes_array[0]].append(float(genes_array[2]))
+						else: #this gene is not yet saved
+							#make a new place in the dictionary for this gene
+							genes_to_show[genes_array[0]] = genes_to_show.get(genes_array[0], [])
+							genes_to_show[genes_array[0]].append(float(genes_array[2]))
 
-	#delete genes without information from the dictionary
-	genes_to_show = dict([(k, v) for k, v in genes_to_show.items() if len(v) > 0])
+			else:
+				logger.info("no similar genes to " + gene_name + " were found")
 
-	return genes_to_show
+		#delete genes without information from the dictionary
+		genes_to_show = dict([(k, v) for k, v in genes_to_show.items() if len(v) > 0])
+		genes_to_show_groups.append(genes_to_show)
+
+	return genes_to_show_groups
 
 def main():
 
 	start = time.time()
 
 	args = parse_args()
 
-	genes_of_interest = check_existing_input_files(args)
+	groups = check_existing_input_files(args)
 
 	#check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user
 	#check_directory(args.output_directory)
 
-	fh = logging.FileHandler("visualize_log.txt")
+	output_directory = find_directory(args)
+
+	fh = logging.FileHandler(os.path.join(output_directory, "visualize.log"))
 	fh.setLevel(logging.INFO)
 	fh.setFormatter(formatter)
 	logger.addHandler(fh)
@@ -191,15 +210,11 @@ def main():
 	ch.setFormatter(formatter)
 	logger.addHandler(ch)
 
-	genes_to_show = read_correlation(args.correlation, genes_of_interest, args.variable)
+	genes_to_show_groups = read_correlation(args.correlation, groups, args.variable)
 
 	logger.info("making a plot")
-	#make_plot(gene_lengths, exons_lengths, gene_names, "./output", "test1.png", 'gold')
-
-	#group1 = ["OTX1", "OTX2", "PITX1", "GSC2", "NFYA", "NFYB", "DUX4"] #important genes in early development
-	#group2 = ["MYCL", "LTBP3", "ERGIC3", "GCM2", "LHX8", "HIF1A", "FLI1"] #not so significant for early dev
 
-	make_box_plot(genes_to_show, "./output", "test2.png", 'navy')
+	make_box_plot(args.groups, genes_to_show_groups, output_directory, "boxplot.png")
 
 	logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2)))