the plotting is done

anastasiia · Nov 15, 2018 · 4a31126 · 4a31126
1 parent bd040a2
commit 4a31126
Showing 1 changed file with 65 additions and 48 deletions.
diff --git a/find_exons.py b/find_exons.py
@@ -47,6 +47,7 @@ def parse_args():
 	parser.add_argument('--genes_file', help='names of genes of interest')
 	parser.add_argument('--output_directory',  default='output', const='output', nargs='?', help='output directory, by default ./output/')
 	parser.add_argument('--silent', action='store_true', help='while working with data write the information only into ./call_peaks_log.txt')
+	parser.add_argument('--exact', action='store_true', help='choose this parameter if you want to look for exact gene matches')
 	args = parser.parse_args()
 
 	return args
@@ -106,64 +107,80 @@ def check_existing_input_files(args):
 
 	return args.genes
 
-def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest):
+def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 
 	logger.info("looking for genes of interest in gtf files")
 
+	if exact:
+		grep = "grep -w "
+	else:
+		grep = "grep "
+
 	all_genes = {}
 
 	for gene_name in genes_of_interest:
 		#first look for the genes in the gtf_genes file
-		small_gtf_genes = subprocess.getoutput("grep " + gtf_genes + " -e \"" + gene_name + "\"")#find the first line of the input file
-
-		for genes_line in re.split(r'\n', small_gtf_genes):
-			genes_array = re.split(r'\t', genes_line)
-			gene_start = int(genes_array[3])
-			gene_end = int(genes_array[4])
-			gene_length = gene_end - gene_start
-			attributes = re.split(r'; ', genes_array[8])
-			for attribut in attributes:
-				if attribut.startswith("gene_name"):
-					gene_full_name = re.split(r'\s', attribut)[1].replace('"', '')
+		small_gtf_genes = subprocess.getoutput(grep + gtf_genes + " -e \"" + gene_name + "\"")#find the first line of the input file
+
+		if len(small_gtf_genes) != 0:
+
+			for genes_line in re.split(r'\n', small_gtf_genes):
+				genes_array = re.split(r'\t', genes_line)
+				gene_start = int(genes_array[3])
+				gene_end = int(genes_array[4])
+				gene_length = gene_end - gene_start
+				attributes = re.split(r'; ', genes_array[8])
+				for attribut in attributes:
+					if attribut.startswith("gene_name"):
+						gene_full_name = re.split(r'\s', attribut)[1].replace('"', '')
+
+				all_genes[gene_full_name] = all_genes.get(gene_full_name, {})
+				exons = []
+				#all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons}
+				all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons}
+
+			#find the nubmer and the length of all exons for each gene of interest
+			small_gtf_exons = subprocess.getoutput("grep " + gtf_exons + " -e \"" + gene_name + "\"")
 
-			all_genes[gene_full_name] = all_genes.get(gene_full_name, {})
-			exons = []
-			#all_genes[gene_full_name] = {'gene_start': gene_start, 'gene_end': gene_end, 'gene_length': gene_length, 'exons': exons}
-			all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons}
+			for exons_line in re.split(r'\n', small_gtf_exons):
+				genes_array = re.split(r'\t', exons_line)
+				exon_start = int(genes_array[3])
+				exon_end = int(genes_array[4])
+				exon_length = exon_end - exon_start
+				attributes = re.split(r'; ', genes_array[8])
+				for attribut in attributes:
+					if attribut.startswith("gene_name"):
+						gene_full_name = re.split(r'\s', attribut)[1].replace('"', '')
+					elif attribut.startswith("exon_number"):
+						exon_number = int(re.split(r'\s', attribut)[1])
+					elif attribut.startswith("transcript_support_level"):
+						tsl = re.split(r'\s', attribut)[1].replace('"', '') #do not cast on int, because the tsl can also be NA
+						if tsl == "NA":
+							tsl = 6
+						else:
+							tsl = int(tsl)
+
+				if gene_full_name in all_genes.keys():
+					all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl])
 
-		#find the nubmer and the length of all exons for each gene of interest
-		small_gtf_exons = subprocess.getoutput("grep " + gtf_exons + " -e \"" + gene_name + "\"")
-
-		for exons_line in re.split(r'\n', small_gtf_exons):
-			genes_array = re.split(r'\t', exons_line)
-			exon_start = int(genes_array[3])
-			exon_end = int(genes_array[4])
-			exon_length = exon_end - exon_start
-			attributes = re.split(r'; ', genes_array[8])
-			for attribut in attributes:
-				if attribut.startswith("gene_name"):
-					gene_full_name = re.split(r'\s', attribut)[1].replace('"', '')
-				elif attribut.startswith("exon_number"):
-					exon_number = int(re.split(r'\s', attribut)[1])
-				elif attribut.startswith("transcript_support_level"):
-					tsl = re.split(r'\s', attribut)[1].replace('"', '') #do not cast on int, because the tsl can also be NA
-					if tsl == "NA":
-						tsl = 6
-					else:
-						tsl = int(tsl)
-
-			if gene_full_name in all_genes.keys():
-				all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl])
-
-	return all_genes
+		else:
+			logger.info("The gene " + gene_name + " was not found in the .gtf file with genes")
+
+	if len(all_genes) == 0:
+		print("None of the given list of genes was found in the .gtf file")
+		sys.exit()
+	else:
+		return all_genes
 
 def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color_name):
+
+	the_max = max(max(x_array), max(y_array))
 
-	#plt.xlim(0,1)
-	#plt.ylim(0,1)
+	plt.xlim(0, the_max)
+	plt.ylim(0, the_max)
 
 	fig, ax = plt.subplots()
-	plt.plot([0,1], '--', color = "black", linewidth = 0.5)
+	plt.plot([0, the_max], [0, the_max], '--', color = "black", linewidth = 0.5)
 	plt.plot(x_array, y_array, 'o', color = color_name)
 
 	plt.xlabel("whole gene length")
@@ -209,7 +226,7 @@ def plot_and_output(all_genes, output_directory):
 		scores.append(score)
 
 	logger.info("writing the output file")
-	
+
 	#now write an output file containing names of genes, their length, the sum length of exons and the prozent of exons in the gene length as score
 	output_file = open(os.path.join(output_directory, "genes_exons_correlation.txt"), 'w')
 	header = ["gene_name", "exons_len_percentage", "gene_length", "exons_sum_length"]
@@ -220,8 +237,8 @@ def plot_and_output(all_genes, output_directory):
 
 	output_file.close()
 
-
-
+	logger.info("making a plot")
+	make_plot(gene_lengths, exons_lengths, gene_names, output_directory, "genes_exons_correlation.png", 'gold')
 
 def main():
 
@@ -251,7 +268,7 @@ def main():
 	logger.info("find_exons.py was called using these parameters:")
 	logger.info(vars(args))
 
-	all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes)
+	all_genes = procede_gtf_files(args.gtf_genes, args.gtf_exons, args.genes, args.exact)
 	plot_and_output(all_genes, args.output_directory)
 
 	logger.info("find_exons needed %s seconds to generate the output" % (time.time() - start))