using . after the gene name makes grep to look for occurences of word…

…s that starts with gene names, and not these containing gene names within
anastasiia · Dec 4, 2018 · f8b5f5e · f8b5f5e
1 parent 932097f
commit f8b5f5e
Showing 1 changed file with 35 additions and 27 deletions.
diff --git a/find_exons.py b/find_exons.py
@@ -105,14 +105,17 @@ def check_existing_input_files(args):
 	return genes_of_interest
 
 def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
+	logger.info("working with the gtf files")
 
 	all_genes = {}
 
 	grep = "grep -i "
 
 	#make a command for grep with or function
 	for gene in genes_of_interest:
-		grep = grep + "-e " + gene + " "
+		grep = grep + "-e " + gene + ". " #. makes grep to look for genes starting with the name from the list of genes of interest
+
+	print(grep)
 
 	#finally add to the grep command so that we only look for protein coding
 	small_gtf_genes = subprocess.getoutput(grep + gtf_genes + "| grep \"protein_coding\"")
@@ -134,7 +137,7 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 			all_genes[gene_full_name] = {'gene_length': gene_length, 'exons': exons}
 
 		#now save the information about the exons to the diictionary
-		small_gtf_exons = subprocess.getoutput(grep + gtf_exons + "| grep \"protein_coding\"")
+		small_gtf_exons = subprocess.getoutput(grep + gtf_exons + "| grep \"protein_coding\" | grep \"transcript_support_level\"")
 
 		for exons_line in re.split(r'\n', small_gtf_exons):
 			genes_array = re.split(r'\t', exons_line)
@@ -157,14 +160,17 @@ def procede_gtf_files(gtf_genes, gtf_exons, genes_of_interest, exact):
 					transcript_name = re.split(r'\s', attribut)[1].replace('"', '')
 
 			if gene_full_name in all_genes.keys():
+				print(gene_full_name)
+				#print(all_genes[gene_full_name])
 				all_genes[gene_full_name]['exons'].append([exon_length, exon_number, tsl, exon_start, exon_end, transcript_name])
 	else:
 		logger.info("none of the requested genes were found in the .gtf file")
 		sys.exit()
 
-	logger.info(len(small_gtf_genes))
-	logger.info(len(small_gtf_exons))
-	sys.exit()
+	logger.info(len(re.split(r'\n', small_gtf_genes)))
+	logger.info(len(re.split(r'\n', small_gtf_exons)))
+	logger.info(len(all_genes))
+	#sys.exit()
 
 	return all_genes
 
@@ -191,7 +197,7 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color
 
 def plot_and_output(all_genes, output_directory):
 
-	logger.info("preparing for plotting")
+	#logger.info("preparing for plotting")
 
 	gene_names = []
 	gene_lengths = []
@@ -202,27 +208,29 @@ def plot_and_output(all_genes, output_directory):
 
 	#find the sum length of exons for each gene
 	for gene in all_genes:
-
-		sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[5], x[1], x[2]))
-
-		check_exon = 1
-		sum_length = 0
-
-		transcript_name = sorted_exons[0][5] #save the transcript name of the first exon
-		sum_length = sorted_exons[0][0] #save the length of the best first exon
-
-		for i in range(1, len(sorted_exons)):
-			if sorted_exons[i][5] != transcript_name or i == len(sorted_exons) - 1: #this is a new transcript or the last one
-				#save the previous transcript_name
-				score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)
-				gene_transcript_name = gene + "_" + transcript_name
-				genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {})
-				genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score} #anzahl von exons
-				transcript_name = sorted_exons[i][5]
-				sum_length = sorted_exons[i][0]
-			else: #this is still the same transcript name
-				#save the length of this exon
-				sum_length = sum_length + sorted_exons[i][0]
+		if all_genes[gene]['exons'] != []:
+			sorted_exons = sorted(all_genes[gene]['exons'], key = lambda x: (x[5], x[1], x[2]))
+
+			check_exon = 1
+			sum_length = 0
+
+			print(sorted_exons[0])
+			transcript_name = sorted_exons[0][5] #save the transcript name of the first exon
+			sum_length = sorted_exons[0][0] #save the length of the best first exon
+
+			for i in range(1, len(sorted_exons)):
+				if sorted_exons[i][5] != transcript_name or i == len(sorted_exons) - 1: #this is a new transcript or the last one
+					#save the previous transcript_name
+					score = "%.2f" % round((sum_length * 100)/all_genes[gene]['gene_length'], 2)
+					gene_transcript_name = gene + "_" + transcript_name
+					genes_with_transcript_names[gene_transcript_name] = genes_with_transcript_names.get(transcript_name, {})
+					genes_with_transcript_names[gene_transcript_name] = {'gene': gene, 'sum_length': sum_length, 'gene_length': all_genes[gene]['gene_length'], 'score': score} #anzahl von exons
+					transcript_name = sorted_exons[i][5]
+					sum_length = sorted_exons[i][0]
+				else: #this is still the same transcript name
+					#save the length of this exon
+					sum_length = sum_length + sorted_exons[i][0]
+		#else do nothing, there are no exons found corresponding to this gene
 
 	genes_with_transcript_names = sorted(genes_with_transcript_names.items(), key = lambda x: (x[1]['gene'], x[0]))