From 4056411fdaa99d4b2d91da5dacc3fc54df0ad4f8 Mon Sep 17 00:00:00 2001
From: anastasiia <anastasiia.petrova@mpi-bn.mpg.de>
Date: Sun, 20 Jan 2019 21:24:59 +0100
Subject: [PATCH] adding R skripts and modification in find_exons.py and
 visualize.py

---
 concatenate_rna_info.R |  72 +++++++++++++++++++
 create_groups.R        |  70 ++++++++++++++++++
 find_exons.py          |   2 +-
 find_exons.yaml        |  12 +++-
 using_rnaseq.R         |  92 ++++++++++++++++++++++++
 visualize.py           | 156 +++++++++++++++++++++++++++++------------
 6 files changed, 356 insertions(+), 48 deletions(-)
 create mode 100644 concatenate_rna_info.R
 create mode 100644 create_groups.R
 create mode 100644 using_rnaseq.R

diff --git a/concatenate_rna_info.R b/concatenate_rna_info.R
new file mode 100644
index 0000000..373bbe0
--- /dev/null
+++ b/concatenate_rna_info.R
@@ -0,0 +1,72 @@
+library(data.table)
+
+if (!require(optparse)) install.packages("optparse"); library(optparse)
+
+option_list <- list(
+  make_option(opt_str = c("-b", "--rna_input"), default = NULL, help = "Input TSV-file with RNA-seq information", metavar = "character"),
+  make_option(opt_str = c("-t", "--treshold"), default = 10, help = "Input the threshold to cut the baseMeanB", type = "integer")
+)
+
+opt_parser <- OptionParser(option_list = option_list, 
+                           description = "This script creates a TSV-file with information from the RNA-seq with EnsemblID and B-baseMean",
+                           epilogue = "Author: Anastasiia Petrova <Anastasiia.Petrova@mpi-bn.mpg.de>")
+
+opt <- parse_args(opt_parser)
+
+test_fun <- function(tobias_input, sample_size, rna_input, threshold){
+  #message(input)
+  
+  #read the input file as data table
+  rna_seq_file <- rna_input
+  rna_seq <- fread(rna_seq_file, header = TRUE, sep = "\t", fill = TRUE)
+  
+  subset_rna_seq <- rna_seq[rna_seq$`mdux-GFPneg-rna_vs_mdux-GFPpos-rna baseMeanB mdux-GFPpos-rna` > threshold]
+  
+  #sort the table by the column baseMean descending
+  rna_seq_sorted <- subset_rna_seq[order(-subset_rna_seq$`mdux-GFPneg-rna_vs_mdux-GFPpos-rna baseMeanB mdux-GFPpos-rna`), ]
+ 
+  
+  ens_ids <- unlist(rna_seq_sorted$`Ensembl gene id`)
+  base_Means <- unlist(rna_seq_sorted$`mdux-GFPneg-rna_vs_mdux-GFPpos-rna baseMeanB mdux-GFPpos-rna`)
+  
+  output <- cbind(ens_ids, base_Means)
+  
+  file_test <- file("test.txt")
+  write.table(output, file = file_test, row.names = FALSE, col.names = FALSE, quote = FALSE)
+  close(file_test)
+  
+  
+  #tobias_results_file <- "./bindetect_results.txt"
+  #sample_size = 10
+  #tobias_results_file <- tobias_input
+  #tobias_results <- fread(tobias_results_file, header = TRUE, sep = "\t", fill = TRUE)
+  
+  #sort the table by the column mDuxNeg_mDuxPos_change
+  #tobias_results_sorted <- tobias_results[order(-tobias_results$mDuxNeg_mDuxPos_change), ]
+  
+  #take top 10 from the tobias results and save only the gene names
+  #top_10 <- tobias_results_sorted[1:sample_size, ]$TF_name
+  #c_top_10 <- unlist(top_10, use.names = FALSE)
+  
+  #concatenate the Jaspar ID
+  #c_top_10 <- gsub("_.*", "", c_top_10)
+  
+  #make a subset of the same length as top_10, with random samples, replace = False excludes using one gene twice
+  #random_10 <- tobias_results_sorted[sample(sample_size + 1:nrow(tobias_results_sorted), sample_size, replace=FALSE), ]$TF_name
+  #c_random_10 <- unlist(random_10, use.names = FALSE)
+  
+  #concatenate the Jaspar ID
+  #c_random_10 <- gsub("_.*", "", c_random_10)
+  
+  
+  #genes_exons_correlation <- "./genes_exons_correlation.txt"
+  
+  #genes_exons_table <- fread(genes_exons_correlation, header = TRUE, sep = "\t")
+  
+  #genes <- genes_exons_table$gene_name
+  #genes_ids <- unlist(genes_exons_table$gene_id)
+}
+
+#delete the help message from the parameter
+params <- opt[-length(opt)]
+do.call(test_fun, args = params)
\ No newline at end of file
diff --git a/create_groups.R b/create_groups.R
new file mode 100644
index 0000000..af21990
--- /dev/null
+++ b/create_groups.R
@@ -0,0 +1,70 @@
+library(data.table)
+
+if (!require(optparse)) install.packages("optparse"); library(optparse)
+
+option_list <- list(
+  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file from TOBIAS",
+              metavar = "character"),
+  make_option(opt_str = c("-s", "--sample_size"), default = 10, help = "Input the size of the sample to test", type = "integer")
+)
+
+opt_parser <- OptionParser(option_list = option_list, 
+                           description = "This script creates files for visualize.py containing two groups to compare. The first group is the top 10
+                           best genes from the input file, and the other group is a random sampe of genes from the input file.",
+                           epilogue = "Author: Anastasiia Petrova <Anastasiia.Petrova@mpi-bn.mpg.de>")
+
+opt <- parse_args(opt_parser)
+
+test_fun <- function(input, sample_size){
+  #message(input)
+  #message(sample_size)
+  
+  #read the input file as data table
+  #tobias_results_file <- "./bindetect_results.txt"
+  tobias_results_file <- input
+  tobias_results <- fread(tobias_results_file, header = TRUE, sep = "\t", fill = TRUE)
+  
+  #sort the table by the column mDuxNeg_mDuxPos_change
+  tobias_results_sorted <- tobias_results[order(-tobias_results$mDuxNeg_mDuxPos_change), ]
+  
+  #take top 10 from the tobias results and save only the gene names
+  top_10 <- tobias_results_sorted[1:sample_size, ]$TF_name
+  c_top_10 <- unlist(top_10, use.names = FALSE)
+  
+  #concatenate the Jaspar ID
+  c_top_10 <- gsub("_.*", "", c_top_10)
+  
+  #make a subset of the same length as top_10, with random samples, replace = False excludes using one gene twice
+  random_10 <- tobias_results_sorted[sample(sample_size + 1:nrow(tobias_results_sorted), sample_size, replace=FALSE), ]$TF_name
+  c_random_10 <- unlist(random_10, use.names = FALSE)
+
+  #concatenate the Jaspar ID
+  c_random_10 <- gsub("_.*", "", c_random_10)
+  
+  #write the top_10 and the random_10 to the txt files
+  file_top_10 <- file("top_group.txt")
+  writeLines(c_top_10, con = file_top_10, sep = "\n")
+  close(file_top_10)
+  cat("The best ", sample_size, " samples are saved to the file ./top_group.txt \n")
+  
+  file_random_10 <- file("random_group.txt")
+  writeLines(c_random_10, con = file_random_10, sep = "\n")
+  close(file_random_10)
+  cat("The random ", sample_size, " samples are saved to the file ./random_group.txt \n")
+  
+  #make the group of bottom genes
+  tobias_results_sorted_a <- tobias_results[order(tobias_results$mDuxNeg_mDuxPos_change), ]
+  
+  bottom_10 <- tobias_results_sorted_a[1:sample_size, ]$TF_name
+  c_bottom_10 <- unlist(bottom_10, use.names = FALSE)
+  c_bottom_10 <- gsub("_.*", "", c_bottom_10)
+  
+  file_bottom_10 <- file("bottom_group.txt")
+  writeLines(c_bottom_10, con = file_bottom_10, sep = "\n")
+  close(file_bottom_10)
+  cat("The bottom ", sample_size, " samples are saved to the file ./bottom_group.txt \n")
+}
+
+#delete the help message from the parameter
+params <- opt[-length(opt)]
+do.call(test_fun, args = params)
\ No newline at end of file
diff --git a/find_exons.py b/find_exons.py
index 5487a29..853ff5c 100644
--- a/find_exons.py
+++ b/find_exons.py
@@ -25,7 +25,7 @@ def parse_args():
 	
 	parser = argparse.ArgumentParser(prog = '', description = textwrap.dedent('''                           
 
-		This script produces plots and a tsv-file to show the number of exons for the list of genes of interest.
+		This script produces a tsv-file to show the number of exons for the list of genes of interest.
 		'''), epilog='That is what you need to make this script work for you. Enjoy it')
 	
 	required_arguments = parser.add_argument_group('required arguments')
diff --git a/find_exons.yaml b/find_exons.yaml
index be3c8ea..ffd9b14 100644
--- a/find_exons.yaml
+++ b/find_exons.yaml
@@ -5,9 +5,17 @@ channels:
   - conda-forge
 
 dependencies:
-  - python
+  - python >=3
   - snakemake
+  - r-seqinr
+  - r-base>=3.5.1
+  - r-data.table
+  - r-pbapply
+  - r-igraph
+  - r-stringi
+  - r-stringr
+  - r-optparse
   - biopython
   - matplotlib
   - scipy
-  - statsmodels
+  - statsmodels
\ No newline at end of file
diff --git a/using_rnaseq.R b/using_rnaseq.R
new file mode 100644
index 0000000..04a4eeb
--- /dev/null
+++ b/using_rnaseq.R
@@ -0,0 +1,92 @@
+library(data.table)
+library(stringr)
+library(stringi)
+
+genes_exons_correlation <- "./genes_exons_correlation.txt"
+
+genes_exons_table <- fread(genes_exons_correlation, header = TRUE, sep = "\t")
+View(genes_exons_table)
+
+rnaseq_file <- "./bigmatrix_norm.txt"
+
+#use read.delim as read.table produces warning message EOF within quoted string
+rnaseq_table <- fread(rnaseq_file, header = TRUE, sep = "\t", fill = TRUE)
+View(rnaseq_table)
+
+tobias_results_file <- "./bindetect_results.txt"
+tobias_results <- fread(tobias_results_file, header = TRUE, sep = "\t", fill = TRUE)
+View(tobias_results)
+
+#sort the table by the column mDuxNeg_mDuxPos_change
+tobias_results_sorted <- tobias_results[order(tobias_results$mDuxNeg_mDuxPos_change), ]
+View(tobias_results_sorted)
+
+#testing around
+i <- grep("CEBPD_MA0836.1", tobias_results_sorted$TF_name) #the number of the row
+tobias_results_sorted[i,] #print this row
+
+sample_size = 10
+#take top 10 from the tobias results and save only the gene names
+top_10 <- tobias_results_sorted[1:sample_size, ]$TF_name
+View(top_10)
+c_top_10 <- unlist(top_10, use.names = FALSE)
+
+#make a subset of the same length as top_10, with random samples, replace = False excludes using one gene twice
+random_10 <- tobias_results_sorted[sample(sample_size + 1:nrow(tobias_results_sorted), sample_size, replace=FALSE), ]$TF_name
+View(random_10)
+c_random_10 <- unlist(random_10, use.names = FALSE)
+
+#write the top_10 and the random_10 to the txt files
+file_top_10 <- file("top_10.txt")
+writeLines(c_top_10, con = file_top_10, sep = "\n")
+close(file_top_10)
+
+file_random_10 <- file("random_10.txt")
+writeLines(c_random_10, con = file_random_10, sep = "\n")
+close(file_random_10)
+
+
+
+#apply(genes_exons_table, 1, function(r) any(r %in% c_top_10)) #bebe
+
+#look_for <- grepl("Cphx", genes_exons_table$gene_name)
+#look_for
+
+#genes_exons_table[look_for]
+
+#look_for2 <- stri_subset(genes_exons_table$gene_name, regex = c_top_10)
+
+#look_for3 <- apply(outer(genes_exons_table$gene_name, c_top_10, stri_detect), 1, all)
+#genes_exons_table[look_for3]
+
+#stri_detect(str = genes_exons_table$gene_name, regex = c_top_10)
+
+small_list <- lapply(c_top_10, function (x){
+  as.data.table(stri_detect(str = genes_exons_table$gene_name, regex = x))
+}) #liste von datatables
+
+?do.call
+new_table <- do.call(cbind, small_list) #containing rows with true/false
+
+new_vector <- apply(new_table, MARGIN = 1, any) #margin 1 über die zeilen
+
+any(new_vector)
+
+
+
+small_list2 <- lapply(c_top_10, function (y){
+  as.data.table(grepl(x = genes_exons_table$gene_name, pattern = y, ignore.case = TRUE, perl = TRUE))
+})
+
+new_table2 <- do.call(cbind, small_list2) #containing rows with true/false
+
+new_vector2 <- apply(new_table2, MARGIN = 1, any) #margin 1 über die zeilen
+
+any(new_vector2) #false
+any(grepl(x = genes_exons_table$gene_name, pattern = "CPHX", ignore.case = TRUE, perl = TRUE)) #true
+
+#------------------
+
+
+
+
diff --git a/visualize.py b/visualize.py
index 32ddc02..5e89eff 100644
--- a/visualize.py
+++ b/visualize.py
@@ -37,18 +37,20 @@ def parse_args():
 	
 	required_arguments = parser.add_argument_group('required arguments')
 	required_arguments.add_argument('--correlation', help='a .txt file with correlation between genes length and the length of exons', required=True)
-	#required_arguments.add_argument('--genes_of_interest', nargs='*', dest='genes_of_interest', help='a .txt file or a list with genes one is interested in', required=True)
-	required_arguments.add_argument('--groups', nargs='*', dest='groups', help='enter one or more files containing groups of genes you are interested in, the genes should be separated with tabs or printed in one column')
-	
 	#all other arguments are optional
 	parser.add_argument('--variable', type=int, default=1, help='enter how many chars in the name of a gene can be different from the name of gene you are searching for')
+	parser.add_argument('--groups', nargs='*', dest='groups', help='enter one or more files containing groups of genes you are interested in, the genes should be separated with tabs or printed in one column')
+	parser.add_argument('--rna_seq', help='enter the file with rna seq data')
+	parser.add_argument('--bindetect', help='enter the file with results from TOBIAS')
+	parser.add_argument('--group_size', type=int, default=10, help='if the file from TOBIAS was provided, enter the size of groups to create, by default 10')
+	parser.add_argument('--output', help='enter the name of the output file, by default ./boxplot', default='./boxplot')
 
 	args = parser.parse_args()
 
 	return args
 
 def find_directory(args):
-	return os.path.splitext(os.path.dirname(args.correlation))[0]
+	return os.path.splitext(os.path.dirname(args.output))[0]
 
 def get_name_from_path(full_path):
 	return os.path.splitext(os.path.basename(full_path))[0]
@@ -61,28 +63,60 @@ def check_existing_input_files(args):
 		print('please make sure the .txt file with genes-exons correlation exists')
 		sys.exit()
 
+	if args.rna_seq and not os.path.isfile(args.rna_seq):
+		print('please make sure the file with RNA-seq data exists')
+		sys.exit()
+
 	groups = []
-	for group_file in args.groups:
-		if not os.path.isfile(group_file):
-			print('the path for the input file ' + str(group_file) + ' could not be opened')
+	if args.bindetect:
+		if not os.path.isfile(args.bindetect):
+			print('please make sure the file with information from TOBIAS exists')
+			sys.exit()
+	else:
+		if not args.groups:
+			print('please provide your own groups for visualisation or the file from TOBIAS to make the groups')
 			sys.exit()
 		else:
-			#group_name = "group_" + str(args.groups.index(group_file) + 1)
-			one_group = []
-			with open(group_file) as opened_group_file:
-				for line in opened_group_file:
-					line_array = re.split(r'\t', line.rstrip('\n'))
-					for gene in line_array:
-						if ' ' in gene:
-							print('please make sure that genes in the file ' + str(group_file) + ' are separated with tabs or are written each on a new line')
-							sys.exit()
-						else:
-							one_group.append(gene)
-			opened_group_file.close()
-			groups.append(one_group)
+			for group_file in args.groups:
+				if not os.path.isfile(group_file):
+					print('the path for the input file ' + str(group_file) + ' could not be opened')
+					sys.exit()
+				else:
+					#group_name = "group_" + str(args.groups.index(group_file) + 1)
+					one_group = []
+					with open(group_file) as opened_group_file:
+						for line in opened_group_file:
+							line_array = re.split(r'\t', line.rstrip('\n'))
+							for gene in line_array:
+								if ' ' in gene:
+									print('please make sure that genes in the file ' + str(group_file) + ' are separated with tabs or are written each on a new line')
+									sys.exit()
+								else:
+									one_group.append(gene)
+					opened_group_file.close()
+					groups.append(one_group)
 
 	return groups
 
+def compare_with_each(groups, groups_names):
+	if len(groups) >= 2:
+		start = 1
+		for group in groups:
+			for i in range(start, len(groups)):
+				make_ttest(group, groups[i], groups_names[groups.index(group)], groups_names[i])
+			start += 1
+	else:
+		logger.info("There is nothing to compare!")
+
+def make_ttest(group1, group2, group1_name, group2_name):
+	stat_score, p_value = stats.ttest_ind(group1, group2) #, equal_var = False)
+	if p_value <= 0.05:
+		logger.info("The average of " + str(group1_name) + " is NOT EQUAL with " + str(group2_name))
+		logger.info("Ttest p-value is " + str(p_value))
+	else:
+		logger.info("The average of " + str(group1_name) + " is EQUAL with " + str(group2_name))
+		logger.info("Ttest p-value is " + str(p_value))
+
 def make_box_plot(groups_files_names, groups_genes_of_interest, output_directory, figure_name):
 
 	groups_names = []
@@ -104,6 +138,10 @@ def make_box_plot(groups_files_names, groups_genes_of_interest, output_directory
 
 		data.append(group_data)
 
+	logger.info("Making a statistical test")
+
+	compare_with_each(data, groups_names)
+
 	fig, ax = plt.subplots(figsize=(10,10))
 
 	bp = ax.boxplot(data, notch=False, sym='o', vert=True, whis=1.5, patch_artist=True)
@@ -122,6 +160,7 @@ def make_box_plot(groups_files_names, groups_genes_of_interest, output_directory
 	ax.set_xticklabels(groups_names, rotation=45, fontsize=8)
 
 	fig.savefig(os.path.join(output_directory, figure_name))
+	logger.info("saving the plot to the file " + str(os.path.join(output_directory, figure_name)))
 
 	plt.show()
 
@@ -146,9 +185,9 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color
 	
 	#plt.show()
 
-def read_correlation(correlation_file, groups, variable):
-	#read the text file and make a dictionary containing genes with array of their scores
+def read_correlation(correlation_file, groups, variable, rna_dict):
 
+	#read the text file and make a dictionary containing genes with array of their scores
 	genes_to_show_groups = []
 
 	for genes_of_interest in groups:
@@ -162,29 +201,31 @@ def read_correlation(correlation_file, groups, variable):
 			
 			#first look for the genes in the gtf_genes file
 			genes_family = subprocess.getoutput("grep -i " + gene_name + " " + correlation_file)
-			print(genes_family)
-
+		
 			if len(genes_family) != 0:
 				logger.info("working with the gene " + gene_name)
 
-				#the score is in [3], the tsl is in [2]
+				#the score is in [3], the tsl is in [2], the ensemle_gene_id is in [7]
 
 				for genes_line in re.split(r'\n', genes_family):
 					genes_array = re.split(r'\t', genes_line)
-
-					check_gene_name = genes_array[0].lower()
-
-					print(genes_array)
-					if check_gene_name == orig_gene_name:
-						#this is a perfect match <3
-						genes_to_show[check_gene_name].append(float(genes_array[3])) #append score to the array
-					elif check_gene_name.startswith(orig_gene_name) and len(genes_array[0]) <= len(gene_name) + variable: #the found gene is by default 1 char longer than the gene name
-						if check_gene_name in genes_to_show.keys():
-							genes_to_show[check_gene_name].append(float(genes_array[3]))
-						else: #this gene is not yet saved
-							#make a new place in the dictionary for this gene
-							genes_to_show[check_gene_name] = genes_to_show.get(check_gene_name, [])
-							genes_to_show[check_gene_name].append(float(genes_array[3]))
+					
+					if genes_array[7] in rna_dict.keys():
+					#if this geneis is saved in the rnaseq data
+						check_gene_name = genes_array[0].lower()
+
+						if check_gene_name == orig_gene_name:
+							#this is a perfect match <3
+							genes_to_show[check_gene_name].append(float(genes_array[3])) #append score to the array
+						elif check_gene_name.startswith(orig_gene_name) and len(genes_array[0]) <= len(gene_name) + variable: #the found gene is by default 1 char longer than the gene name
+							if check_gene_name in genes_to_show.keys():
+								genes_to_show[check_gene_name].append(float(genes_array[3]))
+							else: #this gene is not yet saved
+								#make a new place in the dictionary for this gene
+								genes_to_show[check_gene_name] = genes_to_show.get(check_gene_name, [])
+								genes_to_show[check_gene_name].append(float(genes_array[3]))
+					else:
+						logger.info(gene_name + " has no corresponding information from the RNA-seq data and will not be considered while making a plot")
 
 			else:
 				logger.info("no similar genes to " + gene_name + " were found")
@@ -193,9 +234,29 @@ def read_correlation(correlation_file, groups, variable):
 		genes_to_show = dict([(k, v) for k, v in genes_to_show.items() if len(v) > 0])
 		genes_to_show_groups.append(genes_to_show)
 
-	print(genes_to_show_groups)
 	return genes_to_show_groups
 
+def read_bindetect(output_directory, bindetect, group_size):
+	#first sort the file after the column mDuxNeg_mDuxPos_change
+	#use the unix sort sort -k5nr
+	logger.info("sorting the bindetect file after the column mDuxNeg_mDuxPos_change")
+	sorted_bindetect = os.path.join(output_directory, "sorted_" + os.path.splitext(os.path.basename(bindetect))[0] + ".txt")
+	os.system("sort -k5nr " + bindetect + " > " + sorted_bindetect)
+	logger.info("sorted file is " + str(sorted_bindetect))
+
+def read_rna_seq(rna_seq):
+	rna_dict = {}
+
+	with open(rna_seq) as read_file:
+		for line in read_file:
+			line_array = re.split(r'\s', line.rstrip('\n'))
+			#ensid is in the first column, the basemean is in the second
+			rna_dict[line_array[0]] = line_array[1]
+
+		read_file.close()
+
+	return rna_dict
+
 def main():
 
 	start = time.time()
@@ -203,12 +264,11 @@ def main():
 	args = parse_args()
 
 	groups = check_existing_input_files(args)
+	output_directory = find_directory(args)
 
 	#check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user
 	#check_directory(args.output_directory)
 
-	output_directory = find_directory(args)
-
 	fh = logging.FileHandler(os.path.join(output_directory, "visualize.log"))
 	fh.setLevel(logging.INFO)
 	fh.setFormatter(formatter)
@@ -219,13 +279,19 @@ def main():
 	ch.setFormatter(formatter)
 	logger.addHandler(ch)
 
-	genes_to_show_groups = read_correlation(args.correlation, groups, args.variable)
+	rna_dict = {}
+	if args.rna_seq:
+		rna_dict =  read_rna_seq(args.rna_seq)
+
+	#read_bindetect(output_directory, args.bindetect, args.group_size)
+
+	genes_to_show_groups = read_correlation(args.correlation, groups, args.variable, rna_dict)
 
 	logger.info("making a plot")
 
-	make_box_plot(args.groups, genes_to_show_groups, output_directory, "boxplot.png")
+	make_box_plot(args.groups, genes_to_show_groups, output_directory, get_name_from_path(args.output))
 
-	logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2)))
+	logger.info("visualize.py needed %s seconds to generate the output" % (round(time.time() - start, 2)))
 	
 	for handler in logger.handlers:
 		handler.close()