adding R skripts and modification in find_exons.py and visualize.py

anastasiia · Jan 20, 2019 · 4056411 · 4056411
1 parent b10fba0
commit 4056411
Show file tree

Hide file tree

Showing 6 changed files with 356 additions and 48 deletions.
diff --git a/concatenate_rna_info.R b/concatenate_rna_info.R
@@ -0,0 +1,72 @@
+library(data.table)
+
+if (!require(optparse)) install.packages("optparse"); library(optparse)
+
+option_list <- list(
+  make_option(opt_str = c("-b", "--rna_input"), default = NULL, help = "Input TSV-file with RNA-seq information", metavar = "character"),
+  make_option(opt_str = c("-t", "--treshold"), default = 10, help = "Input the threshold to cut the baseMeanB", type = "integer")
+)
+
+opt_parser <- OptionParser(option_list = option_list, 
+                           description = "This script creates a TSV-file with information from the RNA-seq with EnsemblID and B-baseMean",
+                           epilogue = "Author: Anastasiia Petrova <Anastasiia.Petrova@mpi-bn.mpg.de>")
+
+opt <- parse_args(opt_parser)
+
+test_fun <- function(tobias_input, sample_size, rna_input, threshold){
+  #message(input)
+
+  #read the input file as data table
+  rna_seq_file <- rna_input
+  rna_seq <- fread(rna_seq_file, header = TRUE, sep = "\t", fill = TRUE)
+
+  subset_rna_seq <- rna_seq[rna_seq$`mdux-GFPneg-rna_vs_mdux-GFPpos-rna baseMeanB mdux-GFPpos-rna` > threshold]
+
+  #sort the table by the column baseMean descending
+  rna_seq_sorted <- subset_rna_seq[order(-subset_rna_seq$`mdux-GFPneg-rna_vs_mdux-GFPpos-rna baseMeanB mdux-GFPpos-rna`), ]
+
+
+  ens_ids <- unlist(rna_seq_sorted$`Ensembl gene id`)
+  base_Means <- unlist(rna_seq_sorted$`mdux-GFPneg-rna_vs_mdux-GFPpos-rna baseMeanB mdux-GFPpos-rna`)
+
+  output <- cbind(ens_ids, base_Means)
+
+  file_test <- file("test.txt")
+  write.table(output, file = file_test, row.names = FALSE, col.names = FALSE, quote = FALSE)
+  close(file_test)
+
+
+  #tobias_results_file <- "./bindetect_results.txt"
+  #sample_size = 10
+  #tobias_results_file <- tobias_input
+  #tobias_results <- fread(tobias_results_file, header = TRUE, sep = "\t", fill = TRUE)
+
+  #sort the table by the column mDuxNeg_mDuxPos_change
+  #tobias_results_sorted <- tobias_results[order(-tobias_results$mDuxNeg_mDuxPos_change), ]
+
+  #take top 10 from the tobias results and save only the gene names
+  #top_10 <- tobias_results_sorted[1:sample_size, ]$TF_name
+  #c_top_10 <- unlist(top_10, use.names = FALSE)
+
+  #concatenate the Jaspar ID
+  #c_top_10 <- gsub("_.*", "", c_top_10)
+
+  #make a subset of the same length as top_10, with random samples, replace = False excludes using one gene twice
+  #random_10 <- tobias_results_sorted[sample(sample_size + 1:nrow(tobias_results_sorted), sample_size, replace=FALSE), ]$TF_name
+  #c_random_10 <- unlist(random_10, use.names = FALSE)
+
+  #concatenate the Jaspar ID
+  #c_random_10 <- gsub("_.*", "", c_random_10)
+
+
+  #genes_exons_correlation <- "./genes_exons_correlation.txt"
+
+  #genes_exons_table <- fread(genes_exons_correlation, header = TRUE, sep = "\t")
+
+  #genes <- genes_exons_table$gene_name
+  #genes_ids <- unlist(genes_exons_table$gene_id)
+}
+
+#delete the help message from the parameter
+params <- opt[-length(opt)]
+do.call(test_fun, args = params)
diff --git a/create_groups.R b/create_groups.R
@@ -0,0 +1,70 @@
+library(data.table)
+
+if (!require(optparse)) install.packages("optparse"); library(optparse)
+
+option_list <- list(
+  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file from TOBIAS",
+              metavar = "character"),
+  make_option(opt_str = c("-s", "--sample_size"), default = 10, help = "Input the size of the sample to test", type = "integer")
+)
+
+opt_parser <- OptionParser(option_list = option_list, 
+                           description = "This script creates files for visualize.py containing two groups to compare. The first group is the top 10
+                           best genes from the input file, and the other group is a random sampe of genes from the input file.",
+                           epilogue = "Author: Anastasiia Petrova <Anastasiia.Petrova@mpi-bn.mpg.de>")
+
+opt <- parse_args(opt_parser)
+
+test_fun <- function(input, sample_size){
+  #message(input)
+  #message(sample_size)
+
+  #read the input file as data table
+  #tobias_results_file <- "./bindetect_results.txt"
+  tobias_results_file <- input
+  tobias_results <- fread(tobias_results_file, header = TRUE, sep = "\t", fill = TRUE)
+
+  #sort the table by the column mDuxNeg_mDuxPos_change
+  tobias_results_sorted <- tobias_results[order(-tobias_results$mDuxNeg_mDuxPos_change), ]
+
+  #take top 10 from the tobias results and save only the gene names
+  top_10 <- tobias_results_sorted[1:sample_size, ]$TF_name
+  c_top_10 <- unlist(top_10, use.names = FALSE)
+
+  #concatenate the Jaspar ID
+  c_top_10 <- gsub("_.*", "", c_top_10)
+
+  #make a subset of the same length as top_10, with random samples, replace = False excludes using one gene twice
+  random_10 <- tobias_results_sorted[sample(sample_size + 1:nrow(tobias_results_sorted), sample_size, replace=FALSE), ]$TF_name
+  c_random_10 <- unlist(random_10, use.names = FALSE)
+
+  #concatenate the Jaspar ID
+  c_random_10 <- gsub("_.*", "", c_random_10)
+
+  #write the top_10 and the random_10 to the txt files
+  file_top_10 <- file("top_group.txt")
+  writeLines(c_top_10, con = file_top_10, sep = "\n")
+  close(file_top_10)
+  cat("The best ", sample_size, " samples are saved to the file ./top_group.txt \n")
+
+  file_random_10 <- file("random_group.txt")
+  writeLines(c_random_10, con = file_random_10, sep = "\n")
+  close(file_random_10)
+  cat("The random ", sample_size, " samples are saved to the file ./random_group.txt \n")
+
+  #make the group of bottom genes
+  tobias_results_sorted_a <- tobias_results[order(tobias_results$mDuxNeg_mDuxPos_change), ]
+
+  bottom_10 <- tobias_results_sorted_a[1:sample_size, ]$TF_name
+  c_bottom_10 <- unlist(bottom_10, use.names = FALSE)
+  c_bottom_10 <- gsub("_.*", "", c_bottom_10)
+
+  file_bottom_10 <- file("bottom_group.txt")
+  writeLines(c_bottom_10, con = file_bottom_10, sep = "\n")
+  close(file_bottom_10)
+  cat("The bottom ", sample_size, " samples are saved to the file ./bottom_group.txt \n")
+}
+
+#delete the help message from the parameter
+params <- opt[-length(opt)]
+do.call(test_fun, args = params)
diff --git a/find_exons.py b/find_exons.py
@@ -25,7 +25,7 @@ def parse_args():
 
 	parser = argparse.ArgumentParser(prog = '', description = textwrap.dedent('''                           
 
-		This script produces plots and a tsv-file to show the number of exons for the list of genes of interest.
+		This script produces a tsv-file to show the number of exons for the list of genes of interest.
 		'''), epilog='That is what you need to make this script work for you. Enjoy it')
 
 	required_arguments = parser.add_argument_group('required arguments')

diff --git a/find_exons.yaml b/find_exons.yaml
@@ -5,9 +5,17 @@ channels:
   - conda-forge
 
 dependencies:
-  - python
+  - python >=3
   - snakemake
+  - r-seqinr
+  - r-base>=3.5.1
+  - r-data.table
+  - r-pbapply
+  - r-igraph
+  - r-stringi
+  - r-stringr
+  - r-optparse
   - biopython
   - matplotlib
   - scipy
-  - statsmodels
+  - statsmodels
diff --git a/using_rnaseq.R b/using_rnaseq.R
@@ -0,0 +1,92 @@
+library(data.table)
+library(stringr)
+library(stringi)
+
+genes_exons_correlation <- "./genes_exons_correlation.txt"
+
+genes_exons_table <- fread(genes_exons_correlation, header = TRUE, sep = "\t")
+View(genes_exons_table)
+
+rnaseq_file <- "./bigmatrix_norm.txt"
+
+#use read.delim as read.table produces warning message EOF within quoted string
+rnaseq_table <- fread(rnaseq_file, header = TRUE, sep = "\t", fill = TRUE)
+View(rnaseq_table)
+
+tobias_results_file <- "./bindetect_results.txt"
+tobias_results <- fread(tobias_results_file, header = TRUE, sep = "\t", fill = TRUE)
+View(tobias_results)
+
+#sort the table by the column mDuxNeg_mDuxPos_change
+tobias_results_sorted <- tobias_results[order(tobias_results$mDuxNeg_mDuxPos_change), ]
+View(tobias_results_sorted)
+
+#testing around
+i <- grep("CEBPD_MA0836.1", tobias_results_sorted$TF_name) #the number of the row
+tobias_results_sorted[i,] #print this row
+
+sample_size = 10
+#take top 10 from the tobias results and save only the gene names
+top_10 <- tobias_results_sorted[1:sample_size, ]$TF_name
+View(top_10)
+c_top_10 <- unlist(top_10, use.names = FALSE)
+
+#make a subset of the same length as top_10, with random samples, replace = False excludes using one gene twice
+random_10 <- tobias_results_sorted[sample(sample_size + 1:nrow(tobias_results_sorted), sample_size, replace=FALSE), ]$TF_name
+View(random_10)
+c_random_10 <- unlist(random_10, use.names = FALSE)
+
+#write the top_10 and the random_10 to the txt files
+file_top_10 <- file("top_10.txt")
+writeLines(c_top_10, con = file_top_10, sep = "\n")
+close(file_top_10)
+
+file_random_10 <- file("random_10.txt")
+writeLines(c_random_10, con = file_random_10, sep = "\n")
+close(file_random_10)
+
+
+
+#apply(genes_exons_table, 1, function(r) any(r %in% c_top_10)) #bebe
+
+#look_for <- grepl("Cphx", genes_exons_table$gene_name)
+#look_for
+
+#genes_exons_table[look_for]
+
+#look_for2 <- stri_subset(genes_exons_table$gene_name, regex = c_top_10)
+
+#look_for3 <- apply(outer(genes_exons_table$gene_name, c_top_10, stri_detect), 1, all)
+#genes_exons_table[look_for3]
+
+#stri_detect(str = genes_exons_table$gene_name, regex = c_top_10)
+
+small_list <- lapply(c_top_10, function (x){
+  as.data.table(stri_detect(str = genes_exons_table$gene_name, regex = x))
+}) #liste von datatables
+
+?do.call
+new_table <- do.call(cbind, small_list) #containing rows with true/false
+
+new_vector <- apply(new_table, MARGIN = 1, any) #margin 1 über die zeilen
+
+any(new_vector)
+
+
+
+small_list2 <- lapply(c_top_10, function (y){
+  as.data.table(grepl(x = genes_exons_table$gene_name, pattern = y, ignore.case = TRUE, perl = TRUE))
+})
+
+new_table2 <- do.call(cbind, small_list2) #containing rows with true/false
+
+new_vector2 <- apply(new_table2, MARGIN = 1, any) #margin 1 über die zeilen
+
+any(new_vector2) #false
+any(grepl(x = genes_exons_table$gene_name, pattern = "CPHX", ignore.case = TRUE, perl = TRUE)) #true
+
+#------------------
+
+
+
+