Skip to content

Commit

Permalink
adding R skripts and modification in find_exons.py and visualize.py
Browse files Browse the repository at this point in the history
  • Loading branch information
anastasiia committed Jan 20, 2019
1 parent b10fba0 commit 4056411
Show file tree
Hide file tree
Showing 6 changed files with 356 additions and 48 deletions.
72 changes: 72 additions & 0 deletions concatenate_rna_info.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
library(data.table)

if (!require(optparse)) install.packages("optparse"); library(optparse)

option_list <- list(
make_option(opt_str = c("-b", "--rna_input"), default = NULL, help = "Input TSV-file with RNA-seq information", metavar = "character"),
make_option(opt_str = c("-t", "--treshold"), default = 10, help = "Input the threshold to cut the baseMeanB", type = "integer")
)

opt_parser <- OptionParser(option_list = option_list,
description = "This script creates a TSV-file with information from the RNA-seq with EnsemblID and B-baseMean",
epilogue = "Author: Anastasiia Petrova <Anastasiia.Petrova@mpi-bn.mpg.de>")

opt <- parse_args(opt_parser)

test_fun <- function(tobias_input, sample_size, rna_input, threshold){
#message(input)

#read the input file as data table
rna_seq_file <- rna_input
rna_seq <- fread(rna_seq_file, header = TRUE, sep = "\t", fill = TRUE)

subset_rna_seq <- rna_seq[rna_seq$`mdux-GFPneg-rna_vs_mdux-GFPpos-rna baseMeanB mdux-GFPpos-rna` > threshold]

#sort the table by the column baseMean descending
rna_seq_sorted <- subset_rna_seq[order(-subset_rna_seq$`mdux-GFPneg-rna_vs_mdux-GFPpos-rna baseMeanB mdux-GFPpos-rna`), ]


ens_ids <- unlist(rna_seq_sorted$`Ensembl gene id`)
base_Means <- unlist(rna_seq_sorted$`mdux-GFPneg-rna_vs_mdux-GFPpos-rna baseMeanB mdux-GFPpos-rna`)

output <- cbind(ens_ids, base_Means)

file_test <- file("test.txt")
write.table(output, file = file_test, row.names = FALSE, col.names = FALSE, quote = FALSE)
close(file_test)


#tobias_results_file <- "./bindetect_results.txt"
#sample_size = 10
#tobias_results_file <- tobias_input
#tobias_results <- fread(tobias_results_file, header = TRUE, sep = "\t", fill = TRUE)

#sort the table by the column mDuxNeg_mDuxPos_change
#tobias_results_sorted <- tobias_results[order(-tobias_results$mDuxNeg_mDuxPos_change), ]

#take top 10 from the tobias results and save only the gene names
#top_10 <- tobias_results_sorted[1:sample_size, ]$TF_name
#c_top_10 <- unlist(top_10, use.names = FALSE)

#concatenate the Jaspar ID
#c_top_10 <- gsub("_.*", "", c_top_10)

#make a subset of the same length as top_10, with random samples, replace = False excludes using one gene twice
#random_10 <- tobias_results_sorted[sample(sample_size + 1:nrow(tobias_results_sorted), sample_size, replace=FALSE), ]$TF_name
#c_random_10 <- unlist(random_10, use.names = FALSE)

#concatenate the Jaspar ID
#c_random_10 <- gsub("_.*", "", c_random_10)


#genes_exons_correlation <- "./genes_exons_correlation.txt"

#genes_exons_table <- fread(genes_exons_correlation, header = TRUE, sep = "\t")

#genes <- genes_exons_table$gene_name
#genes_ids <- unlist(genes_exons_table$gene_id)
}

#delete the help message from the parameter
params <- opt[-length(opt)]
do.call(test_fun, args = params)
70 changes: 70 additions & 0 deletions create_groups.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
library(data.table)

if (!require(optparse)) install.packages("optparse"); library(optparse)

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file from TOBIAS",
metavar = "character"),
make_option(opt_str = c("-s", "--sample_size"), default = 10, help = "Input the size of the sample to test", type = "integer")
)

opt_parser <- OptionParser(option_list = option_list,
description = "This script creates files for visualize.py containing two groups to compare. The first group is the top 10
best genes from the input file, and the other group is a random sampe of genes from the input file.",
epilogue = "Author: Anastasiia Petrova <Anastasiia.Petrova@mpi-bn.mpg.de>")

opt <- parse_args(opt_parser)

test_fun <- function(input, sample_size){
#message(input)
#message(sample_size)

#read the input file as data table
#tobias_results_file <- "./bindetect_results.txt"
tobias_results_file <- input
tobias_results <- fread(tobias_results_file, header = TRUE, sep = "\t", fill = TRUE)

#sort the table by the column mDuxNeg_mDuxPos_change
tobias_results_sorted <- tobias_results[order(-tobias_results$mDuxNeg_mDuxPos_change), ]

#take top 10 from the tobias results and save only the gene names
top_10 <- tobias_results_sorted[1:sample_size, ]$TF_name
c_top_10 <- unlist(top_10, use.names = FALSE)

#concatenate the Jaspar ID
c_top_10 <- gsub("_.*", "", c_top_10)

#make a subset of the same length as top_10, with random samples, replace = False excludes using one gene twice
random_10 <- tobias_results_sorted[sample(sample_size + 1:nrow(tobias_results_sorted), sample_size, replace=FALSE), ]$TF_name
c_random_10 <- unlist(random_10, use.names = FALSE)

#concatenate the Jaspar ID
c_random_10 <- gsub("_.*", "", c_random_10)

#write the top_10 and the random_10 to the txt files
file_top_10 <- file("top_group.txt")
writeLines(c_top_10, con = file_top_10, sep = "\n")
close(file_top_10)
cat("The best ", sample_size, " samples are saved to the file ./top_group.txt \n")

file_random_10 <- file("random_group.txt")
writeLines(c_random_10, con = file_random_10, sep = "\n")
close(file_random_10)
cat("The random ", sample_size, " samples are saved to the file ./random_group.txt \n")

#make the group of bottom genes
tobias_results_sorted_a <- tobias_results[order(tobias_results$mDuxNeg_mDuxPos_change), ]

bottom_10 <- tobias_results_sorted_a[1:sample_size, ]$TF_name
c_bottom_10 <- unlist(bottom_10, use.names = FALSE)
c_bottom_10 <- gsub("_.*", "", c_bottom_10)

file_bottom_10 <- file("bottom_group.txt")
writeLines(c_bottom_10, con = file_bottom_10, sep = "\n")
close(file_bottom_10)
cat("The bottom ", sample_size, " samples are saved to the file ./bottom_group.txt \n")
}

#delete the help message from the parameter
params <- opt[-length(opt)]
do.call(test_fun, args = params)
2 changes: 1 addition & 1 deletion find_exons.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def parse_args():

parser = argparse.ArgumentParser(prog = '', description = textwrap.dedent('''
This script produces plots and a tsv-file to show the number of exons for the list of genes of interest.
This script produces a tsv-file to show the number of exons for the list of genes of interest.
'''), epilog='That is what you need to make this script work for you. Enjoy it')

required_arguments = parser.add_argument_group('required arguments')
Expand Down
12 changes: 10 additions & 2 deletions find_exons.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,17 @@ channels:
- conda-forge

dependencies:
- python
- python >=3
- snakemake
- r-seqinr
- r-base>=3.5.1
- r-data.table
- r-pbapply
- r-igraph
- r-stringi
- r-stringr
- r-optparse
- biopython
- matplotlib
- scipy
- statsmodels
- statsmodels
92 changes: 92 additions & 0 deletions using_rnaseq.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
library(data.table)
library(stringr)
library(stringi)

genes_exons_correlation <- "./genes_exons_correlation.txt"

genes_exons_table <- fread(genes_exons_correlation, header = TRUE, sep = "\t")
View(genes_exons_table)

rnaseq_file <- "./bigmatrix_norm.txt"

#use read.delim as read.table produces warning message EOF within quoted string
rnaseq_table <- fread(rnaseq_file, header = TRUE, sep = "\t", fill = TRUE)
View(rnaseq_table)

tobias_results_file <- "./bindetect_results.txt"
tobias_results <- fread(tobias_results_file, header = TRUE, sep = "\t", fill = TRUE)
View(tobias_results)

#sort the table by the column mDuxNeg_mDuxPos_change
tobias_results_sorted <- tobias_results[order(tobias_results$mDuxNeg_mDuxPos_change), ]
View(tobias_results_sorted)

#testing around
i <- grep("CEBPD_MA0836.1", tobias_results_sorted$TF_name) #the number of the row
tobias_results_sorted[i,] #print this row

sample_size = 10
#take top 10 from the tobias results and save only the gene names
top_10 <- tobias_results_sorted[1:sample_size, ]$TF_name
View(top_10)
c_top_10 <- unlist(top_10, use.names = FALSE)

#make a subset of the same length as top_10, with random samples, replace = False excludes using one gene twice
random_10 <- tobias_results_sorted[sample(sample_size + 1:nrow(tobias_results_sorted), sample_size, replace=FALSE), ]$TF_name
View(random_10)
c_random_10 <- unlist(random_10, use.names = FALSE)

#write the top_10 and the random_10 to the txt files
file_top_10 <- file("top_10.txt")
writeLines(c_top_10, con = file_top_10, sep = "\n")
close(file_top_10)

file_random_10 <- file("random_10.txt")
writeLines(c_random_10, con = file_random_10, sep = "\n")
close(file_random_10)



#apply(genes_exons_table, 1, function(r) any(r %in% c_top_10)) #bebe

#look_for <- grepl("Cphx", genes_exons_table$gene_name)
#look_for

#genes_exons_table[look_for]

#look_for2 <- stri_subset(genes_exons_table$gene_name, regex = c_top_10)

#look_for3 <- apply(outer(genes_exons_table$gene_name, c_top_10, stri_detect), 1, all)
#genes_exons_table[look_for3]

#stri_detect(str = genes_exons_table$gene_name, regex = c_top_10)

small_list <- lapply(c_top_10, function (x){
as.data.table(stri_detect(str = genes_exons_table$gene_name, regex = x))
}) #liste von datatables

?do.call
new_table <- do.call(cbind, small_list) #containing rows with true/false

new_vector <- apply(new_table, MARGIN = 1, any) #margin 1 über die zeilen

any(new_vector)



small_list2 <- lapply(c_top_10, function (y){
as.data.table(grepl(x = genes_exons_table$gene_name, pattern = y, ignore.case = TRUE, perl = TRUE))
})

new_table2 <- do.call(cbind, small_list2) #containing rows with true/false

new_vector2 <- apply(new_table2, MARGIN = 1, any) #margin 1 über die zeilen

any(new_vector2) #false
any(grepl(x = genes_exons_table$gene_name, pattern = "CPHX", ignore.case = TRUE, perl = TRUE)) #true

#------------------




Loading

0 comments on commit 4056411

Please sign in to comment.