# create_groups.R

library(data.table)

if (!require(optparse)) install.packages("optparse"); library(optparse)

# Command-line interface of the script.
#   -i / --input        path to the TOBIAS results table (TSV)
#   -s / --sample_size  number of entries written to each group
# The type of --input is stated explicitly instead of being left to
# inference from its NULL default.
option_list <- list(
  make_option(opt_str = c("-i", "--input"), type = "character", default = NULL,
              help = "Input TSV-file from TOBIAS", metavar = "character"),
  make_option(opt_str = c("-s", "--sample_size"), type = "integer", default = 10,
              help = "Input the size of the sample to test")
)

# The description is assembled with paste() so that no source-code line
# breaks or indentation leak into the printed help text.
opt_parser <- OptionParser(
  option_list = option_list,
  description = paste(
    "This script creates files for visualize.py containing groups to compare:",
    "the best genes from the input file, a random sample of genes from the",
    "input file, and the bottom genes from the input file.",
    "The size of each group is set with --sample_size."
  ),
  epilogue = "Author: Anastasiia Petrova <Anastasiia.Petrova@mpi-bn.mpg.de>"
)

# Parsed options as a named list (plus the automatic "help" entry).
opt <- parse_args(opt_parser)

#' Write the top, random and bottom TF groups of a TOBIAS result table.
#'
#' Reads `input` as a tab-separated table, ranks its rows by the column
#' `mDuxNeg_mDuxPos_change` and writes three files to the working directory:
#' `top_group.txt` (largest values), `random_group.txt` (rows drawn at random,
#' without replacement, from those not in the top group) and
#' `bottom_group.txt` (smallest values). Each written line is a `TF_name`
#' value with everything from the first underscore onwards removed.
#'
#' @param input Path to the tab-separated input file.
#' @param sample_size Number of entries per group; a group is shortened (with
#'   a warning) when the table does not hold enough rows.
#' @return `NULL`, invisibly; the function is called for its side effects.
test_fun <- function(input, sample_size){
  if (length(sample_size) != 1 || is.na(sample_size) || sample_size < 1) {
    stop("sample_size must be a single positive number", call. = FALSE)
  }

  # read the input file as data table
  tobias_results <- fread(input, header = TRUE, sep = "\t", fill = TRUE)

  required_cols <- c("TF_name", "mDuxNeg_mDuxPos_change")
  missing_cols <- setdiff(required_cols, names(tobias_results))
  if (length(missing_cols) > 0) {
    stop("Input file is missing required column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  n_rows <- nrow(tobias_results)
  # Never index past the end of the table: out-of-range rows would come back
  # as NA and be written to the output files as the literal text "NA".
  group_size <- min(sample_size, n_rows)
  if (group_size < sample_size) {
    warning("Input has only ", n_rows, " rows; groups are limited to ",
            group_size, " entries", call. = FALSE)
  }

  change <- tobias_results[["mDuxNeg_mDuxPos_change"]]
  # keep only the part of the name before the first underscore
  tf_ids <- gsub("_.*", "", tobias_results[["TF_name"]])

  descending <- order(-change)
  ascending <- order(change)

  top_ids <- tf_ids[descending[seq_len(group_size)]]
  bottom_ids <- tf_ids[ascending[seq_len(group_size)]]

  # Candidates for the random group are the rows ranked below the top group.
  # The original expression `sample_size + 1:nrow(x)` shifted the whole range
  # 1..nrow by sample_size because `:` binds tighter than `+`.
  pool <- descending[seq_along(descending) > group_size]
  random_size <- min(group_size, length(pool))
  if (random_size < group_size) {
    warning("Only ", length(pool), " rows remain outside the top group; ",
            "the random group is limited to ", random_size, " entries",
            call. = FALSE)
  }
  # sample.int() on positions avoids sample()'s special case for a single
  # number, where it would draw from 1:x instead of from the pool itself.
  random_ids <- tf_ids[pool[sample.int(length(pool), random_size, replace = FALSE)]]

  # Writes one group to `path` and reports how many entries were saved.
  write_group <- function(ids, label, path) {
    writeLines(ids, con = path, sep = "\n")
    cat(paste0("The ", label, " "), length(ids),
        paste0(" samples are saved to the file ./", path, " \n"))
  }

  write_group(top_ids, "best", "top_group.txt")
  write_group(random_ids, "random", "random_group.txt")
  write_group(bottom_ids, "bottom", "bottom_group.txt")

  invisible(NULL)
}

# Fail early with a readable message when the mandatory input is absent;
# otherwise the error only surfaces inside test_fun() as a missing argument.
if (is.null(opt[["input"]])) {
  print_help(opt_parser)
  stop("No input file given: use -i/--input to pass the TOBIAS results table.",
       call. = FALSE)
}

# Forward only the arguments test_fun() accepts, selected by name so the call
# does not depend on where the automatic "help" entry sits in the list.
params <- opt[c("input", "sample_size")]
do.call(test_fun, args = params)
	library(data.table)

	if (!require(optparse)) install.packages("optparse"); library(optparse)

	# NOTE(review): this block repeats the option setup found earlier in this
	# file; it looks like an accidental paste -- confirm and remove one copy.
	# Command-line interface of the script.
	#   -i / --input        path to the TOBIAS results table (TSV)
	#   -s / --sample_size  number of entries written to each group
	# The type of --input is stated explicitly instead of being left to
	# inference from its NULL default.
	option_list <- list(
	  make_option(opt_str = c("-i", "--input"), type = "character", default = NULL,
	              help = "Input TSV-file from TOBIAS", metavar = "character"),
	  make_option(opt_str = c("-s", "--sample_size"), type = "integer", default = 10,
	              help = "Input the size of the sample to test")
	)

	# The description is assembled with paste() so that no source-code line
	# breaks or indentation leak into the printed help text.
	opt_parser <- OptionParser(
	  option_list = option_list,
	  description = paste(
	    "This script creates files for visualize.py containing groups to compare:",
	    "the best genes from the input file, a random sample of genes from the",
	    "input file, and the bottom genes from the input file.",
	    "The size of each group is set with --sample_size."
	  ),
	  epilogue = "Author: Anastasiia Petrova <Anastasiia.Petrova@mpi-bn.mpg.de>"
	)

	# Parsed options as a named list (plus the automatic "help" entry).
	opt <- parse_args(opt_parser)

	# NOTE(review): this definition repeats the test_fun() found earlier in this
	# file; it looks like an accidental paste -- confirm and remove one copy.
	#' Write the top, random and bottom TF groups of a TOBIAS result table.
	#'
	#' Reads `input` as a tab-separated table, ranks its rows by the column
	#' `mDuxNeg_mDuxPos_change` and writes three files to the working directory:
	#' `top_group.txt` (largest values), `random_group.txt` (rows drawn at
	#' random, without replacement, from those not in the top group) and
	#' `bottom_group.txt` (smallest values). Each written line is a `TF_name`
	#' value with everything from the first underscore onwards removed.
	#'
	#' @param input Path to the tab-separated input file.
	#' @param sample_size Number of entries per group; a group is shortened
	#'   (with a warning) when the table does not hold enough rows.
	#' @return `NULL`, invisibly; the function is called for its side effects.
	test_fun <- function(input, sample_size){
	  if (length(sample_size) != 1 || is.na(sample_size) || sample_size < 1) {
	    stop("sample_size must be a single positive number", call. = FALSE)
	  }

	  # read the input file as data table
	  tobias_results <- fread(input, header = TRUE, sep = "\t", fill = TRUE)

	  required_cols <- c("TF_name", "mDuxNeg_mDuxPos_change")
	  missing_cols <- setdiff(required_cols, names(tobias_results))
	  if (length(missing_cols) > 0) {
	    stop("Input file is missing required column(s): ",
	         paste(missing_cols, collapse = ", "), call. = FALSE)
	  }

	  n_rows <- nrow(tobias_results)
	  # Never index past the end of the table: out-of-range rows would come
	  # back as NA and be written to the output files as the literal text "NA".
	  group_size <- min(sample_size, n_rows)
	  if (group_size < sample_size) {
	    warning("Input has only ", n_rows, " rows; groups are limited to ",
	            group_size, " entries", call. = FALSE)
	  }

	  change <- tobias_results[["mDuxNeg_mDuxPos_change"]]
	  # keep only the part of the name before the first underscore
	  tf_ids <- gsub("_.*", "", tobias_results[["TF_name"]])

	  descending <- order(-change)
	  ascending <- order(change)

	  top_ids <- tf_ids[descending[seq_len(group_size)]]
	  bottom_ids <- tf_ids[ascending[seq_len(group_size)]]

	  # Candidates for the random group are the rows ranked below the top
	  # group. The original expression `sample_size + 1:nrow(x)` shifted the
	  # whole range 1..nrow by sample_size because `:` binds tighter than `+`.
	  pool <- descending[seq_along(descending) > group_size]
	  random_size <- min(group_size, length(pool))
	  if (random_size < group_size) {
	    warning("Only ", length(pool), " rows remain outside the top group; ",
	            "the random group is limited to ", random_size, " entries",
	            call. = FALSE)
	  }
	  # sample.int() on positions avoids sample()'s special case for a single
	  # number, where it would draw from 1:x instead of from the pool itself.
	  random_ids <- tf_ids[pool[sample.int(length(pool), random_size, replace = FALSE)]]

	  # Writes one group to `path` and reports how many entries were saved.
	  write_group <- function(ids, label, path) {
	    writeLines(ids, con = path, sep = "\n")
	    cat(paste0("The ", label, " "), length(ids),
	        paste0(" samples are saved to the file ./", path, " \n"))
	  }

	  write_group(top_ids, "best", "top_group.txt")
	  write_group(random_ids, "random", "random_group.txt")
	  write_group(bottom_ids, "bottom", "bottom_group.txt")

	  invisible(NULL)
	}

	# NOTE(review): an identical invocation appears earlier in this file, so
	# the groups are generated twice and the random group is redrawn and
	# overwritten -- this looks like an accidental paste; confirm and remove one.

	# Fail early with a readable message when the mandatory input is absent;
	# otherwise the error only surfaces inside test_fun() as a missing argument.
	if (is.null(opt[["input"]])) {
	  print_help(opt_parser)
	  stop("No input file given: use -i/--input to pass the TOBIAS results table.",
	       call. = FALSE)
	}

	# Forward only the arguments test_fun() accepts, selected by name so the
	# call does not depend on where the automatic "help" entry sits in the list.
	params <- opt[c("input", "sample_size")]
	do.call(test_fun, args = params)