loosolab · renewiegandt · Jan 6, 2019 · Jan 4, 2019 · Jan 4, 2019 · Jan 4, 2019
diff --git a/bin/2.2_motif_estimation/bed_to_fasta.R b/bin/2.2_motif_estimation/bed_to_fasta.R
@@ -1,13 +1,13 @@
 #!/usr/bin/env Rscript
-library("optparse")
+if (!require(optparse)) install.packages("optparse"); library(optparse)
 
 option_list <- list(
   make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"),
   make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"),
   make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer")
 )
 
-opt_parser <- OptionParser(option_list = option_list, 
+opt_parser <- OptionParser(option_list = option_list,
                            description = "Convert BED-file to one FASTA-file per cluster")
 
 opt <- parse_args(opt_parser)
@@ -22,19 +22,19 @@ opt <- parse_args(opt_parser)
 #' @author René Wiegandt
 #' @contact rene.wiegandt(at)mpi-bn.mpg.de
 bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
-  
+
   if (is.null(bedInput)) {
     stop("ERROR: Input parameter cannot be null! Please specify the input parameter.")
   }
 
   bed <- data.table::fread(bedInput, sep = "\t")
-  
+
   # Get last column of data.table, which refers to the cluster, as a vector.
   cluster_no <- as.vector(bed[[ncol(bed)]])
-  
+
   # Split data.table bed on its last column (cluster_no) into list of data.frames
   clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)
-  
+
   # For each data.frame(cluster) in list clusters:
   discard <- lapply(1:length(clusters), function(i){
     clust <- as.data.frame(clusters[i])

diff --git a/bin/2.2_motif_estimation/label_cluster.R b/bin/2.2_motif_estimation/label_cluster.R
@@ -0,0 +1,39 @@
+#!/usr/bin/env Rscript
+if (!require(optparse)) install.packages("optparse"); library(optparse)
+
+option_list <- list(
+  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from tomtom", metavar = "character"),
+  make_option(opt_str = c("-o", "--output"), default = NULL, help = "Output TSV-file.", metavar = "character")
+)
+
+opt_parser <- OptionParser(option_list = option_list, 
+                           description = "Adding Cluster ID to Query_ID Column")
+
+opt <- parse_args(opt_parser)
+
+#' Adding Cluster ID to Query_ID Column
+#' 
+#' @param input <string> TSV-file. Output from tomtom.
+#' @param input <string> Output name.
+#'
+#' @author Ren� Wiegandt
+#' @contact rene.wiegandt(at)mpi-bn.mpg.de
+label_cluster <- function(input, output){
+  #Reading TSV-file
+  tsv <- data.table::fread(input, header = T, sep = "\t")
+
+  #Getting cluster ID/number 
+  splitted_name <- unlist(strsplit(input, "\\_|\\."))
+  cluster_number <- as.numeric(splitted_name[length(splitted_name) - 1]) + 1
+
+  #Adding cluster ID to first column
+  tsv$Query_ID <- paste(tsv$Query_ID, ".", cluster_number,sep = "")
+
+
+  data.table::fwrite(tsv, file = output, sep = "\t", col.names = FALSE)
+}
+
+# run function label_cluster with given parameteres if not in interactive context (e.g. run from shell)
+if (!interactive()) {
+  label_cluster(opt$input, opt$output)
+}
diff --git a/bin/2.2_motif_estimation/merge_similar_clusters.R b/bin/2.2_motif_estimation/merge_similar_clusters.R
@@ -1,61 +1,77 @@
 #!/usr/bin/env Rscript
+if (!require(optparse)) install.packages("optparse"); library(optparse)
 
-# Merging FASTA-files, which motifs are similar.
-#
-# @parameter tsv_in <string> Path to TSV file generated by Tomtom.
-#                            The input for Tomtom is a from all clusters merged meme-file.
-# @parameter file_list <string> Numerically sorted whitespace separated list of absolute fasta-file paths
-# @parameter min_weight <INT> Minimum weight of edge allowed in graph clusters.
-
-
-args = commandArgs(trailingOnly = TRUE)
-
-tsv_in <- args[1]
-file_list <- args[2]
-min_weight <- args[3]
-
-files <- unlist(as.list(strsplit(file_list, ",")))
-
-# split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster.
-tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character')
-query_cluster <- unlist(lapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){
-  tail(l,n=1)
-}))
-target_cluster <- unlist(lapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){
-  tail(l,n=1)
-}))
-
-# create data.table with only the cluster-numbers
-sim_not_unique <- data.table::data.table(query_cluster,target_cluster)
-# convert from character to numeric values
-sim_not_unique[, query_cluster := as.numeric(query_cluster)]
-sim_not_unique[, target_cluster := as.numeric(target_cluster)]
-
-# remove rows if column 1 is idential to column 2
-edgelist <- sim_not_unique[query_cluster != target_cluster]
-
-# create graph from data.frame
-g <- igraph::graph_from_edgelist(as.matrix(edgelist))
-# converting graph to adjacency matrix
-adj_matrix <- igraph::get.adjacency(g, names = T)
-# generating weighted graph from adjacency matrix
-g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)
-
-# get subgraphs from graph with edges of weight > min_weight
-s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight>min_weight], del=F)
-png('motif_clusters.png')
-plot(s1)
-dev.off()
-clust <- igraph::clusters(s1)
-if (clust$no < 1){
-  b <- lapply(files, function(f){
-    system(paste("cat",f,">",basename(f)))
+option_list <- list(
+  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from merged tomtom results", metavar = "character"),
+  make_option(opt_str = c("-l", "--list"), default = NULL, help = "Numerically sorted whitespace separated list of absolute fasta-file paths", metavar = "character"),
+  make_option(opt_str = c("-w", "--min"), default = NULL, help = "Minimum weight of edge allowed in graph clusters.", metavar = "character")
+)
+
+opt_parser <- OptionParser(option_list = option_list,
+                           description = "Adding Cluster ID to Query_ID Column")
+
+opt <- parse_args(opt_parser)
+
+#' Merging FASTA-files, which motifs are similar.
+#'
+#' @parameter tsv_in <string> Path to TSV file generated by Tomtom.
+#'                            The input for Tomtom is a from all clusters merged meme-file.
+#' @parameter file_list <string> Numerically sorted whitespace separated list of absolute fasta-file paths
+#' @parameter min_weight <INT> Minimum weight of edge allowed in graph clusters.
+#'
+#' @author René Wiegandt
+#' @contact rene.wiegandt(at)mpi-bn.mpg.de
+merge_similar <- function(tsv_in, file_list, min_weight){
+
+  files <- unlist(as.list(strsplit(file_list, ",")))
+
+  # split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster.
+  tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character')
+  query_cluster <- unlist(lapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){
+    tail(l,n=1)
+  }))
+  target_cluster <- unlist(lapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){
+    tail(l,n=1)
+  }))
+
+  # create data.table with only the cluster-numbers
+  sim_not_unique <- data.table::data.table(query_cluster,target_cluster)
+  # convert from character to numeric values
+  sim_not_unique[, query_cluster := as.numeric(query_cluster)]
+  sim_not_unique[, target_cluster := as.numeric(target_cluster)]
+
+  # remove rows if column 1 is idential to column 2
+  edgelist <- sim_not_unique[query_cluster != target_cluster]
+
+  # create graph from data.frame
+  g <- igraph::graph_from_edgelist(as.matrix(edgelist))
+  # converting graph to adjacency matrix
+  adj_matrix <- igraph::get.adjacency(g, names = T)
+  # generating weighted graph from adjacency matrix
+  g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)
+
+  # get subgraphs from graph with edges of weight > min_weight
+  s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight>min_weight], del=F)
+  png('motif_clusters.png')
+  plot(s1)
+  dev.off()
+  clust <- igraph::clusters(s1)
+  if (clust$no < 1){
+    b <- lapply(files, function(f){
+      system(paste("cat",f,">",basename(f)))
+    })
+  }
+  # merge FASTA-files depending on the clustered graphs
+  a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
+    cl <- as.vector(which(clust$membership %in% c(i)))
+    fasta_list <- paste(files[cl], collapse = " ")
+    name <- paste0("Cluster_",i,".fasta")
+    system(paste("cat",fasta_list,">",name))
   })
 }
-# merge FASTA-files depending on the clustered graphs
-a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
-  cl <- as.vector(which(clust$membership %in% c(i)))
-  fasta_list <- paste(files[cl], collapse = " ")
-  name <- paste0("Cluster_",i,".fasta")
-  system(paste("cat",fasta_list,">",name))
-})
+
+
+# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell)
+if (!interactive()) {
+  merge_similar(opt$input, opt$list, opt$min)
+}