diff --git a/bin/2.2_motif_estimation/merge_similar_clusters.R b/bin/2.2_motif_estimation/merge_similar_clusters.R index 0104d05..d9b2373 100644 --- a/bin/2.2_motif_estimation/merge_similar_clusters.R +++ b/bin/2.2_motif_estimation/merge_similar_clusters.R @@ -1,77 +1,79 @@ -#!/usr/bin/env Rscript -if (!require(optparse)) install.packages("optparse"); library(optparse) - -option_list <- list( - make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from merged tomtom results", metavar = "character"), - make_option(opt_str = c("-l", "--list"), default = NULL, help = "Numerically sorted whitespace separated list of absolute fasta-file paths", metavar = "character"), - make_option(opt_str = c("-w", "--min"), default = NULL, help = "Minimum weight of edge allowed in graph clusters.", metavar = "character") -) - -opt_parser <- OptionParser(option_list = option_list, - description = "Adding Cluster ID to Query_ID Column") - -opt <- parse_args(opt_parser) - -#' Merging FASTA-files, which motifs are similar. -#' -#' @parameter tsv_in Path to TSV file generated by Tomtom. -#' The input for Tomtom is a from all clusters merged meme-file. -#' @parameter file_list Numerically sorted whitespace separated list of absolute fasta-file paths -#' @parameter min_weight Minimum weight of edge allowed in graph clusters. -#' -#' @author René Wiegandt -#' @contact rene.wiegandt(at)mpi-bn.mpg.de -merge_similar <- function(tsv_in, file_list, min_weight){ - - files <- unlist(as.list(strsplit(file_list, ","))) - - # split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster. - tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character') - query_cluster <- unlist(lapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){ - tail(l,n=1) - })) - target_cluster <- unlist(lapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){ - tail(l,n=1) - })) - - # create data.table with only the cluster-numbers - sim_not_unique <- data.table::data.table(query_cluster,target_cluster) - # convert from character to numeric values - sim_not_unique[, query_cluster := as.numeric(query_cluster)] - sim_not_unique[, target_cluster := as.numeric(target_cluster)] - - # remove rows if column 1 is idential to column 2 - edgelist <- sim_not_unique[query_cluster != target_cluster] - - # create graph from data.frame - g <- igraph::graph_from_edgelist(as.matrix(edgelist)) - # converting graph to adjacency matrix - adj_matrix <- igraph::get.adjacency(g, names = T) - # generating weighted graph from adjacency matrix - g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T) - - # get subgraphs from graph with edges of weight > min_weight - s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight>min_weight], del=F) - png('motif_clusters.png') - plot(s1) - dev.off() - clust <- igraph::clusters(s1) - if (clust$no < 1){ - b <- lapply(files, function(f){ - system(paste("cat",f,">",basename(f))) - }) - } - # merge FASTA-files depending on the clustered graphs - a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){ - cl <- as.vector(which(clust$membership %in% c(i))) - fasta_list <- paste(files[cl], collapse = " ") - name <- paste0("Cluster_",i,".fasta") - system(paste("cat",fasta_list,">",name)) - }) -} - - -# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell) -if (!interactive()) { - merge_similar(opt$input, opt$list, opt$min) -} +#!/usr/bin/env Rscript +if (!require(optparse)) install.packages("optparse"); library(optparse) + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from merged tomtom results", metavar = "character"), + make_option(opt_str = c("-l", "--list"), default = NULL, help = "Numerically sorted whitespace separated list of absolute fasta-file paths", metavar = "character"), + make_option(opt_str = c("-w", "--min"), default = NULL, help = "Minimum weight of edge allowed in graph clusters.", metavar = "character") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Adding Cluster ID to Query_ID Column", + epilogue = "Author: Rene Wiegandt ") + +opt <- parse_args(opt_parser) + +#' Merging FASTA-files, which motifs are similar. +#' +#' @parameter tsv_in Path to TSV file generated by Tomtom. +#' The input for Tomtom is a from all clusters merged meme-file. +#' @parameter file_list Numerically sorted comma separated list of absolute fasta-file paths +#' @parameter min_weight Minimum weight of edge allowed in graph clusters. +#' +#' @author René Wiegandt +#' @contact rene.wiegandt(at)mpi-bn.mpg.de +merge_similar <- function(tsv_in, file_list, min_weight){ + + files <- unlist(strsplit(file_list, ",")) + + # split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster. + tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character') + query_cluster <- vapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){ + tail(l, n = 1) + },"") + target_cluster <- vapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){ + tail(l, n = 1) + },"") + + # create data.table with only the cluster-numbers + sim_not_unique <- data.table::data.table(as.numeric(query_cluster),as.numeric(target_cluster)) + + # remove rows if column 1 is identical to column 2 + edgelist <- sim_not_unique[query_cluster != target_cluster] + + # create graph from data.frame + g <- igraph::graph_from_edgelist(as.matrix(edgelist)) + # converting graph to adjacency matrix + adj_matrix <- igraph::get.adjacency(g, names = T) + # generating weighted graph from adjacency matrix + g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T) + + # get subgraphs from graph with edges of weight > min_weight + s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight > min_weight], del = F) + png('motif_clusters.png') + plot(s1) + dev.off() + clust <- igraph::clusters(s1) + if (clust$no < 1) { + b <- lapply(files, function(f){ + system(paste("cat",f,">",basename(f))) + }) + } + # merge FASTA-files depending on the clustered graphs + a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){ + #get vector with cluster ids, which are clustered together in "motif cluster" i + cl <- as.vector(which(clust$membership %in% c(i))) + #create string, which represents a list, containing all FASTA-files to be merged + fasta_list <- paste(files[cl], collapse = " ") + #create name for merged FASTA-file + name <- paste0("Cluster_",i,".fasta") + #merge fasta files + system(paste("cat",fasta_list,">",name)) + }) +} + + +# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell) +if (!interactive()) { + merge_similar(opt$input, opt$list, opt$min) +}