Skip to content

Commit

Permalink
Merge pull request #39 from loosolab/estimation_motifs
Browse files Browse the repository at this point in the history
Improved: Motif clustering
  • Loading branch information
renewiegandt authored Jan 6, 2019
2 parents 6d0da12 + 798dd77 commit eb36ef7
Show file tree
Hide file tree
Showing 4 changed files with 216 additions and 84 deletions.
23 changes: 12 additions & 11 deletions bin/2.2_motif_estimation/bed_to_fasta.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
#!/usr/bin/env Rscript
library("optparse")
if (!require(optparse)) install.packages("optparse"); library(optparse)

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"),
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster id.", metavar = "character"),
make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"),
make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer")
)

opt_parser <- OptionParser(option_list = option_list,
description = "Convert BED-file to one FASTA-file per cluster")
opt_parser <- OptionParser(option_list = option_list,
description = "Convert BED-file to one FASTA-file per cluster.",
epilogue = "Author: Rene Wiegandt <Rene.Wiegandt@mpi-bn.mpg.de>")

opt <- parse_args(opt_parser)

#' Splitting BED-files depending on their cluster.
#' The Sequences of each cluster are written as an FASTA-file.
#' The Sequences of each cluster are written as a FASTA-file.
#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns:
#' Sequence: second last column; Cluster ID: last column
#' @param prefix <string> prefix for filenames
Expand All @@ -22,21 +23,21 @@ opt <- parse_args(opt_parser)
#' @author René Wiegandt
#' @contact rene.wiegandt(at)mpi-bn.mpg.de
bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){

if (is.null(bedInput)) {
stop("ERROR: Input parameter cannot be null! Please specify the input parameter.")
}

bed <- data.table::fread(bedInput, sep = "\t")

# Get last column of data.table, which refers to the cluster, as a vector.
cluster_no <- as.vector(bed[[ncol(bed)]])
cluster_no <- bed[[ncol(bed)]]

# Split data.table bed on its last column (cluster_no) into list of data.frames
clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)

# For each data.frame(cluster) in list clusters:
discard <- lapply(1:length(clusters), function(i){
discard <- lapply(seq_len(length(clusters)), function(i){
clust <- as.data.frame(clusters[i])
# Filter data.tables(clusters), which are to small
if (nrow(clust) >= as.numeric(min_seq) ) {
Expand Down
40 changes: 40 additions & 0 deletions bin/2.2_motif_estimation/label_cluster.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env Rscript
if (!require(optparse)) install.packages("optparse"); library(optparse)

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from tomtom", metavar = "character"),
make_option(opt_str = c("-o", "--output"), default = NULL, help = "Output TSV-file.", metavar = "character")
)

opt_parser <- OptionParser(option_list = option_list,
description = "Adding Cluster ID to Query_ID Column.",
epilogue = "Author: Rene Wiegandt <Rene.Wiegandt@mpi-bn.mpg.de>")

opt <- parse_args(opt_parser)

#' Adding Cluster ID to Query_ID Column
#'
#' @param input <string> TSV-file. Output from tomtom.
#' @param input <string> Output name.
#'
#' @author René Wiegandt
#' @contact rene.wiegandt(at)mpi-bn.mpg.de
label_cluster <- function(input, output){
#Reading TSV-file
tsv <- data.table::fread(input, header = T, sep = "\t")

#Getting cluster ID/number
splitted_name <- unlist(strsplit(input, "\\_|\\."))
cluster_number <- as.numeric(splitted_name[length(splitted_name) - 1]) + 1

#Adding cluster ID to first column
tsv$Query_ID <- paste0(tsv$Query_ID, ".", cluster_number)


data.table::fwrite(tsv, file = output, sep = "\t", col.names = FALSE)
}

# run function label_cluster with given parameteres if not in interactive context (e.g. run from shell)
if (!interactive()) {
label_cluster(opt$input, opt$output)
}
140 changes: 79 additions & 61 deletions bin/2.2_motif_estimation/merge_similar_clusters.R
Original file line number Diff line number Diff line change
@@ -1,61 +1,79 @@
#!/usr/bin/env Rscript

# Merging FASTA-files, which motifs are similar.
#
# @parameter tsv_in <string> Path to TSV file generated by Tomtom.
# The input for Tomtom is a from all clusters merged meme-file.
# @parameter file_list <string> Numerically sorted whitespace separated list of absolute fasta-file paths
# @parameter min_weight <INT> Minimum weight of edge allowed in graph clusters.


args = commandArgs(trailingOnly = TRUE)

tsv_in <- args[1]
file_list <- args[2]
min_weight <- args[3]

files <- unlist(as.list(strsplit(file_list, ",")))

# split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster.
tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character')
query_cluster <- unlist(lapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){
tail(l,n=1)
}))
target_cluster <- unlist(lapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){
tail(l,n=1)
}))

# create data.table with only the cluster-numbers
sim_not_unique <- data.table::data.table(query_cluster,target_cluster)
# convert from character to numeric values
sim_not_unique[, query_cluster := as.numeric(query_cluster)]
sim_not_unique[, target_cluster := as.numeric(target_cluster)]

# remove rows if column 1 is idential to column 2
edgelist <- sim_not_unique[query_cluster != target_cluster]

# create graph from data.frame
g <- igraph::graph_from_edgelist(as.matrix(edgelist))
# converting graph to adjacency matrix
adj_matrix <- igraph::get.adjacency(g, names = T)
# generating weighted graph from adjacency matrix
g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)

# get subgraphs from graph with edges of weight > min_weight
s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight>min_weight], del=F)
png('motif_clusters.png')
plot(s1)
dev.off()
clust <- igraph::clusters(s1)
if (clust$no < 1){
b <- lapply(files, function(f){
system(paste("cat",f,">",basename(f)))
})
}
# merge FASTA-files depending on the clustered graphs
a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
cl <- as.vector(which(clust$membership %in% c(i)))
fasta_list <- paste(files[cl], collapse = " ")
name <- paste0("Cluster_",i,".fasta")
system(paste("cat",fasta_list,">",name))
})
#!/usr/bin/env Rscript
if (!require(optparse)) install.packages("optparse"); library(optparse)

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from merged tomtom results", metavar = "character"),
make_option(opt_str = c("-l", "--list"), default = NULL, help = "Numerically sorted whitespace separated list of absolute fasta-file paths", metavar = "character"),
make_option(opt_str = c("-w", "--min"), default = NULL, help = "Minimum weight of edge allowed in graph clusters.", metavar = "character")
)

opt_parser <- OptionParser(option_list = option_list,
description = "Adding Cluster ID to Query_ID Column",
epilogue = "Author: Rene Wiegandt <Rene.Wiegandt@mpi-bn.mpg.de>")

opt <- parse_args(opt_parser)

#' Merging FASTA-files, which motifs are similar.
#'
#' @parameter tsv_in <string> Path to TSV file generated by Tomtom.
#' The input for Tomtom is a from all clusters merged meme-file.
#' @parameter file_list <string> Numerically sorted comma separated list of absolute fasta-file paths
#' @parameter min_weight <INT> Minimum weight of edge allowed in graph clusters.
#'
#' @author René Wiegandt
#' @contact rene.wiegandt(at)mpi-bn.mpg.de
merge_similar <- function(tsv_in, file_list, min_weight){

files <- unlist(strsplit(file_list, ","))

# split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster.
tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character')
query_cluster <- vapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){
tail(l, n = 1)
},"")
target_cluster <- vapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){
tail(l, n = 1)
},"")

# create data.table with only the cluster-numbers
sim_not_unique <- data.table::data.table(as.numeric(query_cluster),as.numeric(target_cluster))

# remove rows if column 1 is identical to column 2
edgelist <- sim_not_unique[query_cluster != target_cluster]

# create graph from data.frame
g <- igraph::graph_from_edgelist(as.matrix(edgelist))
# converting graph to adjacency matrix
adj_matrix <- igraph::get.adjacency(g, names = T)
# generating weighted graph from adjacency matrix
g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)

# get subgraphs from graph with edges of weight > min_weight
s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight > min_weight], del = F)
png('motif_clusters.png')
plot(s1)
dev.off()
clust <- igraph::clusters(s1)
if (clust$no < 1) {
b <- lapply(files, function(f){
system(paste("cat",f,">",basename(f)))
})
}
# merge FASTA-files depending on the clustered graphs
a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
#get vector with cluster ids, which are clustered together in "motif cluster" i
cl <- as.vector(which(clust$membership %in% c(i)))
#create string, which represents a list, containing all FASTA-files to be merged
fasta_list <- paste(files[cl], collapse = " ")
#create name for merged FASTA-file
name <- paste0("Cluster_",i,".fasta")
#merge fasta files
system(paste("cat",fasta_list,">",name))
})
}


# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell)
if (!interactive()) {
merge_similar(opt$input, opt$list, opt$min)
}
Loading

0 comments on commit eb36ef7

Please sign in to comment.