From e38a469124e87c8d551cf5574cd7d7858ad6b0fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Fri, 4 Jan 2019 11:17:30 -0500 Subject: [PATCH 1/6] Added check for optparse to bed_to_fasta.R --- bin/2.2_motif_estimation/bed_to_fasta.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bin/2.2_motif_estimation/bed_to_fasta.R b/bin/2.2_motif_estimation/bed_to_fasta.R index e0ade14..8006e64 100644 --- a/bin/2.2_motif_estimation/bed_to_fasta.R +++ b/bin/2.2_motif_estimation/bed_to_fasta.R @@ -1,5 +1,5 @@ #!/usr/bin/env Rscript -library("optparse") +if (!require(optparse)) install.packages("optparse"); library(optparse) option_list <- list( make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"), @@ -7,7 +7,7 @@ option_list <- list( make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer") ) -opt_parser <- OptionParser(option_list = option_list, +opt_parser <- OptionParser(option_list = option_list, description = "Convert BED-file to one FASTA-file per cluster") opt <- parse_args(opt_parser) @@ -22,19 +22,19 @@ opt <- parse_args(opt_parser) #' @author René Wiegandt #' @contact rene.wiegandt(at)mpi-bn.mpg.de bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ - + if (is.null(bedInput)) { stop("ERROR: Input parameter cannot be null! Please specify the input parameter.") } bed <- data.table::fread(bedInput, sep = "\t") - + # Get last column of data.table, which refers to the cluster, as a vector. cluster_no <- as.vector(bed[[ncol(bed)]]) - + # Split data.table bed on its last column (cluster_no) into list of data.frames clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) - + # For each data.frame(cluster) in list clusters: discard <- lapply(1:length(clusters), function(i){ clust <- as.data.frame(clusters[i]) From 8b281499f1519abe3f63911095a64121545058d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Fri, 4 Jan 2019 15:47:05 -0500 Subject: [PATCH 2/6] improved motif clustering --- bin/2.2_motif_estimation/label_cluster.R | 39 ++++++++++ pipeline.nf | 90 +++++++++++++++++++++--- 2 files changed, 119 insertions(+), 10 deletions(-) create mode 100644 bin/2.2_motif_estimation/label_cluster.R diff --git a/bin/2.2_motif_estimation/label_cluster.R b/bin/2.2_motif_estimation/label_cluster.R new file mode 100644 index 0000000..b00a97a --- /dev/null +++ b/bin/2.2_motif_estimation/label_cluster.R @@ -0,0 +1,39 @@ +#!/usr/bin/env Rscript +if (!require(optparse)) install.packages("optparse"); library(optparse) + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from tomtom", metavar = "character"), + make_option(opt_str = c("-o", "--output"), default = NULL, help = "Output TSV-file.", metavar = "character") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Adding Cluster ID to Query_ID Column") + +opt <- parse_args(opt_parser) + +#' Adding Cluster ID to Query_ID Column +#' +#' @param input TSV-file. Output from tomtom. +#' @param input Output name. +#' +#' @author René Wiegandt +#' @contact rene.wiegandt(at)mpi-bn.mpg.de +label_cluster <- function(input, output){ + #Reading TSV-file + tsv <- data.table::fread(input, header = T, sep = "\t") + + #Getting cluster ID/number + splitted_name <- unlist(strsplit(input, "\\_|\\.")) + cluster_number <- as.numeric(splitted_name[length(splitted_name) - 1]) + 1 + + #Adding cluster ID to first column + tsv$Query_ID <- paste(tsv$Query_ID, ".", cluster_number,sep = "") + + + data.table::fwrite(tsv, file = output, sep = "\t", col.names = FALSE) +} + +# run function label_cluster with given parameteres if not in interactive context (e.g. run from shell) +if (!interactive()) { + label_cluster(opt$input, opt$output) +} \ No newline at end of file diff --git a/pipeline.nf b/pipeline.nf index 1f2f379..b4b7537 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -315,6 +315,7 @@ process clustering { Converting BED-File to one FASTA-File per cluster */ process bed_to_clustered_fasta { + conda "${path_env}" publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy' tag{name} @@ -345,14 +346,16 @@ Generating Motifs through alignment and scoring best local matches. process glam2 { tag{name} - publishDir "${params.out}/2.2_motif_estimation/glam2/${name}/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/glam2/", mode: 'copy' + conda "${path_env}" input: set name, file (fasta) from fasta_for_glam2 - output: + output: file("${name}/*.meme") into meme_to_merge set name, file("${name}/*.meme") into meme_for_tomtom + set name, file("${name}/*.meme") into meme_to_search_in_merged set name, file("${name}/*.meme") into meme_for_filter file ('*') @@ -364,11 +367,12 @@ process glam2 { /* Combining all MEME-files to one big MEME-file. -The paths are sorted numerically depending on the cluster number. +The paths are sorted numerically depending on the cluster ID. */ process merge_meme { publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_meme/", mode: 'copy' + conda "${path_env}" input: val (memelist) from meme_to_merge.toList() @@ -380,48 +384,104 @@ process merge_meme { params.cluster_motif == 1 script: + //sorting memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") } meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer } meme_sorted_full = meme_sorted.collect {it.toString() + "/glam2.meme"} + //create list for bash meme_list = meme_sorted_full.toString().replaceAll(/\,|\[|\]/,"") """ meme2meme ${meme_list} > merged_meme.meme """ } +to_find_similar_motifs = meme_to_search_in_merged.combine(merged_meme) + /* Running Tomtom on merged meme-files. Output table has the information which clusters are similar to each other. */ process find_similar_motifs { + tag{name} publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/cluster_similarity/", mode: 'copy' + conda "${path_env}" + + input: + set name, file (meme) ,file (merged_meme) from to_find_similar_motifs + + output: + set name, file ("${name}.tsv") into motif_similarity + + when: + params.cluster_motif == 1 + + script: + """ + tomtom ${meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > ${name}.tsv + """ +} + +/* +Label first column of TSV-file with Cluster ID +*/ +process label_cluster { + + tag{name} + conda "${path_env}" + input: - file (merged_meme) from merged_meme + set name, file (tsv) from motif_similarity output: - file ('all_to_all.tsv') into motif_similarity + file ("${name}_labeled.tsv") into labeled_tsv when: params.cluster_motif == 1 script: """ - tomtom ${merged_meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > all_to_all.tsv + Rscript ${path_bin}/2.2_motif_estimation/label_cluster.R -i ${tsv} -o ${name}_labeled.tsv """ } +/* +Merging tsv files_for_merge_fasta +*/ +process merge_labeled_tsv { + + publishDir "${params.out}/esimated_motifs/", mode: 'copy' + + input: + val (tsvlist) from labeled_tsv.toSortedList { it.toString().tokenize('_')[-2] as Integer } + + output: + file ('*.tsv') into merged_labeled_tsv + + when: + params.cluster_motif == 1 + + script: + tsvs = tsvlist.toString().replaceAll(/\,|\[|\]/,"") + """ + echo -e 'Query_ID\tTarget_ID\tOptimal_offset\tp-value\tE-value\tq-value\tOverlap\tQuery_consensus\tTarget_consensus\tOrientation'> merged_labeled.tsv + for i in $tsvs; do + cat \$i >> merged_labeled.tsv + done + """ +} -files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster) +files_for_merge_fasta = merged_labeled_tsv.combine(fasta_for_motif_cluster) /* Merging FASTA-files of similar clusters */ process merge_fasta { + conda "${path_env}" publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_fasta/", mode: 'copy' - echo true + input: set file (motiv_sim), val (fasta_list) from files_for_merge_fasta @@ -443,10 +503,14 @@ process merge_fasta { motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten() - +/* +Run GLAM2 on emrged FASTA-files +*/ process clustered_glam2 { publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/glam2/${name}/", mode: 'copy' + tag{name} + conda "${path_env}" input: file (fasta) from motif_clustered_fasta_flat @@ -466,6 +530,10 @@ process clustered_glam2 { """ } +/* +Forward files depending on set parameter +If motif clustering is activated or not. +*/ if(params.cluster_motif == 1){ for_tomtom = clustered_meme_for_tomtom for_filter = clustered_meme_for_filter @@ -483,6 +551,7 @@ process tomtom { tag{name} publishDir "${params.out}/2.2_motif_estimation/tomtom/", mode: 'copy' + conda "${path_env}" input: set name, file (meme) from for_tomtom @@ -527,8 +596,9 @@ process check_for_unknown_motifs { //Get the best(first) Motif from each MEME-file process get_best_motif { - conda "${path_env}" + conda "${path_env}" + tag{name} publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/", mode: 'copy' input: From 2064e527e26bb5b4843e2decda5f3ff172e8463f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Sat, 5 Jan 2019 11:15:16 -0500 Subject: [PATCH 3/6] imporved structure of merge_similar_clusters.R --- .../merge_similar_clusters.R | 130 ++++++++++-------- pipeline.nf | 9 +- 2 files changed, 79 insertions(+), 60 deletions(-) diff --git a/bin/2.2_motif_estimation/merge_similar_clusters.R b/bin/2.2_motif_estimation/merge_similar_clusters.R index 03b8cf1..0104d05 100644 --- a/bin/2.2_motif_estimation/merge_similar_clusters.R +++ b/bin/2.2_motif_estimation/merge_similar_clusters.R @@ -1,61 +1,77 @@ #!/usr/bin/env Rscript +if (!require(optparse)) install.packages("optparse"); library(optparse) -# Merging FASTA-files, which motifs are similar. -# -# @parameter tsv_in Path to TSV file generated by Tomtom. -# The input for Tomtom is a from all clusters merged meme-file. -# @parameter file_list Numerically sorted whitespace separated list of absolute fasta-file paths -# @parameter min_weight Minimum weight of edge allowed in graph clusters. - - -args = commandArgs(trailingOnly = TRUE) - -tsv_in <- args[1] -file_list <- args[2] -min_weight <- args[3] - -files <- unlist(as.list(strsplit(file_list, ","))) - -# split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster. -tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character') -query_cluster <- unlist(lapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){ - tail(l,n=1) -})) -target_cluster <- unlist(lapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){ - tail(l,n=1) -})) - -# create data.table with only the cluster-numbers -sim_not_unique <- data.table::data.table(query_cluster,target_cluster) -# convert from character to numeric values -sim_not_unique[, query_cluster := as.numeric(query_cluster)] -sim_not_unique[, target_cluster := as.numeric(target_cluster)] - -# remove rows if column 1 is idential to column 2 -edgelist <- sim_not_unique[query_cluster != target_cluster] - -# create graph from data.frame -g <- igraph::graph_from_edgelist(as.matrix(edgelist)) -# converting graph to adjacency matrix -adj_matrix <- igraph::get.adjacency(g, names = T) -# generating weighted graph from adjacency matrix -g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T) - -# get subgraphs from graph with edges of weight > min_weight -s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight>min_weight], del=F) -png('motif_clusters.png') -plot(s1) -dev.off() -clust <- igraph::clusters(s1) -if (clust$no < 1){ - b <- lapply(files, function(f){ - system(paste("cat",f,">",basename(f))) +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from merged tomtom results", metavar = "character"), + make_option(opt_str = c("-l", "--list"), default = NULL, help = "Numerically sorted whitespace separated list of absolute fasta-file paths", metavar = "character"), + make_option(opt_str = c("-w", "--min"), default = NULL, help = "Minimum weight of edge allowed in graph clusters.", metavar = "character") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Adding Cluster ID to Query_ID Column") + +opt <- parse_args(opt_parser) + +#' Merging FASTA-files, which motifs are similar. +#' +#' @parameter tsv_in Path to TSV file generated by Tomtom. +#' The input for Tomtom is a from all clusters merged meme-file. +#' @parameter file_list Numerically sorted whitespace separated list of absolute fasta-file paths +#' @parameter min_weight Minimum weight of edge allowed in graph clusters. +#' +#' @author René Wiegandt +#' @contact rene.wiegandt(at)mpi-bn.mpg.de +merge_similar <- function(tsv_in, file_list, min_weight){ + + files <- unlist(as.list(strsplit(file_list, ","))) + + # split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster. + tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character') + query_cluster <- unlist(lapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){ + tail(l,n=1) + })) + target_cluster <- unlist(lapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){ + tail(l,n=1) + })) + + # create data.table with only the cluster-numbers + sim_not_unique <- data.table::data.table(query_cluster,target_cluster) + # convert from character to numeric values + sim_not_unique[, query_cluster := as.numeric(query_cluster)] + sim_not_unique[, target_cluster := as.numeric(target_cluster)] + + # remove rows if column 1 is idential to column 2 + edgelist <- sim_not_unique[query_cluster != target_cluster] + + # create graph from data.frame + g <- igraph::graph_from_edgelist(as.matrix(edgelist)) + # converting graph to adjacency matrix + adj_matrix <- igraph::get.adjacency(g, names = T) + # generating weighted graph from adjacency matrix + g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T) + + # get subgraphs from graph with edges of weight > min_weight + s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight>min_weight], del=F) + png('motif_clusters.png') + plot(s1) + dev.off() + clust <- igraph::clusters(s1) + if (clust$no < 1){ + b <- lapply(files, function(f){ + system(paste("cat",f,">",basename(f))) + }) + } + # merge FASTA-files depending on the clustered graphs + a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){ + cl <- as.vector(which(clust$membership %in% c(i))) + fasta_list <- paste(files[cl], collapse = " ") + name <- paste0("Cluster_",i,".fasta") + system(paste("cat",fasta_list,">",name)) }) } -# merge FASTA-files depending on the clustered graphs -a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){ - cl <- as.vector(which(clust$membership %in% c(i))) - fasta_list <- paste(files[cl], collapse = " ") - name <- paste0("Cluster_",i,".fasta") - system(paste("cat",fasta_list,">",name)) -}) + + +# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell) +if (!interactive()) { + merge_similar(opt$input, opt$list, opt$min) +} diff --git a/pipeline.nf b/pipeline.nf index b4b7537..8358a68 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -251,7 +251,7 @@ if(params.tfbs_path == "") { */ process overlap_with_known_TFBS { conda "${path_env}" - + echo true publishDir "${params.out}/1.2_filter_motifs/compareBed/", mode :'copy' input: @@ -263,7 +263,10 @@ process overlap_with_known_TFBS { script: motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"") """ - ${path_bin}/1.2_filter_motifs/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}_unknown.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin}/1.2/filter_motifs + echo ${bed_footprints} + echo ${motif_path} + echo ${fasta} + ${path_bin}/1.2_filter_motifs/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}_unknown.bed -min ${params.min_size_fp} -max ${params.max_size_fp} """ } @@ -450,7 +453,7 @@ Merging tsv files_for_merge_fasta */ process merge_labeled_tsv { - publishDir "${params.out}/esimated_motifs/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/", mode: 'copy' input: val (tsvlist) from labeled_tsv.toSortedList { it.toString().tokenize('_')[-2] as Integer } From f800e77658e321bfc8e78e238cddc5e573cae464 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Sun, 6 Jan 2019 05:51:55 -0500 Subject: [PATCH 4/6] bed_to_fasta.R: improved syntax; fixed typos --- bin/2.2_motif_estimation/bed_to_fasta.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bin/2.2_motif_estimation/bed_to_fasta.R b/bin/2.2_motif_estimation/bed_to_fasta.R index 8006e64..7e716b7 100644 --- a/bin/2.2_motif_estimation/bed_to_fasta.R +++ b/bin/2.2_motif_estimation/bed_to_fasta.R @@ -2,18 +2,19 @@ if (!require(optparse)) install.packages("optparse"); library(optparse) option_list <- list( - make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"), + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster id.", metavar = "character"), make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"), make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer") ) opt_parser <- OptionParser(option_list = option_list, - description = "Convert BED-file to one FASTA-file per cluster") + description = "Convert BED-file to one FASTA-file per cluster.", + epilogue = "Author: Rene Wiegandt ") opt <- parse_args(opt_parser) #' Splitting BED-files depending on their cluster. -#' The Sequences of each cluster are written as an FASTA-file. +#' The Sequences of each cluster are written as a FASTA-file. #' @param bedInput BED-file with sequences and cluster-id as last two columns: #' Sequence: second last column; Cluster ID: last column #' @param prefix prefix for filenames @@ -30,13 +31,13 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ bed <- data.table::fread(bedInput, sep = "\t") # Get last column of data.table, which refers to the cluster, as a vector. - cluster_no <- as.vector(bed[[ncol(bed)]]) + cluster_no <- bed[[ncol(bed)]] # Split data.table bed on its last column (cluster_no) into list of data.frames clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) # For each data.frame(cluster) in list clusters: - discard <- lapply(1:length(clusters), function(i){ + discard <- lapply(seq_len(length(clusters)), function(i){ clust <- as.data.frame(clusters[i]) # Filter data.tables(clusters), which are to small if (nrow(clust) >= as.numeric(min_seq) ) { From 7ff588be8322561e249de1982cf25db2e800b8d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Sun, 6 Jan 2019 06:04:06 -0500 Subject: [PATCH 5/6] label_cluster.R: improved syntax and documentation; fixed typos --- bin/2.2_motif_estimation/label_cluster.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/2.2_motif_estimation/label_cluster.R b/bin/2.2_motif_estimation/label_cluster.R index b00a97a..60f1ec8 100644 --- a/bin/2.2_motif_estimation/label_cluster.R +++ b/bin/2.2_motif_estimation/label_cluster.R @@ -7,7 +7,8 @@ option_list <- list( ) opt_parser <- OptionParser(option_list = option_list, - description = "Adding Cluster ID to Query_ID Column") + description = "Adding Cluster ID to Query_ID Column.", + epilogue = "Author: Rene Wiegandt ") opt <- parse_args(opt_parser) @@ -27,7 +28,7 @@ label_cluster <- function(input, output){ cluster_number <- as.numeric(splitted_name[length(splitted_name) - 1]) + 1 #Adding cluster ID to first column - tsv$Query_ID <- paste(tsv$Query_ID, ".", cluster_number,sep = "") + tsv$Query_ID <- paste0(tsv$Query_ID, ".", cluster_number) data.table::fwrite(tsv, file = output, sep = "\t", col.names = FALSE) @@ -36,4 +37,4 @@ label_cluster <- function(input, output){ # run function label_cluster with given parameteres if not in interactive context (e.g. run from shell) if (!interactive()) { label_cluster(opt$input, opt$output) -} \ No newline at end of file +} From 798dd77a5039d55cd5ce8afbca9f2068f7032a8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Sun, 6 Jan 2019 06:04:41 -0500 Subject: [PATCH 6/6] merge_similar_clusters.R: improved syntax and documentation; fixed typos --- .../merge_similar_clusters.R | 156 +++++++++--------- 1 file changed, 79 insertions(+), 77 deletions(-) diff --git a/bin/2.2_motif_estimation/merge_similar_clusters.R b/bin/2.2_motif_estimation/merge_similar_clusters.R index 0104d05..d9b2373 100644 --- a/bin/2.2_motif_estimation/merge_similar_clusters.R +++ b/bin/2.2_motif_estimation/merge_similar_clusters.R @@ -1,77 +1,79 @@ -#!/usr/bin/env Rscript -if (!require(optparse)) install.packages("optparse"); library(optparse) - -option_list <- list( - make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from merged tomtom results", metavar = "character"), - make_option(opt_str = c("-l", "--list"), default = NULL, help = "Numerically sorted whitespace separated list of absolute fasta-file paths", metavar = "character"), - make_option(opt_str = c("-w", "--min"), default = NULL, help = "Minimum weight of edge allowed in graph clusters.", metavar = "character") -) - -opt_parser <- OptionParser(option_list = option_list, - description = "Adding Cluster ID to Query_ID Column") - -opt <- parse_args(opt_parser) - -#' Merging FASTA-files, which motifs are similar. -#' -#' @parameter tsv_in Path to TSV file generated by Tomtom. -#' The input for Tomtom is a from all clusters merged meme-file. -#' @parameter file_list Numerically sorted whitespace separated list of absolute fasta-file paths -#' @parameter min_weight Minimum weight of edge allowed in graph clusters. -#' -#' @author René Wiegandt -#' @contact rene.wiegandt(at)mpi-bn.mpg.de -merge_similar <- function(tsv_in, file_list, min_weight){ - - files <- unlist(as.list(strsplit(file_list, ","))) - - # split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster. - tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character') - query_cluster <- unlist(lapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){ - tail(l,n=1) - })) - target_cluster <- unlist(lapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){ - tail(l,n=1) - })) - - # create data.table with only the cluster-numbers - sim_not_unique <- data.table::data.table(query_cluster,target_cluster) - # convert from character to numeric values - sim_not_unique[, query_cluster := as.numeric(query_cluster)] - sim_not_unique[, target_cluster := as.numeric(target_cluster)] - - # remove rows if column 1 is idential to column 2 - edgelist <- sim_not_unique[query_cluster != target_cluster] - - # create graph from data.frame - g <- igraph::graph_from_edgelist(as.matrix(edgelist)) - # converting graph to adjacency matrix - adj_matrix <- igraph::get.adjacency(g, names = T) - # generating weighted graph from adjacency matrix - g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T) - - # get subgraphs from graph with edges of weight > min_weight - s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight>min_weight], del=F) - png('motif_clusters.png') - plot(s1) - dev.off() - clust <- igraph::clusters(s1) - if (clust$no < 1){ - b <- lapply(files, function(f){ - system(paste("cat",f,">",basename(f))) - }) - } - # merge FASTA-files depending on the clustered graphs - a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){ - cl <- as.vector(which(clust$membership %in% c(i))) - fasta_list <- paste(files[cl], collapse = " ") - name <- paste0("Cluster_",i,".fasta") - system(paste("cat",fasta_list,">",name)) - }) -} - - -# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell) -if (!interactive()) { - merge_similar(opt$input, opt$list, opt$min) -} +#!/usr/bin/env Rscript +if (!require(optparse)) install.packages("optparse"); library(optparse) + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from merged tomtom results", metavar = "character"), + make_option(opt_str = c("-l", "--list"), default = NULL, help = "Numerically sorted whitespace separated list of absolute fasta-file paths", metavar = "character"), + make_option(opt_str = c("-w", "--min"), default = NULL, help = "Minimum weight of edge allowed in graph clusters.", metavar = "character") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Adding Cluster ID to Query_ID Column", + epilogue = "Author: Rene Wiegandt ") + +opt <- parse_args(opt_parser) + +#' Merging FASTA-files, which motifs are similar. +#' +#' @parameter tsv_in Path to TSV file generated by Tomtom. +#' The input for Tomtom is a from all clusters merged meme-file. +#' @parameter file_list Numerically sorted comma separated list of absolute fasta-file paths +#' @parameter min_weight Minimum weight of edge allowed in graph clusters. +#' +#' @author René Wiegandt +#' @contact rene.wiegandt(at)mpi-bn.mpg.de +merge_similar <- function(tsv_in, file_list, min_weight){ + + files <- unlist(strsplit(file_list, ",")) + + # split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster. + tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character') + query_cluster <- vapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){ + tail(l, n = 1) + },"") + target_cluster <- vapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){ + tail(l, n = 1) + },"") + + # create data.table with only the cluster-numbers + sim_not_unique <- data.table::data.table(as.numeric(query_cluster),as.numeric(target_cluster)) + + # remove rows if column 1 is identical to column 2 + edgelist <- sim_not_unique[query_cluster != target_cluster] + + # create graph from data.frame + g <- igraph::graph_from_edgelist(as.matrix(edgelist)) + # converting graph to adjacency matrix + adj_matrix <- igraph::get.adjacency(g, names = T) + # generating weighted graph from adjacency matrix + g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T) + + # get subgraphs from graph with edges of weight > min_weight + s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight > min_weight], del = F) + png('motif_clusters.png') + plot(s1) + dev.off() + clust <- igraph::clusters(s1) + if (clust$no < 1) { + b <- lapply(files, function(f){ + system(paste("cat",f,">",basename(f))) + }) + } + # merge FASTA-files depending on the clustered graphs + a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){ + #get vector with cluster ids, which are clustered together in "motif cluster" i + cl <- as.vector(which(clust$membership %in% c(i))) + #create string, which represents a list, containing all FASTA-files to be merged + fasta_list <- paste(files[cl], collapse = " ") + #create name for merged FASTA-file + name <- paste0("Cluster_",i,".fasta") + #merge fasta files + system(paste("cat",fasta_list,">",name)) + }) +} + + +# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell) +if (!interactive()) { + merge_similar(opt$input, opt$list, opt$min) +}