Merge pull request #92 from loosolab/estimation_motifs

Estimation motifs
loosolab · Mar 26, 2019 · d6b44ea · d6b44ea
2 parents 987bf7f + 07d492c
commit d6b44ea
Show file tree

Hide file tree

Showing 13 changed files with 483 additions and 238 deletions.
diff --git a/README.md b/README.md
@@ -54,14 +54,14 @@ Required arguments:
 	--genome_fasta		 Path to genome in FASTA-format
 	--motif_db		 Path to motif-database in MEME-format
 	--config		 Path to UROPA configuration file
- 	--organism 		 Input organism [hg38 | hg19 | mm9 | mm10]
+	--organism 		 Input organism [hg38 | hg19 | mm9 | mm10]
 	--out			 Output Directory (Default: './out/')
 
 Optional arguments:
 
 	--help [0|1]		1 to show this help message. (Default: 0)
 	--gtf_path		Path to gtf-file. If path is set the process which creates a gtf-file is skipped.
-	--tfbs_path 		Path to directory with output from tfbsscan. If given tfbsscan will be skipped.
+	--tfbs_path 		Path to directory with tfbsscan output. If given tfbsscan will be skipped.
 
 	Footprint extraction:
 	--window_length INT	This parameter sets the length of a sliding window. (Default: 200)
@@ -78,7 +78,7 @@ Optional arguments:
 	Sequence preparation/ reduction:
 	--kmer INT		K-mer length (Default: 10)
 	--aprox_motif_len INT	Motif length (Default: 10)
-	--motif_occurence FLOAT	Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
+	--motif_occurrence FLOAT	Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
 	--min_seq_length Interations	Remove all sequences below this value. (Default: 10)
 	Clustering:
 	--global INT		Global (=1) or local (=0) alignment. (Default: 0)
@@ -96,14 +96,17 @@ Optional arguments:
 	--tomtom_treshold FLOAT	Threshold for similarity score. (Default: 0.01)
 	--best_motif INT	Get the best X motifs per cluster. (Default: 3)
 	--gap_penalty INT	Set penalty for gaps in GLAM2 (Default: 1000)
+	--seed Set seed for GLAM2 (Default: 123456789)
 	Moitf clustering:
-	--cluster_motif	Boolean	If 1 pipeline clusters motifs. If its 0 it does not. (Default: 0)
+	--cluster_motif	Boolean	If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0)
 	--edge_weight INT	Minimum weight of edges in motif-cluster-graph (Default: 5)
 	--motif_similarity_thresh FLOAT	Threshold for motif similarity score (Default: 0.00001)
 
 	Creating GTF:
 	--tissues List/String 	List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON
 				config
+	Evaluation:
+	--max_uropa_runs INT	 Maximum number UROPA runs running parallelized (Default: 10)
 All arguments can be set in the configuration files
  ```
 

diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh
@@ -5,10 +5,10 @@
 
 # desciption: This scripts uses bedtools to find new, non overlapping regions
 #             between input data in BED-format and a group of known motifs in
-#             BED-format. 
+#             BED-format.
 
 #             For parameter description, run the script without parameters or -h.
-#             The output is a file with the filtered footprints and the log file compareBed.stats
+#             The output is a file with the filtered footprints and the log file compareBed.log
 
 # One R scripts is used, compareBed_runinfo.R, stored in the same directory.
 # Parameter values (file/directory paths) must not contain the | symbol, because it is chosen as "sed"-field separator.
@@ -124,7 +124,7 @@ then
 	echo "              -o   --output       output path/file ; default dir current directory and filename is newMotifs.bed and newMotifs.bed.fasta"
 	echo "              -h --help           shows this help message"
 	echo ""
-	echo " Note: file paths must not contain the '|' pipe symbol" 
+	echo " Note: file paths must not contain the '|' pipe symbol"
 	exit 0
 fi
 
@@ -188,7 +188,7 @@ cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed
 # motiffiles either from a directory OR comma separated list
 if [ -d "$motifs" ]
 then
-	# creates an array of all files with bed in its name in the directory $motifs  
+	# creates an array of all files with bed in its name in the directory $motifs
 	declare -a motiffiles=(`ls $motifs | grep -i '.bed$' | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`)
 
 # the else case means, that the motiffiles were passed comma separated with no whitespace.
@@ -264,9 +264,9 @@ fi
 
 # remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition
 # also creates a small output file with information about the comparison
-Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.stats
+Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.log
 # check if Rscript executed without errors
-if [ $? -gt 0 ] 
+if [ $? -gt 0 ]
 then
 	exit 1
 fi
@@ -279,15 +279,15 @@ then
 	# add initial number of footprints to the log file
 	fp_initial=`cat $data | wc -l`
 	fp_initial=`expr $fp_initial - 1`
-	echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats
+	echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.log
 else
 	# output will be overwritten if it exists
 	rm -f $output
 	# add initial number of footprints to the log file
-	cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats
+	cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.log
 fi
 # add number of footprints after filtering to the log file
-cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats
+cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.log
 
 # add fasta sequences to bed and create fasta file
 out_fasta=`echo $output | sed "s|.bed$|.fasta|g"`

diff --git a/bin/2.2_motif_estimation/get_motif_seq.R → bin/2.2_motif_estimation/get_motif_seq_id.R b/bin/2.2_motif_estimation/get_motif_seq.R → bin/2.2_motif_estimation/get_motif_seq_id.R
@@ -68,7 +68,7 @@ create_seq_json <- function(input, output, num, tmp_path, cluster_id) {
 
   # Create json file
   ## naming
-  names(datalist) <- paste0(c("Motif_", "Motif_", "Motif_"),seq(1,as.numeric(num),1) , " Cluster_", cluster_id)
+  names(datalist) <- paste0("Motif_", seq(1,as.numeric(num),1), " Cluster_", cluster_id)
   ## creating json object
   json <- RJSONIO::toJSON(datalist, pretty = T , .withNames = T)
   ## writing file

diff --git a/bin/2.2_motif_estimation/merge_similar_clusters.R b/bin/2.2_motif_estimation/merge_similar_clusters.R
@@ -1,5 +1,6 @@
 #!/usr/bin/env Rscript
 if (!require(optparse)) install.packages("optparse"); library(optparse)
+if (!require(igraph)) install.packages("igraph"); library(igraph)
 
 option_list <- list(
   make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from merged tomtom results", metavar = "character"),
@@ -40,37 +41,39 @@ merge_similar <- function(tsv_in, file_list, min_weight){
 
   # remove rows if column 1 is identical to column 2
   edgelist <- sim_not_unique[query_cluster != target_cluster]
-
+  unique_edgelist <- unique(edgelist)
+
   # create graph from data.frame
-  g <- igraph::graph_from_edgelist(as.matrix(edgelist))
-  # converting graph to adjacency matrix
-  adj_matrix <- igraph::get.adjacency(g, names = T)
-  # generating weighted graph from adjacency matrix
-  g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)
+  g <- graph_from_edgelist(as.matrix(unique(edgelist)), directed = F)
+  g_tidy <- delete.vertices(simplify(g), degree(g) == 0)
+  g_tidy <- set_vertex_attr(g_tidy,"name",V(g_tidy), value = (sort(unique(c(unique_edgelist$V1,unique_edgelist$V2)))) - 1)
 
-  # get subgraphs from graph with edges of weight > min_weight
-  s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight > min_weight], del = F)
+  # plotting
   png('motif_clusters.png')
-  plot(s1)
+  plot(g_tidy)
   dev.off()
-  clust <- igraph::clusters(s1)
+  clust <- clusters(g)
   if (clust$no < 1) {
     b <- lapply(files, function(f){
       system(paste("cat",f,">",basename(f)))
     })
   }
+  clust_out <- file("cluster.txt")
   # merge FASTA-files depending on the clustered graphs
-  a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
+  a <- vapply(seq(from = 1, to = clust$no, by = 1), function(i){
     #get vector with cluster ids, which are clustered together in "motif cluster" i
     cl <- as.vector(which(clust$membership %in% c(i)))
     #create string, which represents a list, containing all FASTA-files to be merged
     fasta_list <- paste(files[cl], collapse = " ")
     #create name for merged FASTA-file
-    name <- paste0("Cluster_",i,".fasta")
+    name <- paste0("Cluster_",i - 1 ,".fasta")
     #merge fasta files
     system(paste("cat",fasta_list,">",name))
-  })
+    paste("Cluster",i - 1,"is generated by merging cluster:",paste(cl - 1,collapse = ","))
+  }, "")
+  writeLines(a[a != ""], con = clust_out)
 }
+
 
 
 # run function merge_similar with given parameteres if not in interactive context (e.g. run from shell)

diff --git a/bin/2.2_motif_estimation/png_to_pdf.R b/bin/2.2_motif_estimation/png_to_pdf.R
@@ -0,0 +1,94 @@
+#!/usr/bin/env Rscript
+if (!require(optparse)) install.packages("optparse"); library(optparse)
+if (!require(png)) install.packages("png"); library(png)
+if (!require(grid)) install.packages("grid"); library(grid)
+if (!require(gridExtra)) install.packages("gridExtra"); library(gridExtra)
+
+
+option_list <- list(
+  make_option(opt_str = c("-i", "--png_new"), default = NULL, help = "List of png paths separated by \',\'", metavar = "character"),
+  make_option(opt_str = c("-p", "--png_old"), default = NULL, help = "List of png paths separated by \',\'", metavar = "character"),
+  make_option(opt_str = c("-f", "--index"), default = NULL, help = "Path to index file", metavar = "character")
+)
+
+opt_parser <- OptionParser(option_list = option_list,
+                           description = "Generating a PDF from multiple png files",
+                           epilogue = "Author: Rene Wiegandt <Rene.Wiegandt@mpi-bn.mpg.de>")
+
+opt <- parse_args(opt_parser)
+
+
+
+#' Generating a PDF from multiple png files.
+#'
+#' @parameter png_new <string> List of png paths separated by ,
+#' @parameter png_old <string> List of png paths separated by ,
+#' @parameter cluster_ids <string> List of IDs separated by ,
+#' @parameter new_id <int> New Cluster ID
+#'
+#' @author Rene Wiegandt
+#' @contact rene.wiegandt(at)mpi-bn.mpg.de
+png_to_pdf <- function(png_top, png_list, cluster_ids, new_id, out = "cluster.pdf"){
+  #png_top <- "G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/04_glam2/Cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/04_glam2/Cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/04_glam2/Cluster_0/logo3.png"
+  #png_list <- "G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png"
+  #cluster_ids <- c(1,2,3,4,5,6,7,8,9,10)
+  top_split <- unlist(strsplit(png_top, ","))
+  png_split <- unlist(strsplit(png_list, ","))
+  cluster_list <- unlist(strsplit(cluster_ids, ","))
+  top_grob <- lapply(top_split,function(p){rasterGrob(readPNG(p),interpolate = FALSE)})
+  grobs <- lapply(png_split,function(p){rasterGrob(readPNG(p),interpolate = FALSE)})
+
+  split_grobs <- split(grobs, rep(1:ceiling(length(grobs)/3), each = 3)[1:length(grobs)])
+
+  rows <- lapply(seq(1,length(split_grobs)), function(sg){
+    arrangeGrob(grobs = split_grobs[[sg]], top = textGrob(paste0("Cluster ",cluster_list[sg]),gp = gpar(fontsize = 20,font = 3)), ncol = 3)
+  })
+  split_rows <- split(rows, rep(1:ceiling(length(rows)/4), each = 4)[1:length(rows)])
+
+  pdf(out,width = 17, height = 11)
+  grid.arrange(grobs = top_grob, nrow = 3, top = textGrob(paste0("New Cluster ",new_id),gp = gpar(fontsize = 30,font = 3)))
+  lapply(split_rows, function(r){
+    grid.arrange(grobs = r, nrow = 4, top = textGrob("Generated from...",gp = gpar(fontsize = 30,font = 3))) 
+  })
+
+  dev.off()
+
+}
+#### index_file <- "G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/03_merged_fasta/cluster.txt"
+
+
+#' Parsing index file.
+#'
+#' @parameter index_file Path to index file.
+#'
+#' @author Rene Wiegandt
+#' @contact rene.wiegandt(at)mpi-bn.mpg.de
+get_index <- function(index_file){
+  f <- data.table::fread(index_file, header = F)
+  index <- f[,c(2,8)]
+  index$V8 <- strsplit(index$V8,",")
+  return(index[unlist(lapply(index$V8, function(v){ifelse(length(v) > 1, TRUE, FALSE)}))])
+
+}
+
+
+png_to_pdf_set_up <- function(png_top, png_list, index_file){
+
+  index <- get_index(index_file)
+  t <- lapply(seq(nrow(index)), function(i){
+    new_id <- index[i,1]
+    cluster_ids <- index[i,2]
+    out <- paste0("Summary_cluster_", new_id, ".pdf")
+    png_to_pdf(png_top = ,png_list = ,cluster_ids = cluster_ids, new_id = new_id, out = out )
+  })
+
+}
+
+
+# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell)
+if (!interactive()) {
+  png_to_pdf_set_up(opt$png_new, opt$png_old , opt$index)
+}
+
+
+