Skip to content

Commit

Permalink
Merge pull request #92 from loosolab/estimation_motifs
Browse files Browse the repository at this point in the history
Estimation motifs
  • Loading branch information
renewiegandt authored Mar 26, 2019
2 parents 987bf7f + 07d492c commit d6b44ea
Show file tree
Hide file tree
Showing 13 changed files with 483 additions and 238 deletions.
11 changes: 7 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,14 @@ Required arguments:
--genome_fasta Path to genome in FASTA-format
--motif_db Path to motif-database in MEME-format
--config Path to UROPA configuration file
--organism Input organism [hg38 | hg19 | mm9 | mm10]
--organism Input organism [hg38 | hg19 | mm9 | mm10]
--out Output Directory (Default: './out/')
Optional arguments:
--help [0|1] 1 to show this help message. (Default: 0)
--gtf_path Path to gtf-file. If path is set the process which creates a gtf-file is skipped.
--tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will be skipped.
--tfbs_path Path to directory with tfbsscan output. If given tfbsscan will be skipped.
Footprint extraction:
--window_length INT This parameter sets the length of a sliding window. (Default: 200)
Expand All @@ -78,7 +78,7 @@ Optional arguments:
Sequence preparation/ reduction:
--kmer INT K-mer length (Default: 10)
--aprox_motif_len INT Motif length (Default: 10)
--motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
--motif_occurrence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
--min_seq_length Interations Remove all sequences below this value. (Default: 10)
Clustering:
--global INT Global (=1) or local (=0) alignment. (Default: 0)
Expand All @@ -96,14 +96,17 @@ Optional arguments:
--tomtom_treshold FLOAT Threshold for similarity score. (Default: 0.01)
--best_motif INT Get the best X motifs per cluster. (Default: 3)
--gap_penalty INT Set penalty for gaps in GLAM2 (Default: 1000)
--seed Set seed for GLAM2 (Default: 123456789)
Moitf clustering:
--cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Default: 0)
--cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0)
--edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5)
--motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001)
Creating GTF:
--tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON
config
Evaluation:
--max_uropa_runs INT Maximum number UROPA runs running parallelized (Default: 10)
All arguments can be set in the configuration files
```

Expand Down
18 changes: 9 additions & 9 deletions bin/1.2_filter_motifs/compareBed.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@

# desciption: This scripts uses bedtools to find new, non overlapping regions
# between input data in BED-format and a group of known motifs in
# BED-format.
# BED-format.

# For parameter description, run the script without parameters or -h.
# The output is a file with the filtered footprints and the log file compareBed.stats
# The output is a file with the filtered footprints and the log file compareBed.log

# One R scripts is used, compareBed_runinfo.R, stored in the same directory.
# Parameter values (file/directory paths) must not contain the | symbol, because it is chosen as "sed"-field separator.
Expand Down Expand Up @@ -124,7 +124,7 @@ then
echo " -o --output output path/file ; default dir current directory and filename is newMotifs.bed and newMotifs.bed.fasta"
echo " -h --help shows this help message"
echo ""
echo " Note: file paths must not contain the '|' pipe symbol"
echo " Note: file paths must not contain the '|' pipe symbol"
exit 0
fi

Expand Down Expand Up @@ -188,7 +188,7 @@ cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed
# motiffiles either from a directory OR comma separated list
if [ -d "$motifs" ]
then
# creates an array of all files with bed in its name in the directory $motifs
# creates an array of all files with bed in its name in the directory $motifs
declare -a motiffiles=(`ls $motifs | grep -i '.bed$' | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`)

# the else case means, that the motiffiles were passed comma separated with no whitespace.
Expand Down Expand Up @@ -264,9 +264,9 @@ fi

# remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition
# also creates a small output file with information about the comparison
Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.stats
Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.log
# check if Rscript executed without errors
if [ $? -gt 0 ]
if [ $? -gt 0 ]
then
exit 1
fi
Expand All @@ -279,15 +279,15 @@ then
# add initial number of footprints to the log file
fp_initial=`cat $data | wc -l`
fp_initial=`expr $fp_initial - 1`
echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats
echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.log
else
# output will be overwritten if it exists
rm -f $output
# add initial number of footprints to the log file
cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats
cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.log
fi
# add number of footprints after filtering to the log file
cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats
cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.log

# add fasta sequences to bed and create fasta file
out_fasta=`echo $output | sed "s|.bed$|.fasta|g"`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ create_seq_json <- function(input, output, num, tmp_path, cluster_id) {

# Create json file
## naming
names(datalist) <- paste0(c("Motif_", "Motif_", "Motif_"),seq(1,as.numeric(num),1) , " Cluster_", cluster_id)
names(datalist) <- paste0("Motif_", seq(1,as.numeric(num),1), " Cluster_", cluster_id)
## creating json object
json <- RJSONIO::toJSON(datalist, pretty = T , .withNames = T)
## writing file
Expand Down
29 changes: 16 additions & 13 deletions bin/2.2_motif_estimation/merge_similar_clusters.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env Rscript
if (!require(optparse)) install.packages("optparse"); library(optparse)
if (!require(igraph)) install.packages("igraph"); library(igraph)

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from merged tomtom results", metavar = "character"),
Expand Down Expand Up @@ -40,37 +41,39 @@ merge_similar <- function(tsv_in, file_list, min_weight){

# remove rows if column 1 is identical to column 2
edgelist <- sim_not_unique[query_cluster != target_cluster]

unique_edgelist <- unique(edgelist)

# create graph from data.frame
g <- igraph::graph_from_edgelist(as.matrix(edgelist))
# converting graph to adjacency matrix
adj_matrix <- igraph::get.adjacency(g, names = T)
# generating weighted graph from adjacency matrix
g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)
g <- graph_from_edgelist(as.matrix(unique(edgelist)), directed = F)
g_tidy <- delete.vertices(simplify(g), degree(g) == 0)
g_tidy <- set_vertex_attr(g_tidy,"name",V(g_tidy), value = (sort(unique(c(unique_edgelist$V1,unique_edgelist$V2)))) - 1)

# get subgraphs from graph with edges of weight > min_weight
s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight > min_weight], del = F)
# plotting
png('motif_clusters.png')
plot(s1)
plot(g_tidy)
dev.off()
clust <- igraph::clusters(s1)
clust <- clusters(g)
if (clust$no < 1) {
b <- lapply(files, function(f){
system(paste("cat",f,">",basename(f)))
})
}
clust_out <- file("cluster.txt")
# merge FASTA-files depending on the clustered graphs
a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
a <- vapply(seq(from = 1, to = clust$no, by = 1), function(i){
#get vector with cluster ids, which are clustered together in "motif cluster" i
cl <- as.vector(which(clust$membership %in% c(i)))
#create string, which represents a list, containing all FASTA-files to be merged
fasta_list <- paste(files[cl], collapse = " ")
#create name for merged FASTA-file
name <- paste0("Cluster_",i,".fasta")
name <- paste0("Cluster_",i - 1 ,".fasta")
#merge fasta files
system(paste("cat",fasta_list,">",name))
})
paste("Cluster",i - 1,"is generated by merging cluster:",paste(cl - 1,collapse = ","))
}, "")
writeLines(a[a != ""], con = clust_out)
}



# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell)
Expand Down
94 changes: 94 additions & 0 deletions bin/2.2_motif_estimation/png_to_pdf.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/env Rscript
if (!require(optparse)) install.packages("optparse"); library(optparse)
if (!require(png)) install.packages("png"); library(png)
if (!require(grid)) install.packages("grid"); library(grid)
if (!require(gridExtra)) install.packages("gridExtra"); library(gridExtra)


option_list <- list(
make_option(opt_str = c("-i", "--png_new"), default = NULL, help = "List of png paths separated by \',\'", metavar = "character"),
make_option(opt_str = c("-p", "--png_old"), default = NULL, help = "List of png paths separated by \',\'", metavar = "character"),
make_option(opt_str = c("-f", "--index"), default = NULL, help = "Path to index file", metavar = "character")
)

opt_parser <- OptionParser(option_list = option_list,
description = "Generating a PDF from multiple png files",
epilogue = "Author: Rene Wiegandt <Rene.Wiegandt@mpi-bn.mpg.de>")

opt <- parse_args(opt_parser)



#' Generating a PDF from multiple png files.
#'
#' @parameter png_new <string> List of png paths separated by ,
#' @parameter png_old <string> List of png paths separated by ,
#' @parameter cluster_ids <string> List of IDs separated by ,
#' @parameter new_id <int> New Cluster ID
#'
#' @author Rene Wiegandt
#' @contact rene.wiegandt(at)mpi-bn.mpg.de
png_to_pdf <- function(png_top, png_list, cluster_ids, new_id, out = "cluster.pdf"){
#png_top <- "G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/04_glam2/Cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/04_glam2/Cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/04_glam2/Cluster_0/logo3.png"
#png_list <- "G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png"
#cluster_ids <- c(1,2,3,4,5,6,7,8,9,10)
top_split <- unlist(strsplit(png_top, ","))
png_split <- unlist(strsplit(png_list, ","))
cluster_list <- unlist(strsplit(cluster_ids, ","))
top_grob <- lapply(top_split,function(p){rasterGrob(readPNG(p),interpolate = FALSE)})
grobs <- lapply(png_split,function(p){rasterGrob(readPNG(p),interpolate = FALSE)})

split_grobs <- split(grobs, rep(1:ceiling(length(grobs)/3), each = 3)[1:length(grobs)])

rows <- lapply(seq(1,length(split_grobs)), function(sg){
arrangeGrob(grobs = split_grobs[[sg]], top = textGrob(paste0("Cluster ",cluster_list[sg]),gp = gpar(fontsize = 20,font = 3)), ncol = 3)
})
split_rows <- split(rows, rep(1:ceiling(length(rows)/4), each = 4)[1:length(rows)])

pdf(out,width = 17, height = 11)
grid.arrange(grobs = top_grob, nrow = 3, top = textGrob(paste0("New Cluster ",new_id),gp = gpar(fontsize = 30,font = 3)))
lapply(split_rows, function(r){
grid.arrange(grobs = r, nrow = 4, top = textGrob("Generated from...",gp = gpar(fontsize = 30,font = 3)))
})

dev.off()

}
#### index_file <- "G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/03_merged_fasta/cluster.txt"


#' Parsing index file.
#'
#' @parameter index_file Path to index file.
#'
#' @author Rene Wiegandt
#' @contact rene.wiegandt(at)mpi-bn.mpg.de
get_index <- function(index_file){
f <- data.table::fread(index_file, header = F)
index <- f[,c(2,8)]
index$V8 <- strsplit(index$V8,",")
return(index[unlist(lapply(index$V8, function(v){ifelse(length(v) > 1, TRUE, FALSE)}))])

}


png_to_pdf_set_up <- function(png_top, png_list, index_file){

index <- get_index(index_file)
t <- lapply(seq(nrow(index)), function(i){
new_id <- index[i,1]
cluster_ids <- index[i,2]
out <- paste0("Summary_cluster_", new_id, ".pdf")
png_to_pdf(png_top = ,png_list = ,cluster_ids = cluster_ids, new_id = new_id, out = out )
})

}


# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell)
if (!interactive()) {
png_to_pdf_set_up(opt$png_new, opt$png_old , opt$index)
}



Loading

0 comments on commit d6b44ea

Please sign in to comment.