Skip to content

Estimation motifs #92

Merged
merged 25 commits into from
Mar 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
170ca1e
get_motif_seq.R: Removed redundant code
renewiegandt Jan 28, 2019
565d4b8
Rename get_motif_seq.R
renewiegandt Jan 28, 2019
e470bed
pipeline.nf: changed output dir of logs
renewiegandt Jan 28, 2019
b931ea7
compareBed.sh: rename stats to log
renewiegandt Jan 28, 2019
35d6f0c
Added venn.R to 3.2_evaluation
renewiegandt Feb 4, 2019
98654e1
Update uropa.config
renewiegandt Feb 4, 2019
2186cc6
Added uropa to the enviroment yaml
renewiegandt Feb 4, 2019
3793c70
Added uropa and venn process; mergeing gtf files; added parameter for…
renewiegandt Feb 4, 2019
1a95d4c
Added yaml-file for meme-suite enviroment
renewiegandt Feb 11, 2019
011b99a
Added meme_env parameter
renewiegandt Feb 11, 2019
10a3df3
Fixed channel for venn; added conda meme-env
renewiegandt Feb 11, 2019
4c08eeb
added parameter gap penalty to 2.2config
renewiegandt Feb 11, 2019
9bf3ed9
merge_similar_clusters.R: bugfix
renewiegandt Feb 11, 2019
50ec488
fixed label_cluster; minor changes
renewiegandt Feb 11, 2019
e11e0af
merge_similar_clusters: bugfix; improved cluster plot
renewiegandt Mar 4, 2019
a46476f
pipeline: added uropa summary to output
renewiegandt Mar 4, 2019
51dcfc0
merge_similar_clusters: fixed filenames
renewiegandt Mar 4, 2019
f1eb5f1
pipeline: Added seed for glam2
renewiegandt Mar 4, 2019
c034878
Added skript png_to_pdf
renewiegandt Mar 20, 2019
fae46c6
Better output directory names
renewiegandt Mar 20, 2019
cdc73a2
Minor formating changes of output file
renewiegandt Mar 20, 2019
89ae057
Update parameters in README
renewiegandt Mar 26, 2019
f8c572b
Added new parameters to config files
renewiegandt Mar 26, 2019
5d9fb6c
Minor changes
renewiegandt Mar 26, 2019
07d492c
Minor changes
renewiegandt Mar 26, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 7 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,14 @@ Required arguments:
--genome_fasta Path to genome in FASTA-format
--motif_db Path to motif-database in MEME-format
--config Path to UROPA configuration file
--organism Input organism [hg38 | hg19 | mm9 | mm10]
--organism Input organism [hg38 | hg19 | mm9 | mm10]
--out Output Directory (Default: './out/')

Optional arguments:

--help [0|1] 1 to show this help message. (Default: 0)
--gtf_path Path to gtf-file. If path is set the process which creates a gtf-file is skipped.
--tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will be skipped.
--tfbs_path Path to directory with tfbsscan output. If given tfbsscan will be skipped.

Footprint extraction:
--window_length INT This parameter sets the length of a sliding window. (Default: 200)
Expand All @@ -78,7 +78,7 @@ Optional arguments:
Sequence preparation/ reduction:
--kmer INT K-mer length (Default: 10)
--aprox_motif_len INT Motif length (Default: 10)
--motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
--motif_occurrence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
--min_seq_length Interations Remove all sequences below this value. (Default: 10)
Clustering:
--global INT Global (=1) or local (=0) alignment. (Default: 0)
Expand All @@ -96,14 +96,17 @@ Optional arguments:
--tomtom_treshold FLOAT Threshold for similarity score. (Default: 0.01)
--best_motif INT Get the best X motifs per cluster. (Default: 3)
--gap_penalty INT Set penalty for gaps in GLAM2 (Default: 1000)
--seed Set seed for GLAM2 (Default: 123456789)
Moitf clustering:
--cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Default: 0)
--cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0)
--edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5)
--motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001)

Creating GTF:
--tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON
config
Evaluation:
--max_uropa_runs INT Maximum number UROPA runs running parallelized (Default: 10)
All arguments can be set in the configuration files
```

Expand Down
18 changes: 9 additions & 9 deletions bin/1.2_filter_motifs/compareBed.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@

# desciption: This scripts uses bedtools to find new, non overlapping regions
# between input data in BED-format and a group of known motifs in
# BED-format.
# BED-format.

# For parameter description, run the script without parameters or -h.
# The output is a file with the filtered footprints and the log file compareBed.stats
# The output is a file with the filtered footprints and the log file compareBed.log

# One R scripts is used, compareBed_runinfo.R, stored in the same directory.
# Parameter values (file/directory paths) must not contain the | symbol, because it is chosen as "sed"-field separator.
Expand Down Expand Up @@ -124,7 +124,7 @@ then
echo " -o --output output path/file ; default dir current directory and filename is newMotifs.bed and newMotifs.bed.fasta"
echo " -h --help shows this help message"
echo ""
echo " Note: file paths must not contain the '|' pipe symbol"
echo " Note: file paths must not contain the '|' pipe symbol"
exit 0
fi

Expand Down Expand Up @@ -188,7 +188,7 @@ cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed
# motiffiles either from a directory OR comma separated list
if [ -d "$motifs" ]
then
# creates an array of all files with bed in its name in the directory $motifs
# creates an array of all files with bed in its name in the directory $motifs
declare -a motiffiles=(`ls $motifs | grep -i '.bed$' | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`)

# the else case means, that the motiffiles were passed comma separated with no whitespace.
Expand Down Expand Up @@ -264,9 +264,9 @@ fi

# remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition
# also creates a small output file with information about the comparison
Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.stats
Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.log
# check if Rscript executed without errors
if [ $? -gt 0 ]
if [ $? -gt 0 ]
then
exit 1
fi
Expand All @@ -279,15 +279,15 @@ then
# add initial number of footprints to the log file
fp_initial=`cat $data | wc -l`
fp_initial=`expr $fp_initial - 1`
echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats
echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.log
else
# output will be overwritten if it exists
rm -f $output
# add initial number of footprints to the log file
cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats
cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.log
fi
# add number of footprints after filtering to the log file
cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats
cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.log

# add fasta sequences to bed and create fasta file
out_fasta=`echo $output | sed "s|.bed$|.fasta|g"`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ create_seq_json <- function(input, output, num, tmp_path, cluster_id) {

# Create json file
## naming
names(datalist) <- paste0(c("Motif_", "Motif_", "Motif_"),seq(1,as.numeric(num),1) , " Cluster_", cluster_id)
names(datalist) <- paste0("Motif_", seq(1,as.numeric(num),1), " Cluster_", cluster_id)
## creating json object
json <- RJSONIO::toJSON(datalist, pretty = T , .withNames = T)
## writing file
Expand Down
29 changes: 16 additions & 13 deletions bin/2.2_motif_estimation/merge_similar_clusters.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env Rscript
if (!require(optparse)) install.packages("optparse"); library(optparse)
if (!require(igraph)) install.packages("igraph"); library(igraph)

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from merged tomtom results", metavar = "character"),
Expand Down Expand Up @@ -40,37 +41,39 @@ merge_similar <- function(tsv_in, file_list, min_weight){

# remove rows if column 1 is identical to column 2
edgelist <- sim_not_unique[query_cluster != target_cluster]

unique_edgelist <- unique(edgelist)

# create graph from data.frame
g <- igraph::graph_from_edgelist(as.matrix(edgelist))
# converting graph to adjacency matrix
adj_matrix <- igraph::get.adjacency(g, names = T)
# generating weighted graph from adjacency matrix
g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)
g <- graph_from_edgelist(as.matrix(unique(edgelist)), directed = F)
g_tidy <- delete.vertices(simplify(g), degree(g) == 0)
g_tidy <- set_vertex_attr(g_tidy,"name",V(g_tidy), value = (sort(unique(c(unique_edgelist$V1,unique_edgelist$V2)))) - 1)

# get subgraphs from graph with edges of weight > min_weight
s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight > min_weight], del = F)
# plotting
png('motif_clusters.png')
plot(s1)
plot(g_tidy)
dev.off()
clust <- igraph::clusters(s1)
clust <- clusters(g)
if (clust$no < 1) {
b <- lapply(files, function(f){
system(paste("cat",f,">",basename(f)))
})
}
clust_out <- file("cluster.txt")
# merge FASTA-files depending on the clustered graphs
a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
a <- vapply(seq(from = 1, to = clust$no, by = 1), function(i){
#get vector with cluster ids, which are clustered together in "motif cluster" i
cl <- as.vector(which(clust$membership %in% c(i)))
#create string, which represents a list, containing all FASTA-files to be merged
fasta_list <- paste(files[cl], collapse = " ")
#create name for merged FASTA-file
name <- paste0("Cluster_",i,".fasta")
name <- paste0("Cluster_",i - 1 ,".fasta")
#merge fasta files
system(paste("cat",fasta_list,">",name))
})
paste("Cluster",i - 1,"is generated by merging cluster:",paste(cl - 1,collapse = ","))
}, "")
writeLines(a[a != ""], con = clust_out)
}



# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell)
Expand Down
94 changes: 94 additions & 0 deletions bin/2.2_motif_estimation/png_to_pdf.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/env Rscript
if (!require(optparse)) install.packages("optparse"); library(optparse)
if (!require(png)) install.packages("png"); library(png)
if (!require(grid)) install.packages("grid"); library(grid)
if (!require(gridExtra)) install.packages("gridExtra"); library(gridExtra)


option_list <- list(
make_option(opt_str = c("-i", "--png_new"), default = NULL, help = "List of png paths separated by \',\'", metavar = "character"),
make_option(opt_str = c("-p", "--png_old"), default = NULL, help = "List of png paths separated by \',\'", metavar = "character"),
make_option(opt_str = c("-f", "--index"), default = NULL, help = "Path to index file", metavar = "character")
)

opt_parser <- OptionParser(option_list = option_list,
description = "Generating a PDF from multiple png files",
epilogue = "Author: Rene Wiegandt <Rene.Wiegandt@mpi-bn.mpg.de>")

opt <- parse_args(opt_parser)



#' Generating a PDF from multiple png files.
#'
#' @parameter png_new <string> List of png paths separated by ,
#' @parameter png_old <string> List of png paths separated by ,
#' @parameter cluster_ids <string> List of IDs separated by ,
#' @parameter new_id <int> New Cluster ID
#'
#' @author Rene Wiegandt
#' @contact rene.wiegandt(at)mpi-bn.mpg.de
png_to_pdf <- function(png_top, png_list, cluster_ids, new_id, out = "cluster.pdf"){
#png_top <- "G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/04_glam2/Cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/04_glam2/Cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/04_glam2/Cluster_0/logo3.png"
#png_list <- "G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png"
#cluster_ids <- c(1,2,3,4,5,6,7,8,9,10)
top_split <- unlist(strsplit(png_top, ","))
png_split <- unlist(strsplit(png_list, ","))
cluster_list <- unlist(strsplit(cluster_ids, ","))
top_grob <- lapply(top_split,function(p){rasterGrob(readPNG(p),interpolate = FALSE)})
grobs <- lapply(png_split,function(p){rasterGrob(readPNG(p),interpolate = FALSE)})

split_grobs <- split(grobs, rep(1:ceiling(length(grobs)/3), each = 3)[1:length(grobs)])

rows <- lapply(seq(1,length(split_grobs)), function(sg){
arrangeGrob(grobs = split_grobs[[sg]], top = textGrob(paste0("Cluster ",cluster_list[sg]),gp = gpar(fontsize = 20,font = 3)), ncol = 3)
})
split_rows <- split(rows, rep(1:ceiling(length(rows)/4), each = 4)[1:length(rows)])

pdf(out,width = 17, height = 11)
grid.arrange(grobs = top_grob, nrow = 3, top = textGrob(paste0("New Cluster ",new_id),gp = gpar(fontsize = 30,font = 3)))
lapply(split_rows, function(r){
grid.arrange(grobs = r, nrow = 4, top = textGrob("Generated from...",gp = gpar(fontsize = 30,font = 3)))
})

dev.off()

}
#### index_file <- "G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/03_merged_fasta/cluster.txt"


#' Parsing index file.
#'
#' @parameter index_file Path to index file.
#'
#' @author Rene Wiegandt
#' @contact rene.wiegandt(at)mpi-bn.mpg.de
get_index <- function(index_file){
f <- data.table::fread(index_file, header = F)
index <- f[,c(2,8)]
index$V8 <- strsplit(index$V8,",")
return(index[unlist(lapply(index$V8, function(v){ifelse(length(v) > 1, TRUE, FALSE)}))])

}


png_to_pdf_set_up <- function(png_top, png_list, index_file){

index <- get_index(index_file)
t <- lapply(seq(nrow(index)), function(i){
new_id <- index[i,1]
cluster_ids <- index[i,2]
out <- paste0("Summary_cluster_", new_id, ".pdf")
png_to_pdf(png_top = ,png_list = ,cluster_ids = cluster_ids, new_id = new_id, out = out )
})

}


# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell)
if (!interactive()) {
png_to_pdf_set_up(opt$png_new, opt$png_old , opt$index)
}