diff --git a/README.md b/README.md index d080783..3c6e740 100644 --- a/README.md +++ b/README.md @@ -54,14 +54,14 @@ Required arguments: --genome_fasta Path to genome in FASTA-format --motif_db Path to motif-database in MEME-format --config Path to UROPA configuration file - --organism Input organism [hg38 | hg19 | mm9 | mm10] + --organism Input organism [hg38 | hg19 | mm9 | mm10] --out Output Directory (Default: './out/') Optional arguments: --help [0|1] 1 to show this help message. (Default: 0) --gtf_path Path to gtf-file. If path is set the process which creates a gtf-file is skipped. - --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will be skipped. + --tfbs_path Path to directory with tfbsscan output. If given tfbsscan will be skipped. Footprint extraction: --window_length INT This parameter sets the length of a sliding window. (Default: 200) @@ -78,7 +78,7 @@ Optional arguments: Sequence preparation/ reduction: --kmer INT K-mer length (Default: 10) --aprox_motif_len INT Motif length (Default: 10) - --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. + --motif_occurrence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. --min_seq_length Interations Remove all sequences below this value. (Default: 10) Clustering: --global INT Global (=1) or local (=0) alignment. (Default: 0) @@ -96,14 +96,17 @@ Optional arguments: --tomtom_treshold FLOAT Threshold for similarity score. (Default: 0.01) --best_motif INT Get the best X motifs per cluster. (Default: 3) --gap_penalty INT Set penalty for gaps in GLAM2 (Default: 1000) + --seed Set seed for GLAM2 (Default: 123456789) Moitf clustering: - --cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Default: 0) + --cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0) --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) Creating GTF: --tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON config + Evaluation: + --max_uropa_runs INT Maximum number UROPA runs running parallelized (Default: 10) All arguments can be set in the configuration files ``` diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index aa97c95..3d25631 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -5,10 +5,10 @@ # desciption: This scripts uses bedtools to find new, non overlapping regions # between input data in BED-format and a group of known motifs in -# BED-format. +# BED-format. # For parameter description, run the script without parameters or -h. -# The output is a file with the filtered footprints and the log file compareBed.stats +# The output is a file with the filtered footprints and the log file compareBed.log # One R scripts is used, compareBed_runinfo.R, stored in the same directory. # Parameter values (file/directory paths) must not contain the | symbol, because it is chosen as "sed"-field separator. @@ -124,7 +124,7 @@ then echo " -o --output output path/file ; default dir current directory and filename is newMotifs.bed and newMotifs.bed.fasta" echo " -h --help shows this help message" echo "" - echo " Note: file paths must not contain the '|' pipe symbol" + echo " Note: file paths must not contain the '|' pipe symbol" exit 0 fi @@ -188,7 +188,7 @@ cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed # motiffiles either from a directory OR comma separated list if [ -d "$motifs" ] then - # creates an array of all files with bed in its name in the directory $motifs + # creates an array of all files with bed in its name in the directory $motifs declare -a motiffiles=(`ls $motifs | grep -i '.bed$' | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`) # the else case means, that the motiffiles were passed comma separated with no whitespace. @@ -264,9 +264,9 @@ fi # remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition # also creates a small output file with information about the comparison -Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.stats +Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.log # check if Rscript executed without errors -if [ $? -gt 0 ] +if [ $? -gt 0 ] then exit 1 fi @@ -279,15 +279,15 @@ then # add initial number of footprints to the log file fp_initial=`cat $data | wc -l` fp_initial=`expr $fp_initial - 1` - echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats + echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.log else # output will be overwritten if it exists rm -f $output # add initial number of footprints to the log file - cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats + cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.log fi # add number of footprints after filtering to the log file -cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats +cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.log # add fasta sequences to bed and create fasta file out_fasta=`echo $output | sed "s|.bed$|.fasta|g"` diff --git a/bin/2.2_motif_estimation/get_motif_seq.R b/bin/2.2_motif_estimation/get_motif_seq_id.R similarity index 94% rename from bin/2.2_motif_estimation/get_motif_seq.R rename to bin/2.2_motif_estimation/get_motif_seq_id.R index bbcaff6..ca07eb6 100644 --- a/bin/2.2_motif_estimation/get_motif_seq.R +++ b/bin/2.2_motif_estimation/get_motif_seq_id.R @@ -68,7 +68,7 @@ create_seq_json <- function(input, output, num, tmp_path, cluster_id) { # Create json file ## naming - names(datalist) <- paste0(c("Motif_", "Motif_", "Motif_"),seq(1,as.numeric(num),1) , " Cluster_", cluster_id) + names(datalist) <- paste0("Motif_", seq(1,as.numeric(num),1), " Cluster_", cluster_id) ## creating json object json <- RJSONIO::toJSON(datalist, pretty = T , .withNames = T) ## writing file diff --git a/bin/2.2_motif_estimation/merge_similar_clusters.R b/bin/2.2_motif_estimation/merge_similar_clusters.R index d9b2373..4cf8c47 100644 --- a/bin/2.2_motif_estimation/merge_similar_clusters.R +++ b/bin/2.2_motif_estimation/merge_similar_clusters.R @@ -1,5 +1,6 @@ #!/usr/bin/env Rscript if (!require(optparse)) install.packages("optparse"); library(optparse) +if (!require(igraph)) install.packages("igraph"); library(igraph) option_list <- list( make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from merged tomtom results", metavar = "character"), @@ -40,37 +41,39 @@ merge_similar <- function(tsv_in, file_list, min_weight){ # remove rows if column 1 is identical to column 2 edgelist <- sim_not_unique[query_cluster != target_cluster] - + unique_edgelist <- unique(edgelist) + # create graph from data.frame - g <- igraph::graph_from_edgelist(as.matrix(edgelist)) - # converting graph to adjacency matrix - adj_matrix <- igraph::get.adjacency(g, names = T) - # generating weighted graph from adjacency matrix - g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T) + g <- graph_from_edgelist(as.matrix(unique(edgelist)), directed = F) + g_tidy <- delete.vertices(simplify(g), degree(g) == 0) + g_tidy <- set_vertex_attr(g_tidy,"name",V(g_tidy), value = (sort(unique(c(unique_edgelist$V1,unique_edgelist$V2)))) - 1) - # get subgraphs from graph with edges of weight > min_weight - s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight > min_weight], del = F) + # plotting png('motif_clusters.png') - plot(s1) + plot(g_tidy) dev.off() - clust <- igraph::clusters(s1) + clust <- clusters(g) if (clust$no < 1) { b <- lapply(files, function(f){ system(paste("cat",f,">",basename(f))) }) } + clust_out <- file("cluster.txt") # merge FASTA-files depending on the clustered graphs - a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){ + a <- vapply(seq(from = 1, to = clust$no, by = 1), function(i){ #get vector with cluster ids, which are clustered together in "motif cluster" i cl <- as.vector(which(clust$membership %in% c(i))) #create string, which represents a list, containing all FASTA-files to be merged fasta_list <- paste(files[cl], collapse = " ") #create name for merged FASTA-file - name <- paste0("Cluster_",i,".fasta") + name <- paste0("Cluster_",i - 1 ,".fasta") #merge fasta files system(paste("cat",fasta_list,">",name)) - }) + paste("Cluster",i - 1,"is generated by merging cluster:",paste(cl - 1,collapse = ",")) + }, "") + writeLines(a[a != ""], con = clust_out) } + # run function merge_similar with given parameteres if not in interactive context (e.g. run from shell) diff --git a/bin/2.2_motif_estimation/png_to_pdf.R b/bin/2.2_motif_estimation/png_to_pdf.R new file mode 100644 index 0000000..7c86920 --- /dev/null +++ b/bin/2.2_motif_estimation/png_to_pdf.R @@ -0,0 +1,94 @@ +#!/usr/bin/env Rscript +if (!require(optparse)) install.packages("optparse"); library(optparse) +if (!require(png)) install.packages("png"); library(png) +if (!require(grid)) install.packages("grid"); library(grid) +if (!require(gridExtra)) install.packages("gridExtra"); library(gridExtra) + + +option_list <- list( + make_option(opt_str = c("-i", "--png_new"), default = NULL, help = "List of png paths separated by \',\'", metavar = "character"), + make_option(opt_str = c("-p", "--png_old"), default = NULL, help = "List of png paths separated by \',\'", metavar = "character"), + make_option(opt_str = c("-f", "--index"), default = NULL, help = "Path to index file", metavar = "character") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Generating a PDF from multiple png files", + epilogue = "Author: Rene Wiegandt ") + +opt <- parse_args(opt_parser) + + + +#' Generating a PDF from multiple png files. +#' +#' @parameter png_new List of png paths separated by , +#' @parameter png_old List of png paths separated by , +#' @parameter cluster_ids List of IDs separated by , +#' @parameter new_id New Cluster ID +#' +#' @author Rene Wiegandt +#' @contact rene.wiegandt(at)mpi-bn.mpg.de +png_to_pdf <- function(png_top, png_list, cluster_ids, new_id, out = "cluster.pdf"){ + #png_top <- "G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/04_glam2/Cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/04_glam2/Cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/04_glam2/Cluster_0/logo3.png" + #png_list <- "G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png" + #cluster_ids <- c(1,2,3,4,5,6,7,8,9,10) + top_split <- unlist(strsplit(png_top, ",")) + png_split <- unlist(strsplit(png_list, ",")) + cluster_list <- unlist(strsplit(cluster_ids, ",")) + top_grob <- lapply(top_split,function(p){rasterGrob(readPNG(p),interpolate = FALSE)}) + grobs <- lapply(png_split,function(p){rasterGrob(readPNG(p),interpolate = FALSE)}) + + split_grobs <- split(grobs, rep(1:ceiling(length(grobs)/3), each = 3)[1:length(grobs)]) + + rows <- lapply(seq(1,length(split_grobs)), function(sg){ + arrangeGrob(grobs = split_grobs[[sg]], top = textGrob(paste0("Cluster ",cluster_list[sg]),gp = gpar(fontsize = 20,font = 3)), ncol = 3) + }) + split_rows <- split(rows, rep(1:ceiling(length(rows)/4), each = 4)[1:length(rows)]) + + pdf(out,width = 17, height = 11) + grid.arrange(grobs = top_grob, nrow = 3, top = textGrob(paste0("New Cluster ",new_id),gp = gpar(fontsize = 30,font = 3))) + lapply(split_rows, function(r){ + grid.arrange(grobs = r, nrow = 4, top = textGrob("Generated from...",gp = gpar(fontsize = 30,font = 3))) + }) + + dev.off() + +} +#### index_file <- "G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/03_merged_fasta/cluster.txt" + + +#' Parsing index file. +#' +#' @parameter index_file Path to index file. +#' +#' @author Rene Wiegandt +#' @contact rene.wiegandt(at)mpi-bn.mpg.de +get_index <- function(index_file){ + f <- data.table::fread(index_file, header = F) + index <- f[,c(2,8)] + index$V8 <- strsplit(index$V8,",") + return(index[unlist(lapply(index$V8, function(v){ifelse(length(v) > 1, TRUE, FALSE)}))]) + +} + + +png_to_pdf_set_up <- function(png_top, png_list, index_file){ + + index <- get_index(index_file) + t <- lapply(seq(nrow(index)), function(i){ + new_id <- index[i,1] + cluster_ids <- index[i,2] + out <- paste0("Summary_cluster_", new_id, ".pdf") + png_to_pdf(png_top = ,png_list = ,cluster_ids = cluster_ids, new_id = new_id, out = out ) + }) + +} + + +# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell) +if (!interactive()) { + png_to_pdf_set_up(opt$png_new, opt$png_old , opt$index) +} + + + diff --git a/bin/3.2_evaluation/venn.R b/bin/3.2_evaluation/venn.R new file mode 100644 index 0000000..94d8cd2 --- /dev/null +++ b/bin/3.2_evaluation/venn.R @@ -0,0 +1,87 @@ +#!/usr/bin/env Rscript + +### master JLU 2018 Modul 3.2 Motif Evaluation ### +# author: afust +# capabilities: +# ratio: sequenced used to create motif vs sites found by rescan + +# dependencies +suppressMessages(library(data.table, quietly = TRUE)) +suppressMessages(library(VennDiagram, quietly = TRUE)) +suppressMessages(library(RJSONIO, quietly = TRUE)) +suppressMessages(library(getopt, quietly = TRUE)) + +# ---------------------------- options ----------------------------------------- # +# 0 flag +# 1 mandatory parameter +# 2 optional parameter +options(warn = 1) +options <- matrix(c( +# tfbs is either the directory including all accessible regions +# or the directory including precalculated score matrices etc +"cluster_dir", "i", 1, "character", "directory with rescan results", +"pattern", "p", 1, "character", "cluster pattern, e.g. 'Cluster_96'", +"seq_dir", "l", 1, "character", "directory with fp assignment per cluster", +"clustered_fp", "c", 1, "character", "clustering results", +"outdir", "o", 1, "character", "output directory", +"help", "h", 0, "logical", "Provides command line help" +), byrow = TRUE, ncol = 5) +opt <- getopt(options) + +# ---------------------------- options processing ----------------------------- # +# check for mandatory parameters and print help if required +if (!is.null(opt$help) ){ + cat(getopt(options, usage = TRUE), file = stderr()); q(status = 0) +} + +# load input + +# list used files: due to rescan script there is one output per motif for each cluster +# all should be used in one plot result -> list files with pattern +rescan_files <- list.files(opt$cluster_dir, pattern = opt$pattern, ignore.case = FALSE) +# list of fps representing a motif +fp_list <- fromJSON(opt$seq_dir) + +# clustering information needed to extract genomic locations of fps +# -> reduce to current cluster +dt.clustering <- fread(opt$clustered_fp, sep="\t", header = TRUE) +colnames(dt.clustering) <- sub("#chr", "chr", colnames(dt.clustering)) +cl <- as.numeric(strsplit(opt$pattern, "_")[[1]][2]) +dt.clustering <- dt.clustering[dt.clustering$cluster == cl, ] + +# pdf including venn diagrams for all motifs +pdf(paper="a4", file=file.path(opt$outdir, paste0("Cluster_",cl,".pdf")), useDingbats = FALSE) +plot.new() +for(f in rescan_files){ + # load genomic locations found by rescan with motif + dt.rescan <- fread(file.path(opt$cluster_dir,f), sep="\t", header = FALSE, select = c(1:4), stringsAsFactors=FALSE) + colnames(dt.rescan) <- c("chr", "start", "end", "name") + + # extract fps that build up current motif + tmp <- strsplit(as.character(dt.rescan[1,"name"]), "_")[[1]] + cluster <- paste(tmp[1],tmp[2],sep="_") + id <- paste(paste("Motif", tmp[3], sep="_"), cluster, sep=" ") + #cat("\nfor",id, "there are", length(unname(unlist(fp_list[id]))), "sequences assigned") + # reduce cluster info to sites used for current motif to find overlaps + dt.motif_seq <- dt.clustering[dt.clustering$name %in% unname(unlist(fp_list[id])),c("chr","start", "end")] + setkeyv(dt.rescan, c("chr","start","end")) + setkeyv(dt.motif_seq, c("chr","start","end")) + dt.overlaps <- foverlaps(dt.motif_seq, dt.rescan, mult="first") + dt.overlaps <- dt.overlaps[complete.cases(dt.overlaps),] + dt.overlaps <- dt.overlaps[!duplicated(dt.overlaps$i.start),] + # number of locations found in both: motif setup sequences and rescan + hits <- nrow(dt.overlaps) + #cat("\nnumbers:\nregions assignet to motif", nrow(dt.motif_seq), "\nregions found by rescan", nrow(dt.rescan), "\noverlaps", hits) + # draw venn + draw.pairwise.venn(area1 = nrow(dt.motif_seq), area2 = nrow(dt.rescan), cross.area = hits, + category = c(paste0("related motif sites of ",cluster), paste0("rescan with ", id)), + lty = rep("blank", 2), scaled = TRUE, + fill= c("blue", "red"), alpha = c(0.5,0.5), + cat.pos = c(0, 0), cat.dist = rep(0.025, 2) ) + title(id) + if(f != tail(rescan_files,1)){ + grid.newpage() + } +} + +invisible(dev.off()) diff --git a/config/cluster.config b/config/cluster.config index 1ba0b33..9ca8601 100644 --- a/config/cluster.config +++ b/config/cluster.config @@ -2,7 +2,7 @@ params{ //reduce_bed kmer=10 aprox_motif_len=10 - motif_occurence=1 + motif_occurrence=1 min_seq_length=10 //cdhit_wrapper diff --git a/config/motif_estimation.config b/config/motif_estimation.config index c829265..e82c4c6 100644 --- a/config/motif_estimation.config +++ b/config/motif_estimation.config @@ -1,11 +1,12 @@ params { //bed_to_clustered_fasta - min_seq = 10 + min_seq = 100 //glam2 motif_min_key = 8 motif_max_key = 20 iteration = 10000 + gap_penalty = 1000 //tomtom tomtom_treshold = 0.01 diff --git a/config/uropa.config b/config/uropa.config index 3c4a189..0b93f22 100644 --- a/config/uropa.config +++ b/config/uropa.config @@ -1,16 +1,8 @@ { -"queries":[ - {"feature":"", - "feature.anchor": "", - "distance": [ , ], - "strand":"", - "direction":"", - "internals":"", - "filter.attribute":"", - "attribute.value":"", - "show.attributes":"" } - ], -"priority": "", -"gtf": "placeholder_gtf", -"bed": "placeholder_bed" -} \ No newline at end of file + "queries":[ + {"feature":"gene","distance":[5000,1000], "show.attributes":"gene_id"}, + {"feature":["ctcf_binding_site","enhancer", "open_chromatin", "promoter", "promoter_flanking_region", "tf_binding_site"], "distance":[1000], "internals":"TRUE"} + ], + "gtf": "placeholder_gtf", + "bed": "placeholder_bed" +} diff --git a/masterenv.yml b/masterenv.yml index dc19c1a..8f2ad34 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -24,3 +24,4 @@ dependencies: - seaborn - crossmap - r-bit64 + - uropa diff --git a/meme_suite.yml b/meme_suite.yml new file mode 100644 index 0000000..2ac242e --- /dev/null +++ b/meme_suite.yml @@ -0,0 +1,5 @@ + +name: meme_suite +dependencies: + - python=2.7 + - meme diff --git a/nextflow.config b/nextflow.config index 653afd6..c935ad0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,11 +1,11 @@ -wd = "" +wd = "/mnt/agnerds/Rene.Wiegandt/repo2/masterJLU2018" createTimeout = 40 -params.threads=60 //Parameter for for scripts! Not for nextflow processes. -params.config="${wd}/config/uropa.config" +params.threads = 10 //Parameter for for scripts! Not for nextflow processes. +params.config = "${wd}/config/uropa.config" env { path_bin = "${wd}/bin" - path_env = "${wd}/masterenv.yml" + meme_env = "${wd}/meme_suite.yml" } includeConfig "${wd}/config/footprint_extraction.config" diff --git a/pipeline.nf b/pipeline.nf index 4e3b4fc..afd3399 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -1,5 +1,6 @@ #!/usr/bin/env nextflow +disable_mo_clu = 1 //setting default values params.bigwig="" @@ -10,6 +11,7 @@ params.tfbs_path="" params.help = 0 params.gtf_path="" + params.gtf2="" params.out = "./out/" //footprint_extraction @@ -24,19 +26,19 @@ params.tfbsscan_method = "moods" //clustering - //reduce_sequence - params.kmer=10 - params.aprox_motif_len=10 - params.motif_occurence=1 - params.min_seq_length=10 - - //cdhit_wrapper - params.global=0 - params.identity=0.8 - params.sequence_coverage=8 - params.memory=800 - params.throw_away_seq=9 - params.strand=0 + //reduce_sequence + params.kmer=10 + params.aprox_motif_len=10 + params.motif_occurrence=1 + params.min_seq_length=10 + + //cdhit_wrapper + params.global=0 + params.identity=0.8 + params.sequence_coverage=8 + params.memory=800 + params.throw_away_seq=9 + params.strand=0 //motif_estimation //bed_to_clustered_fasta @@ -47,6 +49,7 @@ params.motif_max_key = 20 // Maximum number of key positions (aligned columns) params.iteration = 10000 // Number of Iterations done by glam2. A high iteration number equals a more accurate result but with an higher runtime. params.gap_penalty = 1000 + params.seed = 123456789 //tomtom params.tomtom_treshold = 0.01 // threshold for similarity score. @@ -62,94 +65,103 @@ params.organism="" params.tissues="" -if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" != "0" ){ -log.info """ -Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file] - -Required arguments: - --bigwig Path to BigWig-file - --bed Path to BED-file - --genome_fasta Path to genome in FASTA-format - --motif_db Path to motif-database in MEME-format - --config Path to UROPA configuration file - --organism Input organism [hg38 | hg19 | mm9 | mm10] - --out Output Directory (Default: './out/') - -Optional arguments: - - --help [0|1] 1 to show this help message. (Default: 0) - --gtf_path Path to gtf-file. If path is set the process which creates a gtf-file is skipped. - --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will be skipped. - - Footprint extraction: - --window_length INT This parameter sets the length of a sliding window. (Default: 200) - --step INT This parameter sets the number of positions to slide the window forward. (Default: 100) - --percentage INT Threshold in percent (Default: 0) - --min_gap INT If footprints are less than X bases apart the footprints will be merged (Default: 6) - - Filter motifs: - --min_size_fp INT Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10) - --max_size_fp INT Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 200) - --tfbsscan_method [moods|fimo] Method used by tfbsscan. (Default: moods) - - Cluster: - Sequence preparation/ reduction: - --kmer INT K-mer length (Default: 10) - --aprox_motif_len INT Motif length (Default: 10) - --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. - --min_seq_length Interations Remove all sequences below this value. (Default: 10) - Clustering: - --global INT Global (=1) or local (=0) alignment. (Default: 0) - --identity FLOAT Identity threshold. (Default: 0.8) - --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) - --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) - --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) - --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) - - Motif estimation: - --min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100) - --motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8) - --motif_max_key INT Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20) - --iteration INT Number of iterations done by GLAM2. More Iterations: better results, higher runtime. (Default: 10000) - --tomtom_treshold FLOAT Threshold for similarity score. (Default: 0.01) - --best_motif INT Get the best X motifs per cluster. (Default: 3) - --gap_penalty INT Set penalty for gaps in GLAM2 (Default: 1000) - Moitf clustering: - --cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0) - --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) - --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) - - Creating GTF: - --tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON - config -All arguments can be set in the configuration files - ``` -""" -System.exit(2) +//evaluation + params.max_uropa_runs = 10 + +if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || params.gtf2 == "" || "${params.help}" != "0" ) { + log.info """ + Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file] + + Required arguments: + --bigwig Path to BigWig-file + --bed Path to BED-file + --genome_fasta Path to genome in FASTA-format + --motif_db Path to motif-database in MEME-format + --config Path to UROPA configuration file + --organism Input organism [hg38 | hg19 | mm9 | mm10] + --out Output Directory (Default: './out/') + + Optional arguments: + + --help [0|1] 1 to show this help message. (Default: 0) + --gtf_path Path to gtf-file. If path is set the process which creates a gtf-file is skipped. + --tfbs_path Path to directory with tfbsscan output. If given tfbsscan will be skipped. + + Footprint extraction: + --window_length INT This parameter sets the length of a sliding window. (Default: 200) + --step INT This parameter sets the number of positions to slide the window forward. (Default: 100) + --percentage INT Threshold in percent (Default: 0) + --min_gap INT If footprints are less than X bases apart the footprints will be merged (Default: 6) + + Filter motifs: + --min_size_fp INT Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10) + --max_size_fp INT Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 200) + --tfbsscan_method [moods|fimo] Method used by tfbsscan. (Default: moods) + + Cluster: + Sequence preparation/ reduction: + --kmer INT K-mer length (Default: 10) + --aprox_motif_len INT Motif length (Default: 10) + --motif_occurrence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. + --min_seq_length Interations Remove all sequences below this value. (Default: 10) + Clustering: + --global INT Global (=1) or local (=0) alignment. (Default: 0) + --identity FLOAT Identity threshold. (Default: 0.8) + --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) + --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) + --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) + --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) + + Motif estimation: + --min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100) + --motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8) + --motif_max_key INT Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20) + --iteration INT Number of iterations done by GLAM2. More Iterations: better results, higher runtime. (Default: 10000) + --tomtom_treshold FLOAT Threshold for similarity score. (Default: 0.01) + --best_motif INT Get the best X motifs per cluster. (Default: 3) + --gap_penalty INT Set penalty for gaps in GLAM2 (Default: 1000) + --seed Set seed for GLAM2 (Default: 123456789) + Moitf clustering: + --cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0) + --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) + --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) + + Creating GTF: + --tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON + config + Evaluation: + --max_uropa_runs INT Maximum number UROPA runs running parallelized (Default: 10) + All arguments can be set in the configuration files + ``` + """ + System.exit(2) } else { Channel.fromPath(params.bigwig).map {it -> [it.simpleName, it]}.set {bigwig_input} - Channel.fromPath(params.bed).into {bed_input; bed_for_tfbsscan} + Channel.fromPath(params.bed).into {bed_input; bed_for_tfbsscan; bed_for_rescan} Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2} Channel.fromPath(params.motif_db).into {db_for_motivscan; db_for_tomtom} Channel.fromPath(params.config).set {config} + Channel.fromPath(params.gtf2).set {gtf_to_merge} if (params.tfbs_path != ""){ known_tfbs = file(params.tfbs_path).toAbsolutePath() } } + + /* Checking for parameter input! */ int_params = ["window_length", "step", "min_size_fp", "max_size_fp", "kmer", - "aprox_motif_len", "motif_occurence", "min_seq_length", "global", + "aprox_motif_len", "motif_occurrence", "min_seq_length", "global", "sequence_coverage", "memory", "throw_away_seq", "strand", "min_seq", "motif_min_key", "motif_max_key", "iteration", "edge_weight", "best_motif", "min_gap", "gap_penalty", "edge_weight", - "threads", ] + "threads", "max_uropa_runs"] req_params = ["bigwig", "bed", "genome_fasta", "motif_db", "config"] all_params = int_params + req_params + ["organism" , "identity", "tfbsscan_method", "percentage", "tomtom_treshold", "motif_similarity_thresh", "out", - "tissues", "gtf_path", "cluster_motif", "tfbs_path", "help"] + "tissues", "gtf_path", "cluster_motif", "tfbs_path", "help", "gtf2", "seed"] valid_organism = ["hg38", "hg19", "mm9", "mm10"] valid_tfbsscan_methods = ["moods","fimo"] @@ -212,7 +224,7 @@ process footprint_extraction { tag{name} publishDir "${params.out}/1.1_footprint_extraction/", mode: 'copy', pattern: '*.bed' - publishDir "${params.out}/1.1_footprint_extraction/log", mode: 'copy', pattern: '*.log' + publishDir "${params.out}/log", mode: 'copy', pattern: '*.log' input: set name, file (bigWig), file (bed) from footprint_in @@ -235,7 +247,8 @@ Find postitions of known tfbs with tfbsscan and discard the overlaps with compar */ process overlap_with_known_TFBS { //conda "${path_env}" - publishDir "${params.out}/1.2_filter_motifs", mode :'copy' + publishDir "${params.out}/1.2_filter_motifs", mode :'copy', pattern: '*_unknown.bed' + publishDir "${params.out}/log", mode: 'copy', pattern: '*.log' tag{name} errorStrategy 'finish' @@ -245,7 +258,7 @@ process overlap_with_known_TFBS { output: set name, file ("${name}_unknown.bed") into bed_for_reducing file ('./known_tfbs/*.bed') optional true - file ('*.stats') + file ('*.log') script: if(params.tfbs_path == ""){ @@ -280,7 +293,7 @@ process reduce_sequence { tag{name} publishDir "${params.out}/2.1_clustering/reduced_bed/", mode: 'copy', pattern: '*.bed' - publishDir "${params.out}/2.1_clustering/log", mode: 'copy', pattern: '*.log' + publishDir "${params.out}/log", mode: 'copy', pattern: '*.log' input: set name, file (bed) from bed_for_reducing @@ -291,7 +304,7 @@ process reduce_sequence { script: """ - Rscript ${path_bin}/2.1_clustering/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} --summary reduce_sequence.log + Rscript ${path_bin}/2.1_clustering/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurrence} -s ${params.min_seq_length} --summary reduce_sequence.log """ } @@ -303,13 +316,14 @@ process clustering { //conda "${path_env}" publishDir "${params.out}/2.1_clustering/", mode: 'copy', pattern: '*.bed' - publishDir "${params.out}/2.1_clustering/log", mode: 'copy', pattern: '*.log' + publishDir "${params.out}/log", mode: 'copy', pattern: '*.log' input: set name, file (bed) from bed_for_clustering output: set name, file ('*.bed') into bed_for_motif_esitmation + file ('*.bed') into bed_for_venn file ('*.log') script: @@ -325,21 +339,21 @@ Converting BED-File to one FASTA-File per cluster process bed_to_clustered_fasta { //conda "${path_env}" - publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy', pattern: '*.FASTA' - tag{name} + publishDir "${params.out}/2.2_motif_estimation/01_fasta/", mode: 'copy', pattern: '*.FASTA' + tag{name} - input: - set name, file (bed) from bed_for_motif_esitmation + input: + set name, file (bed) from bed_for_motif_esitmation - output: - file ('*.FASTA') into fasta_for_glam2 - file ('*.FASTA') into fasta_for_motif_cluster + output: + file ('*.FASTA') into fasta_for_glam2 + file ('*.FASTA') into fasta_for_motif_cluster file ('*.log') into log22 - script: - """ - Rscript ${path_bin}/2.2_motif_estimation/bed_to_fasta.R -i ${bed} -p ${name} -m ${params.min_seq} - """ + script: + """ + Rscript ${path_bin}/2.2_motif_estimation/bed_to_fasta.R -i ${bed} -p ${name} -m ${params.min_seq} + """ } @@ -354,35 +368,36 @@ Generating Motifs through alignment and scoring best local matches. */ process glam2 { - tag{name} - publishDir "${params.out}/2.2_motif_estimation/glam2/", mode: 'copy' - //conda "${path_env}" + tag{name} + publishDir "${params.out}/2.2_motif_estimation/02_glam2/", mode: 'copy' + conda "${meme_env}" - input: - set name, file (fasta) from fasta_for_glam2 + input: + set name, file (fasta) from fasta_for_glam2 output: - file("${name}/*.meme") into meme_to_merge - set name, file("${name}/*.meme") into meme_for_tomtom - set name, file("${name}/*.meme") into meme_to_search_in_merged - set name, file("${name}/*.meme") into meme_for_filter + file("${name}/*.meme") into meme_to_merge + set name, file("${name}/*.meme") into meme_for_tomtom, meme_to_search_in_merged, meme_for_filter set name, file("${name}/*.txt") into glam_for_seq + set name, file("${name}/logo[1-9]*.png") into motif_png file ('*') - script: - """ - glam2 n ${fasta} -O ./${name}/ -E ${params.gap_penalty} -J ${params.gap_penalty} -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} - """ + script: + """ + glam2 n ${fasta} -O ./${name}/ -E ${params.gap_penalty} -J ${params.gap_penalty} -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} -s ${params.seed} + """ } +//motif_png.toSortedList()).println() + /* Combining all MEME-files to one big MEME-file. The paths are sorted numerically depending on the cluster ID. */ process merge_meme { - publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_meme/", mode: 'copy' - //conda "${path_env}" + publishDir "${params.out}/2.2_motif_estimation/03_motif_cluster/", mode: 'copy' + // conda "/home/rwiegan/.conda/envs/meme" input: val (memelist) from meme_to_merge.toList() @@ -392,6 +407,7 @@ process merge_meme { when: params.cluster_motif == 1 + disable_mo_clu == 0 script: //sorting @@ -401,6 +417,7 @@ process merge_meme { //create list for bash meme_list = meme_sorted_full.toString().replaceAll(/\,|\[|\]/,"") """ + set +u; source activate meme; meme2meme ${meme_list} > merged_meme.meme """ } @@ -411,11 +428,11 @@ to_find_similar_motifs = meme_to_search_in_merged.combine(merged_meme) Running Tomtom on merged meme-files. Output table has the information which clusters are similar to each other. */ -process find_similar_motifs { +process compare_cluster { tag{name} - publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/cluster_similarity/", mode: 'copy' - //conda "${path_env}" + publishDir "${params.out}/2.2_motif_estimation/03_motif_cluster/01_tomtom/", mode: 'copy' + conda "${meme_env}" input: set name, file (meme) ,file (merged_meme) from to_find_similar_motifs @@ -438,7 +455,7 @@ Label first column of TSV-file with Cluster ID process label_cluster { tag{name} - //conda "${path_env}" + publishDir "${params.out}/2.2_motif_estimation/03_motif_cluster/02_labeled_tsv/", mode: 'copy' input: set name, file (tsv) from motif_similarity @@ -450,17 +467,19 @@ process label_cluster { params.cluster_motif == 1 script: + id = name.split('_')[-1].toInteger() + 1 """ - Rscript ${path_bin}/2.2_motif_estimation/label_cluster.R -i ${tsv} -o ${name}_labeled.tsv + tail -n +2 ${tsv} | awk -F \$'\t' 'BEGIN {OFS = FS}(\$1=\$1".${id}")' > ${name}_labeled.tsv """ } + /* Merging tsv files_for_merge_fasta */ process merge_labeled_tsv { - publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/03_motif_cluster/", mode: 'copy' input: val (tsvlist) from labeled_tsv.toSortedList { it.toString().tokenize('_')[-2] as Integer } @@ -474,7 +493,7 @@ process merge_labeled_tsv { script: tsvs = tsvlist.toString().replaceAll(/\,|\[|\]/,"") """ - echo -e 'Query_ID\tTarget_ID\tOptimal_offset\tp-value\tE-value\tq-value\tOverlap\tQuery_consensus\tTarget_consensus\tOrientation'> merged_labeled.tsv + echo -e 'Query_ID\tTarget_ID\tOptimal_offset\tp-value\tE-value\tq-value\tOverlap\tQuery_consensus\tTarget_consensus\tOrientation' > merged_labeled.tsv for i in $tsvs; do cat \$i >> merged_labeled.tsv done @@ -490,23 +509,23 @@ Merging FASTA-files of similar clusters process merge_fasta { //conda "${path_env}" - publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_fasta/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/03_motif_cluster/03_merged_fasta/", mode: 'copy' input: set file (motiv_sim), val (fasta_list) from files_for_merge_fasta output: file ('*.fasta') into motif_clustered_fasta_list - file('*.png') + file('*') - when: - params.cluster_motif == 1 + when: + params.cluster_motif == 1 script: - fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer } - fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"") + fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer } + fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"") """ - Rscript ${path_bin}/2.2_motif_estimation/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight} + Rscript ${path_bin}/2.2_motif_estimation/merge_similar_clusters.R -i ${motiv_sim} -l ${fastalist} -w ${params.edge_weight} """ } @@ -518,26 +537,25 @@ Run GLAM2 on emrged FASTA-files */ process clustered_glam2 { - publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/glam2/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/03_motif_cluster/04_glam2/", mode: 'copy' tag{name} - //conda "${path_env}" + conda "${meme_env}" input: file (fasta) from motif_clustered_fasta_flat output: - set name, file ("/${name}/*.meme") into clustered_meme_for_tomtom - set name, file ("/${name}/*.meme") into clustered_meme_for_filter - set name, file("/${name}/glam2.txt") into clust_glam_for_seq - file('*') + set name, file ("./${name}/glam2.meme") into clustered_meme_for_tomtom, clustered_meme_for_filter + set name, file("./${name}/glam2.txt") into clust_glam_for_seq + file('*') when: params.cluster_motif == 1 script: - name = fasta.getBaseName() + name = fasta.getBaseName() """ - glam2 n ${fasta} -O ./${name}/ -E ${params.gap_penalty} -J ${params.gap_penalty} -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} + glam2 n ${fasta} -O ./${name}/ -E ${params.gap_penalty} -J ${params.gap_penalty} -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} -s ${params.seed} """ } @@ -562,20 +580,19 @@ Tomtom searches motifs in databases. */ process tomtom { - tag{name} - publishDir "${params.out}/2.2_motif_estimation/tomtom/", mode: 'copy' - //conda "${path_env}" + tag{name} + publishDir "${params.out}/2.2_motif_estimation/04_tomtom/", mode: 'copy' - input: - set name, file (meme) from for_tomtom + input: + set name, file (meme) from for_tomtom - output: - set name, file ('*.tsv') into tsv_for_filter + output: + set name, file ('*.tsv') into tsv_for_filter - script: - """ - tomtom ${meme} ${params.motif_db} -thresh ${params.tomtom_treshold} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv - """ + script: + """ + tomtom ${meme} ${params.motif_db} -thresh ${params.tomtom_treshold} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv + """ } @@ -597,7 +614,7 @@ log22.combine(count_cluster).combine(count_cluster_before_filter ).set {log22_fi //Writes log file for 2.2_motif_estimation process write_log_for_motif_estimation { - publishDir "${params.out}/2.2_motif_estimation/log/", mode: 'copy' + publishDir "${params.out}/log/", mode: 'copy' input: set file (logfile), after_filter, before_filter from log22_final @@ -613,20 +630,20 @@ process write_log_for_motif_estimation { """ } + //If channel 'check' is empty print errormessage process check_for_unknown_motifs { - echo true - - input: - val x from check.ifEmpty('EMPTY') + echo true - when: - x == 'EMPTY' + input: + val x from check.ifEmpty('EMPTY') - """ - echo '>>> STOPPED: No unknown Motifs were found.' - """ + when: + x == 'EMPTY' + """ + echo '>>> STOPPED: No unknown Motifs were found.' + """ } @@ -635,13 +652,13 @@ process get_best_motif { //conda "${path_env}" tag{name} - publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/05_best_unknown_motifs/motifs/", mode: 'copy' input: set name, file(meme), file(tsv) from meme_for_scan output: - set name, file('*_best.meme') into best_motif + set name, file('*_best.meme') into best_motif, best_motif_for_venn script: cluster_id = name.split('_')[-1] @@ -654,56 +671,73 @@ process get_best_motif { /* Get the sequence names used to create best X Motifs as a JSON-file */ -process get_best_motif_seq { +process get_best_motif_seq_id { //conda "${path_env}" tag{name} - publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/motif_sequences/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/05_best_unknown_motifs/motif_sequence_ids/", mode: 'copy' input: set name, file (txt) from seq_to_json output: - file("${name}_seq.json") + set name, file("${name}_seq.json") into seq_id_for_venn script: cluster_id = name.split('_')[-1] """ - Rscript ${path_bin}/2.2_motif_estimation/get_motif_seq.R -i ${txt} -o ${name}_seq.json -n ${params.best_motif} -t ./motif_seq_tmp/cluster_${cluster_id} -c ${cluster_id} + Rscript ${path_bin}/2.2_motif_estimation/get_motif_seq_id.R -i ${txt} -o ${name}_seq.json -n ${params.best_motif} -t ./motif_seq_tmp/cluster_${cluster_id} -c ${cluster_id} """ } -best_motif.combine(fa_scan).set {files_for_genome_scan} +best_motif.combine(fa_scan).combine(bed_for_rescan).set {files_for_re_scan} /* -process genome_scan { +*/ +process re_scan { //conda "${path_env}" + tag{name} + publishDir "${params.out}/3.2_evaluation/rescan/${name}", mode: 'copy' input: - set name, file(meme), file(fasta) from files_for_genome_scan + set name, file(meme), file(fasta), file (peaks) from files_for_re_scan output: - file ('.bed') into bed_for_uropa, bed_for_cluster_quality + set name, val ('link') into link_re_scan_to_venn + file ("*.bed") into re_scan_for_uropa script: """ + mkdir -p ./tmp/re_scan/${name} + python ${path_bin}/1.2_filter_motifs/tfbsscan.py -m ${meme} -g ${fasta} -b ${peaks} -o . --cores 4 --use moods --resolve_overlaps + mkdir -p ${workflow.workDir}/tmp/re_scan_final + cp ./Cluster*.bed ${workflow.workDir}/tmp/re_scan_final """ } -process cluster_quality { +seq_id_for_venn.combine(link_re_scan_to_venn, by: 0).combine(bed_for_venn).set {for_venn} + +/* +*/ +process venn { + + tag{name} + publishDir "${params.out}/3.2_evaluation/venn", mode: 'copy' input: - file (bed) from bed_for_cluster_quality + set name, file(seq_ids), link, file (clustered_bed) from for_venn output: - file ('*.bed') into bed_for_final_filter + file ('*.pdf') script: + cluster_id = name.split('_')[-1] """ + Rscript ${path_bin}/3.2_evaluation/venn.R -i ${workflow.workDir}/tmp/re_scan_final/ -p Cluster_${cluster_id}_ -l ${seq_ids} -c ${clustered_bed} -o . """ -} */ +} process create_GTF { @@ -723,60 +757,83 @@ process create_GTF { """ } -if (params.gtf_path == "") { - gtf_for_uropa = gtf -} else { - gtf_for_uropa = Channel.fromPath(params.gtf_path) -} -/* -bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in} +gtf.combine(gtf_to_merge).set {gtf_list} -// Create configuration file for UROPA. -// Takes template and replaces bed- and gtf-placeholders with actual paths. -process create_uropa_config { +process merge_gtf { - publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy' + stageInMode 'copy' + publishDir "${params.out}/3.1_create_gtf/", mode: 'copy' input: - set val(bed), val(gtf) from uropa_in.toList() - file (conf) from config + set file (gtf_reg), file (gtf_genes) from gtf_list output: - file ('uropa.config') into uropa_config + file ('merged.gtf') into merged_gtf + + when: + params.gtf_path == "" script: """ - sed -- 's/placeholder_gtf/${gtf}/g; s/placeholder_bed/${bed}/g' ${conf} > uropa.config.final + cat ${gtf_genes} > merged.gtf + cat ${gtf_reg} >> merged.gtf """ } +if (params.gtf_path == "") { + gtf_for_uropa = merged_gtf +} else { + gtf_for_uropa = Channel.fromPath(params.gtf_path) +} + +re_scan_for_uropa.flatten().combine(gtf_for_uropa).combine(config).set {uropa_in} -process UROPA { +// Create configuration file for UROPA. +// Takes template and replaces bed- and gtf-placeholders with actual paths. +process create_uropa_config { + + tag{bed} + echo true + publishDir "${params.out}/3.2_evaluation/uropa/config", mode: 'copy' input: - file (config) from uropa_config + set val(bed), val (gtf), file (conf) from uropa_in output: - set file ("*_allhits.txt"), file ("*_finalhits.txt") into uropa_for_filter + set name, file ("${name}_uropa.config") into uropa_config script: + name = bed.getBaseName() """ + sed -- 's=placeholder_gtf=${gtf}=g; s=placeholder_bed=${bed}=g' ${conf} > ${name}_uropa.config """ } -process filter { +process UROPA { + + tag{config} + maxForks params.max_uropa_runs + publishDir "${params.out}/3.2_evaluation/uropa", mode: 'copy', pattern: "*.txt" + publishDir "${params.out}/3.2_evaluation/uropa/summary", mode: 'copy', pattern: "*.pdf" + publishDir "${params.out}/log/uropa/${name}", mode: 'copy', pattern: "*.log" input: + set name, file (config) from uropa_config output: + set file ("*_allhits.txt"), file ("*_finalhits.txt") into uropa_for_filter + file("*.pdf") + file("*.log") script: """ + uropa -i ${config} -l uropa.log -t 4 -p ${name} -s -d """ -} */ +} + workflow.onComplete { log.info""" @@ -789,4 +846,6 @@ workDir : ${workflow.workDir} exit status : ${workflow.exitStatus} Error report: ${workflow.errorReport ?: '-'} """ +f = file(workflow.workDir + '/tmp/re_scan_final') +f.deleteDir() }