diff --git a/README.md b/README.md index d080783..317c3c2 100644 --- a/README.md +++ b/README.md @@ -6,19 +6,11 @@ For further information read the [documentation](https://github.molgen.mpg.de/lo ## Dependencies * [conda](https://conda.io/docs/user-guide/install/linux.html) -* [Nextflow](https://www.nextflow.io/) -* [MEME-Suite](http://meme-suite.org/doc/install.html?man_type=web) ## Installation -1. Start with installing all dependencies listed above (Nextflow, conda, MEME-Suite) and downloading all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018). -2. It is required to set the [environment paths for meme-suite](http://meme-suite.org/doc/install.html?man_type=web#installingtar). -this can be done with following commands: -``` -export PATH=[meme-suite instalation path]/libexec/meme-[meme-suite version]:$PATH -export PATH=[meme-suite instalation path]/bin:$PATH -``` +1. Start with installing conda and downloading all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018). -3. Every other dependency will be automatically installed using conda. For that a conda environment has to be created from the yaml-file given in this repository. +2. Every other dependency will be automatically installed using conda. For that a conda environment has to be created from the yaml-file given in this repository. It is required to create and activate the environment from the yaml-file beforehand. This can be done with following commands: ```condsole @@ -26,84 +18,87 @@ conda env create -f masterenv.yml conda activate masterenv ``` -4. Set the wd parameter in the nextflow.config file as path where the repository is saved. For example: '~/masterJLU2018/'. +3. Set the wd parameter in the nextflow.config file as path where the repository is saved. For example: '~/masterJLU2018/'. **Important Notes:** -1. For conda the channel bioconda needs to be set as highest priority! This is required due to two different packages with the same name in different channels. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfish package from the channel conda-forge! +1. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfish package from the channel conda-forge! Please make sure that the right jellyfish package is installed. ## Quick Start ```console -nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --organism [mm10|mm9|hg19|hg38] +nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --organism [mm10|mm9|hg19|hg38] --gtf_annotation [GTF-file] ``` ### Demo run There are files provided inside ./demo/ for a demo run. Go to the main directory and run following command: ``` -nextflow run pipeline.nf --bigwig ./demo/buenrostro50k_chr1_fp.bw --bed ./demo/buenrostro50k_chr1_peaks.bed --genome_fasta ./demo/hg38_chr1.fa --motif_db ./demo/jaspar_vertebrates.meme --out ./demo/buenrostro50k_chr1_out/ --organism hg38 +nextflow run pipeline.nf --bigwig ./demo/buenrostro50k_chr1_fp.bw --bed ./demo/buenrostro50k_chr1_peaks.bed --genome_fasta ./demo/hg38_chr1.fa --motif_db ./demo/jaspar_vertebrates.meme --out ./demo/buenrostro50k_chr1_out/ --organism hg38 --gtf_annotation ./demo/homo_sapiens.94.mainChr.gtf ``` ## Parameters -For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration). +For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/List-of-Parameters). ``` Required arguments: - --bigwig Path to BigWig-file - --bed Path to BED-file - --genome_fasta Path to genome in FASTA-format - --motif_db Path to motif-database in MEME-format - --config Path to UROPA configuration file - --organism Input organism [hg38 | hg19 | mm9 | mm10] - --out Output Directory (Default: './out/') + --bigwig Path to BigWig-file + --bed Path to BED-file + --genome_fasta Path to genome in FASTA-format + --motif_db Path to motif-database in MEME-format + --config Path to UROPA configuration file + --gtf_annotation Path to gtf annotation file + --organism Input organism [hg38 | hg19 | mm9 | mm10] + --out Output Directory (Default: './out/') Optional arguments: - --help [0|1] 1 to show this help message. (Default: 0) - --gtf_path Path to gtf-file. If path is set the process which creates a gtf-file is skipped. - --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will be skipped. + --help [0|1] 1 to show this help message. (Default: 0) + --gtf_merged Path to gtf-file. If path is set the process which creates a gtf-file is skipped. + --tfbs_path Path to directory with tfbsscan output. If given tfbsscan will be skipped. Footprint extraction: - --window_length INT This parameter sets the length of a sliding window. (Default: 200) - --step INT This parameter sets the number of positions to slide the window forward. (Default: 100) - --percentage INT Threshold in percent (Default: 0) - --min_gap INT If footprints are less than X bases apart the footprints will be merged (Default: 6) + --window_length INT This parameter sets the length of a sliding window. (Default: 200) + --step INT This parameter sets the number of positions to slide the window forward. (Default: 100) + --percentage INT Threshold in percent (Default: 0) + --min_gap INT If footprints are less than X bases apart the footprints will be merged (Default: 6) Filter motifs: - --min_size_fp INT Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10) - --max_size_fp INT Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 200) - --tfbsscan_method [moods|fimo] Method used by tfbsscan. (Default: moods) + --min_size_fp INT Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10) + --max_size_fp INT Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 200) + --tfbsscan_method [moods|fimo] Method used by tfbsscan. (Default: moods) Cluster: Sequence preparation/ reduction: - --kmer INT K-mer length (Default: 10) - --aprox_motif_len INT Motif length (Default: 10) - --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. + --kmer INT K-mer length (Default: 10) + --aprox_motif_len INT Motif length (Default: 10) + --motif_occurrence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. --min_seq_length Interations Remove all sequences below this value. (Default: 10) Clustering: - --global INT Global (=1) or local (=0) alignment. (Default: 0) - --identity FLOAT Identity threshold. (Default: 0.8) - --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) - --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) - --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) - --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) + --global INT Global (=1) or local (=0) alignment. (Default: 0) + --identity FLOAT Identity threshold. (Default: 0.8) + --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) + --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) + --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) + --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) Motif estimation: - --min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100) - --motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8) - --motif_max_key INT Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20) - --iteration INT Number of iterations done by GLAM2. More Iterations: better results, higher runtime. (Default: 10000) - --tomtom_treshold FLOAT Threshold for similarity score. (Default: 0.01) - --best_motif INT Get the best X motifs per cluster. (Default: 3) - --gap_penalty INT Set penalty for gaps in GLAM2 (Default: 1000) + --min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100) + --motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8) + --motif_max_key INT Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20) + --iteration INT Number of iterations done by GLAM2. More Iterations: better results, higher runtime. (Default: 10000) + --tomtom_treshold FLOAT T hreshold for similarity score. (Default: 0.01) + --best_motif INT Get the best X motifs per cluster. (Default: 3) + --gap_penalty INT Set penalty for gaps in GLAM2 (Default: 1000) + --seed String Set seed for GLAM2 (Default: 123456789) Moitf clustering: - --cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Default: 0) - --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) + --cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0) + --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) Creating GTF: - --tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON - config + --tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON config + Evaluation: + --max_uropa_runs INT Maximum number UROPA runs running parallelized (Default: 10) All arguments can be set in the configuration files ``` diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index aa97c95..5097004 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -5,10 +5,10 @@ # desciption: This scripts uses bedtools to find new, non overlapping regions # between input data in BED-format and a group of known motifs in -# BED-format. +# BED-format. # For parameter description, run the script without parameters or -h. -# The output is a file with the filtered footprints and the log file compareBed.stats +# The output is a file with the filtered footprints and the log file filter_motifs.log # One R scripts is used, compareBed_runinfo.R, stored in the same directory. # Parameter values (file/directory paths) must not contain the | symbol, because it is chosen as "sed"-field separator. @@ -124,7 +124,7 @@ then echo " -o --output output path/file ; default dir current directory and filename is newMotifs.bed and newMotifs.bed.fasta" echo " -h --help shows this help message" echo "" - echo " Note: file paths must not contain the '|' pipe symbol" + echo " Note: file paths must not contain the '|' pipe symbol" exit 0 fi @@ -188,7 +188,7 @@ cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed # motiffiles either from a directory OR comma separated list if [ -d "$motifs" ] then - # creates an array of all files with bed in its name in the directory $motifs + # creates an array of all files with bed in its name in the directory $motifs declare -a motiffiles=(`ls $motifs | grep -i '.bed$' | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`) # the else case means, that the motiffiles were passed comma separated with no whitespace. @@ -264,9 +264,9 @@ fi # remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition # also creates a small output file with information about the comparison -Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.stats +Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/filter_motifs.log # check if Rscript executed without errors -if [ $? -gt 0 ] +if [ $? -gt 0 ] then exit 1 fi @@ -279,15 +279,15 @@ then # add initial number of footprints to the log file fp_initial=`cat $data | wc -l` fp_initial=`expr $fp_initial - 1` - echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats + echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/filter_motifs.log else # output will be overwritten if it exists rm -f $output # add initial number of footprints to the log file - cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats + cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/filter_motifs.log fi # add number of footprints after filtering to the log file -cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats +cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/filter_motifs.log # add fasta sequences to bed and create fasta file out_fasta=`echo $output | sed "s|.bed$|.fasta|g"` diff --git a/bin/2.2_motif_estimation/get_motif_seq.R b/bin/2.2_motif_estimation/get_motif_seq_id.R similarity index 94% rename from bin/2.2_motif_estimation/get_motif_seq.R rename to bin/2.2_motif_estimation/get_motif_seq_id.R index bbcaff6..ca07eb6 100644 --- a/bin/2.2_motif_estimation/get_motif_seq.R +++ b/bin/2.2_motif_estimation/get_motif_seq_id.R @@ -68,7 +68,7 @@ create_seq_json <- function(input, output, num, tmp_path, cluster_id) { # Create json file ## naming - names(datalist) <- paste0(c("Motif_", "Motif_", "Motif_"),seq(1,as.numeric(num),1) , " Cluster_", cluster_id) + names(datalist) <- paste0("Motif_", seq(1,as.numeric(num),1), " Cluster_", cluster_id) ## creating json object json <- RJSONIO::toJSON(datalist, pretty = T , .withNames = T) ## writing file diff --git a/bin/2.2_motif_estimation/merge_similar_clusters.R b/bin/2.2_motif_estimation/merge_similar_clusters.R index d9b2373..4cf8c47 100644 --- a/bin/2.2_motif_estimation/merge_similar_clusters.R +++ b/bin/2.2_motif_estimation/merge_similar_clusters.R @@ -1,5 +1,6 @@ #!/usr/bin/env Rscript if (!require(optparse)) install.packages("optparse"); library(optparse) +if (!require(igraph)) install.packages("igraph"); library(igraph) option_list <- list( make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from merged tomtom results", metavar = "character"), @@ -40,37 +41,39 @@ merge_similar <- function(tsv_in, file_list, min_weight){ # remove rows if column 1 is identical to column 2 edgelist <- sim_not_unique[query_cluster != target_cluster] - + unique_edgelist <- unique(edgelist) + # create graph from data.frame - g <- igraph::graph_from_edgelist(as.matrix(edgelist)) - # converting graph to adjacency matrix - adj_matrix <- igraph::get.adjacency(g, names = T) - # generating weighted graph from adjacency matrix - g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T) + g <- graph_from_edgelist(as.matrix(unique(edgelist)), directed = F) + g_tidy <- delete.vertices(simplify(g), degree(g) == 0) + g_tidy <- set_vertex_attr(g_tidy,"name",V(g_tidy), value = (sort(unique(c(unique_edgelist$V1,unique_edgelist$V2)))) - 1) - # get subgraphs from graph with edges of weight > min_weight - s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight > min_weight], del = F) + # plotting png('motif_clusters.png') - plot(s1) + plot(g_tidy) dev.off() - clust <- igraph::clusters(s1) + clust <- clusters(g) if (clust$no < 1) { b <- lapply(files, function(f){ system(paste("cat",f,">",basename(f))) }) } + clust_out <- file("cluster.txt") # merge FASTA-files depending on the clustered graphs - a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){ + a <- vapply(seq(from = 1, to = clust$no, by = 1), function(i){ #get vector with cluster ids, which are clustered together in "motif cluster" i cl <- as.vector(which(clust$membership %in% c(i))) #create string, which represents a list, containing all FASTA-files to be merged fasta_list <- paste(files[cl], collapse = " ") #create name for merged FASTA-file - name <- paste0("Cluster_",i,".fasta") + name <- paste0("Cluster_",i - 1 ,".fasta") #merge fasta files system(paste("cat",fasta_list,">",name)) - }) + paste("Cluster",i - 1,"is generated by merging cluster:",paste(cl - 1,collapse = ",")) + }, "") + writeLines(a[a != ""], con = clust_out) } + # run function merge_similar with given parameteres if not in interactive context (e.g. run from shell) diff --git a/bin/2.2_motif_estimation/png_to_pdf.R b/bin/2.2_motif_estimation/png_to_pdf.R new file mode 100644 index 0000000..4ea85b9 --- /dev/null +++ b/bin/2.2_motif_estimation/png_to_pdf.R @@ -0,0 +1,92 @@ +#!/usr/bin/env Rscript +if (!require(optparse)) install.packages("optparse"); library(optparse) +if (!require(png)) install.packages("png"); library(png) +if (!require(grid)) install.packages("grid"); library(grid) +if (!require(gridExtra)) install.packages("gridExtra"); library(gridExtra) + + +option_list <- list( + make_option(opt_str = c("-i", "--png_new"), default = NULL, help = "List of png paths separated by \',\'", metavar = "character"), + make_option(opt_str = c("-p", "--png_old"), default = NULL, help = "List of png paths separated by \',\'", metavar = "character"), + make_option(opt_str = c("-f", "--index"), default = NULL, help = "Path to index file", metavar = "character") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Generating a PDF from multiple png files", + epilogue = "Author: Rene Wiegandt ") + +opt <- parse_args(opt_parser) + + + +#' Generating a PDF from multiple png files. +#' +#' @parameter png_new List of png paths separated by , +#' @parameter png_old List of png paths separated by , +#' @parameter cluster_ids List of IDs separated by , +#' @parameter new_id New Cluster ID +#' +#' @author Rene Wiegandt +#' @contact rene.wiegandt(at)mpi-bn.mpg.de +png_to_pdf <- function(png_top, png_list, cluster_ids, new_id, out = "cluster.pdf"){ + #png_top <- "G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/04_glam2/Cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/04_glam2/Cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/04_glam2/Cluster_0/logo3.png" + #png_list <- "G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_0/logo3.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo1.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo2.png,G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/02_glam2/buenrostro50k_chr1_fp_cluster_1/logo3.png" + #cluster_ids <- c(1,2,3,4,5,6,7,8,9,10) + top_split <- unlist(strsplit(png_top, ",")) + png_split <- unlist(strsplit(png_list, ",")) + cluster_list <- unlist(strsplit(cluster_ids, ",")) + top_grob <- lapply(top_split,function(p){rasterGrob(readPNG(p),interpolate = FALSE)}) + grobs <- lapply(png_split,function(p){rasterGrob(readPNG(p),interpolate = FALSE)}) + + split_grobs <- split(grobs, rep(1:ceiling(length(grobs)/3), each = 3)[1:length(grobs)]) + + rows <- lapply(seq(1,length(split_grobs)), function(sg){ + arrangeGrob(grobs = split_grobs[[sg]], top = textGrob(paste0("Cluster ",cluster_list[sg]),gp = gpar(fontsize = 20,font = 3)), ncol = 3) + }) + split_rows <- split(rows, rep(1:ceiling(length(rows)/4), each = 4)[1:length(rows)]) + + pdf(out,width = 17, height = 11) + grid.arrange(grobs = top_grob, nrow = 3, top = textGrob(paste0("New Cluster ",new_id),gp = gpar(fontsize = 30,font = 3))) + lapply(split_rows, function(r){ + grid.arrange(grobs = r, nrow = 4, top = textGrob("Generated from...",gp = gpar(fontsize = 30,font = 3))) + }) + + dev.off() + +} +#### index_file <- "G://Rene.Wiegandt/10_Master/tmp/test_pipeline_11_3_19/2.2_motif_estimation/03_motif_cluster/03_merged_fasta/cluster.txt" + + +#' Parsing index file. +#' +#' @parameter index_file Path to index file. +#' +#' @author Rene Wiegandt +#' @contact rene.wiegandt(at)mpi-bn.mpg.de +get_index <- function(index_file){ + f <- data.table::fread(index_file, header = F) + index <- f[,c(2,8)] + index$V8 <- strsplit(index$V8,",") + return(index[unlist(lapply(index$V8, function(v){ifelse(length(v) > 1, TRUE, FALSE)}))]) + +} + +@TODO +png_to_pdf_set_up <- function(png_top, png_list, index_file){ + + index <- get_index(index_file) + t <- lapply(seq(nrow(index)), function(i){ + new_id <- index[i,1] + cluster_ids <- index[i,2] + out <- paste0("Summary_cluster_", new_id, ".pdf") + #regex_png <- paste0("*_cluster_", new_id , "/logo${NUM}.png") #???? + png_to_pdf(png_top = "TODO" ,png_list = "TODO" ,cluster_ids = cluster_ids, new_id = new_id, out = out ) + }) + +} + + +# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell) +if (!interactive()) { + png_to_pdf_set_up(opt$png_new, opt$png_old , opt$index) +} diff --git a/bin/3.2_evaluation/venn.R b/bin/3.2_evaluation/venn.R new file mode 100644 index 0000000..94d8cd2 --- /dev/null +++ b/bin/3.2_evaluation/venn.R @@ -0,0 +1,87 @@ +#!/usr/bin/env Rscript + +### master JLU 2018 Modul 3.2 Motif Evaluation ### +# author: afust +# capabilities: +# ratio: sequenced used to create motif vs sites found by rescan + +# dependencies +suppressMessages(library(data.table, quietly = TRUE)) +suppressMessages(library(VennDiagram, quietly = TRUE)) +suppressMessages(library(RJSONIO, quietly = TRUE)) +suppressMessages(library(getopt, quietly = TRUE)) + +# ---------------------------- options ----------------------------------------- # +# 0 flag +# 1 mandatory parameter +# 2 optional parameter +options(warn = 1) +options <- matrix(c( +# tfbs is either the directory including all accessible regions +# or the directory including precalculated score matrices etc +"cluster_dir", "i", 1, "character", "directory with rescan results", +"pattern", "p", 1, "character", "cluster pattern, e.g. 'Cluster_96'", +"seq_dir", "l", 1, "character", "directory with fp assignment per cluster", +"clustered_fp", "c", 1, "character", "clustering results", +"outdir", "o", 1, "character", "output directory", +"help", "h", 0, "logical", "Provides command line help" +), byrow = TRUE, ncol = 5) +opt <- getopt(options) + +# ---------------------------- options processing ----------------------------- # +# check for mandatory parameters and print help if required +if (!is.null(opt$help) ){ + cat(getopt(options, usage = TRUE), file = stderr()); q(status = 0) +} + +# load input + +# list used files: due to rescan script there is one output per motif for each cluster +# all should be used in one plot result -> list files with pattern +rescan_files <- list.files(opt$cluster_dir, pattern = opt$pattern, ignore.case = FALSE) +# list of fps representing a motif +fp_list <- fromJSON(opt$seq_dir) + +# clustering information needed to extract genomic locations of fps +# -> reduce to current cluster +dt.clustering <- fread(opt$clustered_fp, sep="\t", header = TRUE) +colnames(dt.clustering) <- sub("#chr", "chr", colnames(dt.clustering)) +cl <- as.numeric(strsplit(opt$pattern, "_")[[1]][2]) +dt.clustering <- dt.clustering[dt.clustering$cluster == cl, ] + +# pdf including venn diagrams for all motifs +pdf(paper="a4", file=file.path(opt$outdir, paste0("Cluster_",cl,".pdf")), useDingbats = FALSE) +plot.new() +for(f in rescan_files){ + # load genomic locations found by rescan with motif + dt.rescan <- fread(file.path(opt$cluster_dir,f), sep="\t", header = FALSE, select = c(1:4), stringsAsFactors=FALSE) + colnames(dt.rescan) <- c("chr", "start", "end", "name") + + # extract fps that build up current motif + tmp <- strsplit(as.character(dt.rescan[1,"name"]), "_")[[1]] + cluster <- paste(tmp[1],tmp[2],sep="_") + id <- paste(paste("Motif", tmp[3], sep="_"), cluster, sep=" ") + #cat("\nfor",id, "there are", length(unname(unlist(fp_list[id]))), "sequences assigned") + # reduce cluster info to sites used for current motif to find overlaps + dt.motif_seq <- dt.clustering[dt.clustering$name %in% unname(unlist(fp_list[id])),c("chr","start", "end")] + setkeyv(dt.rescan, c("chr","start","end")) + setkeyv(dt.motif_seq, c("chr","start","end")) + dt.overlaps <- foverlaps(dt.motif_seq, dt.rescan, mult="first") + dt.overlaps <- dt.overlaps[complete.cases(dt.overlaps),] + dt.overlaps <- dt.overlaps[!duplicated(dt.overlaps$i.start),] + # number of locations found in both: motif setup sequences and rescan + hits <- nrow(dt.overlaps) + #cat("\nnumbers:\nregions assignet to motif", nrow(dt.motif_seq), "\nregions found by rescan", nrow(dt.rescan), "\noverlaps", hits) + # draw venn + draw.pairwise.venn(area1 = nrow(dt.motif_seq), area2 = nrow(dt.rescan), cross.area = hits, + category = c(paste0("related motif sites of ",cluster), paste0("rescan with ", id)), + lty = rep("blank", 2), scaled = TRUE, + fill= c("blue", "red"), alpha = c(0.5,0.5), + cat.pos = c(0, 0), cat.dist = rep(0.025, 2) ) + title(id) + if(f != tail(rescan_files,1)){ + grid.newpage() + } +} + +invisible(dev.off()) diff --git a/config/cluster.config b/config/cluster.config index 1ba0b33..9ca8601 100644 --- a/config/cluster.config +++ b/config/cluster.config @@ -2,7 +2,7 @@ params{ //reduce_bed kmer=10 aprox_motif_len=10 - motif_occurence=1 + motif_occurrence=1 min_seq_length=10 //cdhit_wrapper diff --git a/config/create_gtf.config b/config/create_gtf.config index c38afaa..192b793 100644 --- a/config/create_gtf.config +++ b/config/create_gtf.config @@ -1,3 +1,3 @@ params{ - tissue="" + tissues="" } diff --git a/config/footprint_extraction.config b/config/footprint_extraction.config index 0661e2a..bcf54cf 100644 --- a/config/footprint_extraction.config +++ b/config/footprint_extraction.config @@ -2,5 +2,5 @@ params{ window_length = 200 step = 100 percentage = 0 - max_bp_between = 6 + gap_penalty = 6 } diff --git a/config/motif_estimation.config b/config/motif_estimation.config index 2445d12..e82c4c6 100644 --- a/config/motif_estimation.config +++ b/config/motif_estimation.config @@ -1,11 +1,12 @@ params { //bed_to_clustered_fasta - min_seq = 10 + min_seq = 100 //glam2 - motif_min_len = 8 - motif_max_len = 20 - interation = 10000 + motif_min_key = 8 + motif_max_key = 20 + iteration = 10000 + gap_penalty = 1000 //tomtom tomtom_treshold = 0.01 diff --git a/config/uropa.config b/config/uropa.config index 3c4a189..0b93f22 100644 --- a/config/uropa.config +++ b/config/uropa.config @@ -1,16 +1,8 @@ { -"queries":[ - {"feature":"", - "feature.anchor": "", - "distance": [ , ], - "strand":"", - "direction":"", - "internals":"", - "filter.attribute":"", - "attribute.value":"", - "show.attributes":"" } - ], -"priority": "", -"gtf": "placeholder_gtf", -"bed": "placeholder_bed" -} \ No newline at end of file + "queries":[ + {"feature":"gene","distance":[5000,1000], "show.attributes":"gene_id"}, + {"feature":["ctcf_binding_site","enhancer", "open_chromatin", "promoter", "promoter_flanking_region", "tf_binding_site"], "distance":[1000], "internals":"TRUE"} + ], + "gtf": "placeholder_gtf", + "bed": "placeholder_bed" +} diff --git a/demo/homo_sapiens.94.mainChr.gtf b/demo/homo_sapiens.94.mainChr.gtf new file mode 100644 index 0000000..e4bed46 Binary files /dev/null and b/demo/homo_sapiens.94.mainChr.gtf differ diff --git a/masterenv.yml b/masterenv.yml index dc19c1a..9c50870 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -6,7 +6,7 @@ dependencies: - numpy=1.15.4 - pybigWig - cd-hit - - jellyfish + - bioconda::jellyfish - r-base>=3.5.1 - r-data.table - r-pbapply @@ -24,3 +24,5 @@ dependencies: - seaborn - crossmap - r-bit64 + - uropa + - nextflow diff --git a/meme_suite.yml b/meme_suite.yml new file mode 100644 index 0000000..2ac242e --- /dev/null +++ b/meme_suite.yml @@ -0,0 +1,5 @@ + +name: meme_suite +dependencies: + - python=2.7 + - meme diff --git a/nextflow.config b/nextflow.config index 653afd6..c935ad0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,11 +1,11 @@ -wd = "" +wd = "/mnt/agnerds/Rene.Wiegandt/repo2/masterJLU2018" createTimeout = 40 -params.threads=60 //Parameter for for scripts! Not for nextflow processes. -params.config="${wd}/config/uropa.config" +params.threads = 10 //Parameter for for scripts! Not for nextflow processes. +params.config = "${wd}/config/uropa.config" env { path_bin = "${wd}/bin" - path_env = "${wd}/masterenv.yml" + meme_env = "${wd}/meme_suite.yml" } includeConfig "${wd}/config/footprint_extraction.config" diff --git a/pipeline.nf b/pipeline.nf index 2e60f35..2084666 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -1,5 +1,6 @@ #!/usr/bin/env nextflow +disable_mo_clu = 1 //setting default values params.bigwig="" @@ -9,7 +10,8 @@ params.config="" params.tfbs_path="" params.help = 0 - params.gtf_path="" + params.gtf_merged="" + params.gtf_annotation="" params.out = "./out/" //footprint_extraction @@ -24,19 +26,19 @@ params.tfbsscan_method = "moods" //clustering - //reduce_sequence - params.kmer=10 - params.aprox_motif_len=10 - params.motif_occurence=1 - params.min_seq_length=10 - - //cdhit_wrapper - params.global=0 - params.identity=0.8 - params.sequence_coverage=8 - params.memory=800 - params.throw_away_seq=9 - params.strand=0 + //reduce_sequence + params.kmer=10 + params.aprox_motif_len=10 + params.motif_occurrence=1 + params.min_seq_length=10 + + //cdhit_wrapper + params.global=0 + params.identity=0.8 + params.sequence_coverage=8 + params.memory=800 + params.throw_away_seq=9 + params.strand=0 //motif_estimation //bed_to_clustered_fasta @@ -47,6 +49,7 @@ params.motif_max_key = 20 // Maximum number of key positions (aligned columns) params.iteration = 10000 // Number of Iterations done by glam2. A high iteration number equals a more accurate result but with an higher runtime. params.gap_penalty = 1000 + params.seed = 123456789 //tomtom params.tomtom_treshold = 0.01 // threshold for similarity score. @@ -60,110 +63,152 @@ //creating_gtf params.organism="" - params.tissue="" + params.tissues="" -if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" != "0" ){ +//evaluation + params.max_uropa_runs = 10 + +/* +Checking for parameter input! +*/ +int_params = ["window_length", "step", "min_size_fp", "max_size_fp", "kmer", + "aprox_motif_len", "motif_occurrence", "min_seq_length", "global", + "sequence_coverage", "memory", "throw_away_seq", "strand", + "min_seq", "motif_min_key", "motif_max_key", "iteration", + "edge_weight", "best_motif", "min_gap", "gap_penalty", "edge_weight", + "threads", "max_uropa_runs"] +file_params = ["bigwig", "bed", "genome_fasta", "motif_db", "config", "gtf_annotation",] +all_params = int_params + file_params + ["organism" , "identity", "tfbsscan_method", + "percentage", "tomtom_treshold", "motif_similarity_thresh", "out", + "tissues", "gtf_merged", "cluster_motif", "tfbs_path", "help", "seed"] +req_params = file_params + ["organism"] + +valid_organism = ["hg38", "hg19", "mm9", "mm10"] +valid_tfbsscan_methods = ["moods","fimo"] +val_missing = false +send_help = false +missing_params = [] + +req_params.each { key -> + if (req_params.contains(key)){ + if (key == "gtf_annotation") { + if (params[key] == "" && params.gtf_merged == "") { + val_missing = true + missing_params.add("$key or gtf_merged") + } + } else { + if (params[key] == ""){ + val_missing = true + missing_params.add(key) + } + } + } +} + +if (val_missing){ + send_help = true + println("Error: Following required parameters are missing: $missing_params") +} + +if (send_help || "${params.help}" != "0") { log.info """ Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file] -Required arguments: - --bigwig Path to BigWig-file - --bed Path to BED-file - --genome_fasta Path to genome in FASTA-format - --motif_db Path to motif-database in MEME-format - --config Path to UROPA configuration file - --organism Input organism [hg38 | hg19 | mm9 | mm10] - --out Output Directory (Default: './out/') - -Optional arguments: - - --help [0|1] 1 to show this help message. (Default: 0) - --gtf_path Path to gtf-file. If path is set the process which creates a gtf-file is skipped. - --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will be skipped. - - Footprint extraction: - --window_length INT This parameter sets the length of a sliding window. (Default: 200) - --step INT This parameter sets the number of positions to slide the window forward. (Default: 100) - --percentage INT Threshold in percent (Default: 0) - --min_gap INT If footprints are less than X bases apart the footprints will be merged (Default: 6) - - Filter motifs: - --min_size_fp INT Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10) - --max_size_fp INT Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 200) - --tfbsscan_method [moods|fimo] Method used by tfbsscan. (Default: moods) - - Cluster: - Sequence preparation/ reduction: - --kmer INT K-mer length (Default: 10) - --aprox_motif_len INT Motif length (Default: 10) - --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. - --min_seq_length Interations Remove all sequences below this value. (Default: 10) - Clustering: - --global INT Global (=1) or local (=0) alignment. (Default: 0) - --identity FLOAT Identity threshold. (Default: 0.8) - --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) - --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) - --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) - --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) - - Motif estimation: - --min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100) - --motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8) - --motif_max_key INT Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20) - --iteration INT Number of iterations done by GLAM2. More Iterations: better results, higher runtime. (Default: 10000) - --tomtom_treshold FLOAT Threshold for similarity score. (Default: 0.01) - --best_motif INT Get the best X motifs per cluster. (Default: 3) - --gap_penalty INT Set penalty for gaps in GLAM2 (Default: 1000) - Moitf clustering: - --cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0) - --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) - --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) - - Creating GTF: - --tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON - config + Required arguments: + --bigwig Path to BigWig-file + --bed Path to BED-file + --genome_fasta Path to genome in FASTA-format + --motif_db Path to motif-database in MEME-format + --config Path to UROPA configuration file + --gtf_annotation Path to gtf annotation file + --organism Input organism [hg38 | hg19 | mm9 | mm10] + --out Output Directory (Default: './out/') + + Optional arguments: + + --help [0|1] 1 to show this help message. (Default: 0) + --gtf_merged Path to gtf-file. If path is set the process which creates a gtf-file is skipped. + --tfbs_path Path to directory with tfbsscan output. If given tfbsscan will be skipped. + + Footprint extraction: + --window_length INT This parameter sets the length of a sliding window. (Default: 200) + --step INT This parameter sets the number of positions to slide the window forward. (Default: 100) + --percentage INT Threshold in percent (Default: 0) + --min_gap INT If footprints are less than X bases apart the footprints will be merged (Default: 6) + + Filter motifs: + --min_size_fp INT Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10) + --max_size_fp INT Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 200) + --tfbsscan_method [moods|fimo] Method used by tfbsscan. (Default: moods) + + Cluster: + Sequence preparation/ reduction: + --kmer INT K-mer length (Default: 10) + --aprox_motif_len INT Motif length (Default: 10) + --motif_occurrence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. + --min_seq_length Interations Remove all sequences below this value. (Default: 10) + Clustering: + --global INT Global (=1) or local (=0) alignment. (Default: 0) + --identity FLOAT Identity threshold. (Default: 0.8) + --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) + --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) + --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) + --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) + + Motif estimation: + --min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100) + --motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8) + --motif_max_key INT Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20) + --iteration INT Number of iterations done by GLAM2. More Iterations: better results, higher runtime. (Default: 10000) + --tomtom_treshold FLOAT Threshold for similarity score. (Default: 0.01) + --best_motif INT Get the best X motifs per cluster. (Default: 3) + --gap_penalty INT Set penalty for gaps in GLAM2 (Default: 1000) + --seed Set seed for GLAM2 (Default: 123456789) + Moitf clustering: + --cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0) + --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) + --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) + + Creating GTF: + --tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON + config + Evaluation: + --max_uropa_runs INT Maximum number UROPA runs running parallelized (Default: 10) All arguments can be set in the configuration files - ``` """ -System.exit(2) + System.exit(2) } else { Channel.fromPath(params.bigwig).map {it -> [it.simpleName, it]}.set {bigwig_input} - Channel.fromPath(params.bed).into {bed_input; bed_for_tfbsscan} + Channel.fromPath(params.bed).into {bed_input; bed_for_tfbsscan; bed_for_rescan} Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2} Channel.fromPath(params.motif_db).into {db_for_motivscan; db_for_tomtom} Channel.fromPath(params.config).set {config} + if (params.gtf_annotation != ""){ + Channel.fromPath(params.gtf_annotation).set {gtf_to_merge} + } if (params.tfbs_path != ""){ known_tfbs = file(params.tfbs_path).toAbsolutePath() } } -/* -Checking for parameter input! -*/ -int_params = ["window_length", "step", "min_size_fp", "max_size_fp", "kmer", - "aprox_motif_len", "motif_occurence", "min_seq_length", "global", - "sequence_coverage", "memory", "throw_away_seq", "strand", - "min_seq", "motif_min_key", "motif_max_key", "iteration", - "edge_weight", "best_motif", "min_gap", "gap_penalty", "edge_weight"] -req_params = ["bigwig", "bed", "genome_fasta", "motif_db", "config"] - -valid_organism = ["hg38", "hg19", "mm9", "mm10"] -valid_tfbsscan_methods = ["moods","fimo"] - params.each { key, value -> + if (!(all_params.contains(key))){ + println("Warning: Parameter $key is unknown. Please check for typos or the parameter list!") + } if(int_params.contains(key)) { if (!("${value}" ==~ /\d+/ )){ println("ERROR: $key needs to be an Integer") System.exit(2) } } - if(req_params.contains(key)) { + if(file_params.contains(key) || (key == "gtf_merged" && value != "") ) { if(!file(value).exists()) { println("ERROR: $key not found. Please check the given path.") System.exit(2) } } } + if (!("${params.identity}" ==~ /^0\.[8-9][[0-9]*]?|^1(\.0)?/ )){ println("ERROR: --identity needs to be float in range 0.8 to 1.0") System.exit(2) @@ -205,7 +250,7 @@ process footprint_extraction { tag{name} publishDir "${params.out}/1.1_footprint_extraction/", mode: 'copy', pattern: '*.bed' - publishDir "${params.out}/1.1_footprint_extraction/log", mode: 'copy', pattern: '*.log' + publishDir "${params.out}/log", mode: 'copy', pattern: '*.log' input: set name, file (bigWig), file (bed) from footprint_in @@ -228,7 +273,8 @@ Find postitions of known tfbs with tfbsscan and discard the overlaps with compar */ process overlap_with_known_TFBS { //conda "${path_env}" - publishDir "${params.out}/1.2_filter_motifs", mode :'copy' + publishDir "${params.out}/1.2_filter_motifs", mode :'copy', pattern: "${name}_unknown.bed" + publishDir "${params.out}/log", mode: 'copy', pattern: '*.log' tag{name} errorStrategy 'finish' @@ -237,18 +283,25 @@ process overlap_with_known_TFBS { output: set name, file ("${name}_unknown.bed") into bed_for_reducing - file ('./known_tfbs/*.bed') optional true - file ('*.stats') + file ('*.log') script: if(params.tfbs_path == ""){ """ + if [[ ! -x "${path_bin}/1.2_filter_motifs/compareBed.sh" ]] + then + chmod +x ${path_bin}/1.2_filter_motifs/compareBed.sh + fi python ${path_bin}/1.2_filter_motifs/tfbsscan.py --use ${params.tfbsscan_method} --core ${params.threads} -m ${db} -g ${fasta} -o ./known_tfbs -b ${bed_peaks} ${path_bin}/1.2_filter_motifs/compareBed.sh --data ${bed_footprints} --motifs ./known_tfbs --fasta ${fasta} -o ${name}_unknown.bed -min ${params.min_size_fp} -max ${params.max_size_fp} cp -r ./known_tfbs/ ${params.out}/1.2_filter_motifs/ """ } else { """ + if [[ ! -x "${path_bin}/1.2_filter_motifs/compareBed.sh" ]] + then + chmod +x ${path_bin}/1.2_filter_motifs/compareBed.sh + fi ${path_bin}/1.2_filter_motifs/compareBed.sh --data ${bed_footprints} --motifs ${known_tfbs} --fasta ${fasta} -o ${name}_unknown.bed -min ${params.min_size_fp} -max ${params.max_size_fp} """ } @@ -265,7 +318,7 @@ process reduce_sequence { tag{name} publishDir "${params.out}/2.1_clustering/reduced_bed/", mode: 'copy', pattern: '*.bed' - publishDir "${params.out}/2.1_clustering/log", mode: 'copy', pattern: '*.log' + publishDir "${params.out}/log", mode: 'copy', pattern: '*.log' input: set name, file (bed) from bed_for_reducing @@ -276,7 +329,7 @@ process reduce_sequence { script: """ - Rscript ${path_bin}/2.1_clustering/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} --summary reduce_sequence.log + Rscript ${path_bin}/2.1_clustering/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurrence} -s ${params.min_seq_length} --summary reduce_sequence.log """ } @@ -288,13 +341,14 @@ process clustering { //conda "${path_env}" publishDir "${params.out}/2.1_clustering/", mode: 'copy', pattern: '*.bed' - publishDir "${params.out}/2.1_clustering/log", mode: 'copy', pattern: '*.log' + publishDir "${params.out}/log", mode: 'copy', pattern: '*.log' input: set name, file (bed) from bed_for_clustering output: set name, file ('*.bed') into bed_for_motif_esitmation + file ('*.bed') into bed_for_venn file ('*.log') script: @@ -310,21 +364,21 @@ Converting BED-File to one FASTA-File per cluster process bed_to_clustered_fasta { //conda "${path_env}" - publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy', pattern: '*.FASTA' - tag{name} + publishDir "${params.out}/2.2_motif_estimation/01_fasta/", mode: 'copy', pattern: '*.FASTA' + tag{name} - input: - set name, file (bed) from bed_for_motif_esitmation + input: + set name, file (bed) from bed_for_motif_esitmation - output: - file ('*.FASTA') into fasta_for_glam2 - file ('*.FASTA') into fasta_for_motif_cluster + output: + file ('*.FASTA') into fasta_for_glam2 + file ('*.FASTA') into fasta_for_motif_cluster file ('*.log') into log22 - script: - """ - Rscript ${path_bin}/2.2_motif_estimation/bed_to_fasta.R -i ${bed} -p ${name} -m ${params.min_seq} - """ + script: + """ + Rscript ${path_bin}/2.2_motif_estimation/bed_to_fasta.R -i ${bed} -p ${name} -m ${params.min_seq} + """ } @@ -339,35 +393,36 @@ Generating Motifs through alignment and scoring best local matches. */ process glam2 { - tag{name} - publishDir "${params.out}/2.2_motif_estimation/glam2/", mode: 'copy' - //conda "${path_env}" + tag{name} + publishDir "${params.out}/2.2_motif_estimation/02_glam2/", mode: 'copy' + conda "${meme_env}" - input: - set name, file (fasta) from fasta_for_glam2 + input: + set name, file (fasta) from fasta_for_glam2 output: - file("${name}/*.meme") into meme_to_merge - set name, file("${name}/*.meme") into meme_for_tomtom - set name, file("${name}/*.meme") into meme_to_search_in_merged - set name, file("${name}/*.meme") into meme_for_filter + file("${name}/*.meme") into meme_to_merge + set name, file("${name}/*.meme") into meme_for_tomtom, meme_to_search_in_merged, meme_for_filter set name, file("${name}/*.txt") into glam_for_seq + set name, file("${name}/logo[1-9]*.png") into motif_png file ('*') - script: - """ - glam2 n ${fasta} -O ./${name}/ -E ${params.gap_penalty} -J ${params.gap_penalty} -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} - """ + script: + """ + glam2 n ${fasta} -O ./${name}/ -E ${params.gap_penalty} -J ${params.gap_penalty} -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} -s ${params.seed} + """ } +//motif_png.toSortedList()).println() + /* Combining all MEME-files to one big MEME-file. The paths are sorted numerically depending on the cluster ID. */ process merge_meme { - publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_meme/", mode: 'copy' - //conda "${path_env}" + publishDir "${params.out}/2.2_motif_estimation/03_motif_cluster/", mode: 'copy' + // conda "/home/rwiegan/.conda/envs/meme" input: val (memelist) from meme_to_merge.toList() @@ -377,6 +432,7 @@ process merge_meme { when: params.cluster_motif == 1 + disable_mo_clu == 0 script: //sorting @@ -386,6 +442,7 @@ process merge_meme { //create list for bash meme_list = meme_sorted_full.toString().replaceAll(/\,|\[|\]/,"") """ + set +u; source activate meme; meme2meme ${meme_list} > merged_meme.meme """ } @@ -396,11 +453,11 @@ to_find_similar_motifs = meme_to_search_in_merged.combine(merged_meme) Running Tomtom on merged meme-files. Output table has the information which clusters are similar to each other. */ -process find_similar_motifs { +process compare_cluster { tag{name} - publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/cluster_similarity/", mode: 'copy' - //conda "${path_env}" + publishDir "${params.out}/2.2_motif_estimation/03_motif_cluster/01_tomtom/", mode: 'copy' + conda "${meme_env}" input: set name, file (meme) ,file (merged_meme) from to_find_similar_motifs @@ -423,7 +480,7 @@ Label first column of TSV-file with Cluster ID process label_cluster { tag{name} - //conda "${path_env}" + publishDir "${params.out}/2.2_motif_estimation/03_motif_cluster/02_labeled_tsv/", mode: 'copy' input: set name, file (tsv) from motif_similarity @@ -435,17 +492,19 @@ process label_cluster { params.cluster_motif == 1 script: + id = name.split('_')[-1].toInteger() + 1 """ - Rscript ${path_bin}/2.2_motif_estimation/label_cluster.R -i ${tsv} -o ${name}_labeled.tsv + tail -n +2 ${tsv} | awk -F \$'\t' 'BEGIN {OFS = FS}(\$1=\$1".${id}")' > ${name}_labeled.tsv """ } + /* Merging tsv files_for_merge_fasta */ process merge_labeled_tsv { - publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/03_motif_cluster/", mode: 'copy' input: val (tsvlist) from labeled_tsv.toSortedList { it.toString().tokenize('_')[-2] as Integer } @@ -459,7 +518,7 @@ process merge_labeled_tsv { script: tsvs = tsvlist.toString().replaceAll(/\,|\[|\]/,"") """ - echo -e 'Query_ID\tTarget_ID\tOptimal_offset\tp-value\tE-value\tq-value\tOverlap\tQuery_consensus\tTarget_consensus\tOrientation'> merged_labeled.tsv + echo -e 'Query_ID\tTarget_ID\tOptimal_offset\tp-value\tE-value\tq-value\tOverlap\tQuery_consensus\tTarget_consensus\tOrientation' > merged_labeled.tsv for i in $tsvs; do cat \$i >> merged_labeled.tsv done @@ -475,23 +534,23 @@ Merging FASTA-files of similar clusters process merge_fasta { //conda "${path_env}" - publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_fasta/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/03_motif_cluster/03_merged_fasta/", mode: 'copy' input: set file (motiv_sim), val (fasta_list) from files_for_merge_fasta output: file ('*.fasta') into motif_clustered_fasta_list - file('*.png') + file('*') - when: - params.cluster_motif == 1 + when: + params.cluster_motif == 1 script: - fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer } - fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"") + fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer } + fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"") """ - Rscript ${path_bin}/2.2_motif_estimation/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight} + Rscript ${path_bin}/2.2_motif_estimation/merge_similar_clusters.R -i ${motiv_sim} -l ${fastalist} -w ${params.edge_weight} """ } @@ -503,26 +562,25 @@ Run GLAM2 on emrged FASTA-files */ process clustered_glam2 { - publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/glam2/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/03_motif_cluster/04_glam2/", mode: 'copy' tag{name} - //conda "${path_env}" + conda "${meme_env}" input: file (fasta) from motif_clustered_fasta_flat output: - set name, file ("/${name}/*.meme") into clustered_meme_for_tomtom - set name, file ("/${name}/*.meme") into clustered_meme_for_filter - set name, file("/${name}/glam2.txt") into clust_glam_for_seq - file('*') + set name, file ("./${name}/glam2.meme") into clustered_meme_for_tomtom, clustered_meme_for_filter + set name, file("./${name}/glam2.txt") into clust_glam_for_seq + file('*') when: params.cluster_motif == 1 script: - name = fasta.getBaseName() + name = fasta.getBaseName() """ - glam2 n ${fasta} -O ./${name}/ -E ${params.gap_penalty} -J ${params.gap_penalty} -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} + glam2 n ${fasta} -O ./${name}/ -E ${params.gap_penalty} -J ${params.gap_penalty} -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} -s ${params.seed} """ } @@ -547,20 +605,19 @@ Tomtom searches motifs in databases. */ process tomtom { - tag{name} - publishDir "${params.out}/2.2_motif_estimation/tomtom/", mode: 'copy' - //conda "${path_env}" + tag{name} + publishDir "${params.out}/2.2_motif_estimation/04_tomtom/", mode: 'copy' - input: - set name, file (meme) from for_tomtom + input: + set name, file (meme) from for_tomtom - output: - set name, file ('*.tsv') into tsv_for_filter + output: + set name, file ('*.tsv') into tsv_for_filter - script: - """ - tomtom ${meme} ${params.motif_db} -thresh ${params.tomtom_treshold} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv - """ + script: + """ + tomtom ${meme} ${params.motif_db} -thresh ${params.tomtom_treshold} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv + """ } @@ -582,7 +639,7 @@ log22.combine(count_cluster).combine(count_cluster_before_filter ).set {log22_fi //Writes log file for 2.2_motif_estimation process write_log_for_motif_estimation { - publishDir "${params.out}/2.2_motif_estimation/log/", mode: 'copy' + publishDir "${params.out}/log/", mode: 'copy' input: set file (logfile), after_filter, before_filter from log22_final @@ -598,20 +655,20 @@ process write_log_for_motif_estimation { """ } + //If channel 'check' is empty print errormessage process check_for_unknown_motifs { - echo true - - input: - val x from check.ifEmpty('EMPTY') + echo true - when: - x == 'EMPTY' + input: + val x from check.ifEmpty('EMPTY') - """ - echo '>>> STOPPED: No unknown Motifs were found.' - """ + when: + x == 'EMPTY' + """ + echo '>>> STOPPED: No unknown Motifs were found.' + """ } @@ -620,13 +677,13 @@ process get_best_motif { //conda "${path_env}" tag{name} - publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/05_best_unknown_motifs/motifs/", mode: 'copy' input: set name, file(meme), file(tsv) from meme_for_scan output: - set name, file('*_best.meme') into best_motif + set name, file('*_best.meme') into best_motif, best_motif_for_venn script: cluster_id = name.split('_')[-1] @@ -639,56 +696,73 @@ process get_best_motif { /* Get the sequence names used to create best X Motifs as a JSON-file */ -process get_best_motif_seq { +process get_best_motif_seq_id { //conda "${path_env}" tag{name} - publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/motif_sequences/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/05_best_unknown_motifs/motif_sequence_ids/", mode: 'copy' input: set name, file (txt) from seq_to_json output: - file("${name}_seq.json") + set name, file("${name}_seq.json") into seq_id_for_venn script: cluster_id = name.split('_')[-1] """ - Rscript ${path_bin}/2.2_motif_estimation/get_motif_seq.R -i ${txt} -o ${name}_seq.json -n ${params.best_motif} -t ./motif_seq_tmp/cluster_${cluster_id} -c ${cluster_id} + Rscript ${path_bin}/2.2_motif_estimation/get_motif_seq_id.R -i ${txt} -o ${name}_seq.json -n ${params.best_motif} -t ./motif_seq_tmp/cluster_${cluster_id} -c ${cluster_id} """ } -best_motif.combine(fa_scan).set {files_for_genome_scan} +best_motif.combine(fa_scan).combine(bed_for_rescan).set {files_for_re_scan} /* -process genome_scan { +*/ +process re_scan { //conda "${path_env}" + tag{name} + publishDir "${params.out}/3.2_evaluation/rescan/${name}", mode: 'copy' input: - set name, file(meme), file(fasta) from files_for_genome_scan + set name, file(meme), file(fasta), file (peaks) from files_for_re_scan output: - file ('.bed') into bed_for_uropa, bed_for_cluster_quality + set name, val ('link') into link_re_scan_to_venn + file ("*.bed") into re_scan_for_uropa script: """ + mkdir -p ./tmp/re_scan/${name} + python ${path_bin}/1.2_filter_motifs/tfbsscan.py -m ${meme} -g ${fasta} -b ${peaks} -o . --cores 4 --use moods --resolve_overlaps + mkdir -p ${workflow.workDir}/tmp/re_scan_final + cp ./Cluster*.bed ${workflow.workDir}/tmp/re_scan_final """ } -process cluster_quality { +seq_id_for_venn.combine(link_re_scan_to_venn, by: 0).combine(bed_for_venn).set {for_venn} + +/* +*/ +process venn { + + tag{name} + publishDir "${params.out}/3.2_evaluation/venn", mode: 'copy' input: - file (bed) from bed_for_cluster_quality + set name, file(seq_ids), link, file (clustered_bed) from for_venn output: - file ('*.bed') into bed_for_final_filter + file ('*.pdf') script: + cluster_id = name.split('_')[-1] """ + Rscript ${path_bin}/3.2_evaluation/venn.R -i ${workflow.workDir}/tmp/re_scan_final/ -p Cluster_${cluster_id}_ -l ${seq_ids} -c ${clustered_bed} -o . """ -} */ +} process create_GTF { @@ -700,7 +774,7 @@ process create_GTF { file ('*.gtf') into gtf when: - params.gtf_path == "" + params.gtf_merged == "" script: """ @@ -708,60 +782,85 @@ process create_GTF { """ } -if (params.gtf_path == "") { - gtf_for_uropa = gtf -} else { - gtf_for_uropa = Channel.fromPath(params.gtf_path) +//TODO put operators in process mwerge_gtf +gtf_list = Channel.empty() +if (params.gtf_merged == "") { + gtf.combine(gtf_to_merge).set {gtf_list} } -/* -bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in} - - -// Create configuration file for UROPA. -// Takes template and replaces bed- and gtf-placeholders with actual paths. -process create_uropa_config { +process merge_gtf { - publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy' + stageInMode 'copy' + publishDir "${params.out}/3.1_create_gtf/", mode: 'copy' input: - set val(bed), val(gtf) from uropa_in.toList() - file (conf) from config + set file (gtf_reg), file (gtf_genes) from gtf_list output: - file ('uropa.config') into uropa_config + file ('merged.gtf') into merged_gtf + + when: + params.gtf_merged == "" script: """ - sed -- 's/placeholder_gtf/${gtf}/g; s/placeholder_bed/${bed}/g' ${conf} > uropa.config.final + cat ${gtf_genes} > merged.gtf + cat ${gtf_reg} >> merged.gtf | sort -k 1,1 -k 4,4n -k 5,5n """ } -process UROPA { +if (params.gtf_merged == "") { + gtf_for_uropa = merged_gtf +} else { + Channel.fromPath( params.gtf_merged ).set {gtf_for_uropa} +} + +re_scan_for_uropa.flatten().combine(gtf_for_uropa).combine(config).set {uropa_in} + +// Create configuration file for UROPA. +// Takes template and replaces bed- and gtf-placeholders with actual paths. +process create_uropa_config { + + tag{bed} + publishDir "${params.out}/3.2_evaluation/uropa/config", mode: 'copy' input: - file (config) from uropa_config + set val(bed), val (gtf), file (conf) from uropa_in output: - set file ("*_allhits.txt"), file ("*_finalhits.txt") into uropa_for_filter + set name, file ("${name}_uropa.config") into uropa_config script: + name = bed.getBaseName() """ + sed -- 's=placeholder_gtf=${gtf}=g; s=placeholder_bed=${bed}=g' ${conf} > ${name}_uropa.config """ } -process filter { +process UROPA { + + tag{config} + maxForks params.max_uropa_runs + publishDir "${params.out}/3.2_evaluation/uropa", mode: 'copy', pattern: "*.txt" + publishDir "${params.out}/3.2_evaluation/uropa/summary", mode: 'copy', pattern: "*.pdf" + publishDir "${params.out}/log/uropa/${name}", mode: 'copy', pattern: "*.log" input: + set name, file (config) from uropa_config output: + set file ("*_allhits.txt"), file ("*_finalhits.txt") into uropa_for_filter + file("*.pdf") + file("*.log") script: """ + uropa -i ${config} -l uropa.log -t 4 -p ${name} -s -d """ -} */ +} + workflow.onComplete { log.info""" @@ -774,4 +873,6 @@ workDir : ${workflow.workDir} exit status : ${workflow.exitStatus} Error report: ${workflow.errorReport ?: '-'} """ +f = file(workflow.workDir + '/tmp/re_scan_final') +f.deleteDir() }