From cc2325031a60709be32b05f512209e3cd5e1f7ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Tue, 18 Dec 2018 08:00:39 -0500 Subject: [PATCH 01/13] Bugfix in bed_to_fasta.R: Get last and second last instead of fixed indices --- bin/bed_to_fasta.R | 62 +++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R index 84910f3..8af8516 100644 --- a/bin/bed_to_fasta.R +++ b/bin/bed_to_fasta.R @@ -1,28 +1,34 @@ -#!/usr/bin/env Rscript - -# Splitting BED-files depending on their cluster. -# The Sequences of each cluster are writen as an FASTA-file. -# @parameter bedInput BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8 -# @parameter prefix prefix for filenames -# @parameter min_seq min. number of sequences per cluster - -args = commandArgs(trailingOnly = TRUE) - -bedInput <- args[1] -prefix <- args[2] -min_seq <- args[3] - -bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") - -clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Cluster column -discard <- lapply(1:length(clusters), function(i){ - clust <- as.data.frame(clusters[i]) - print(nrow(clust)) - if (nrow(clust) >= as.numeric(min_seq) ) { - sequences <- as.list(clust[[10]]) # <---- sequenze column - outfile <- paste0(prefix,"_cluster_",i,".FASTA") - seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column - } else { - print(paste0("Cluster: ",i," is to small")) - } -}) +#!/usr/bin/env Rscript + +#' Splitting BED-files depending on their cluster. +#' The Sequences of each cluster are writen as an FASTA-file. +#' @parameter bedInput BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8 +#' @parameter prefix prefix for filenames +#' @parameter min_seq min. number of sequences per cluster +#' +#' @author René Wiegandt +#' @contact rene.wiegandt(at)mpi-bn.mpg.de + +args = commandArgs(trailingOnly = TRUE) + +bedInput <- args[1] +prefix <- args[2] +min_seq <- args[3] + +bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") + +# Get last column of data.table, which refers to the cluster, as a vector. +cluster_no <- as.vector(bed[[ncol(bed)]]) + +clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) # <---- Cluster column + +discard <- lapply(1:length(clusters), function(i){ + clust <- as.data.frame(clusters[i]) + if (nrow(clust) >= as.numeric(min_seq) ) { + sequences <- as.list(clust[[ncol(clust) - 1]]) # <---- sequenze column + outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA") + seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column + } else { + print(paste0("Cluster: ",i," is to small")) + } +}) From 62f6f3f1e95ad6038567619a3f846203f1e92751 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Tue, 18 Dec 2018 08:10:13 -0500 Subject: [PATCH 02/13] bed_to_fasta.R: Improved documentation --- bin/bed_to_fasta.R | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R index 8af8516..dce3839 100644 --- a/bin/bed_to_fasta.R +++ b/bin/bed_to_fasta.R @@ -2,9 +2,10 @@ #' Splitting BED-files depending on their cluster. #' The Sequences of each cluster are writen as an FASTA-file. -#' @parameter bedInput BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8 -#' @parameter prefix prefix for filenames -#' @parameter min_seq min. number of sequences per cluster +#' @param bedInput BED-file with sequences and cluster-id as last two columns: +#' Sequence: second last column; Cluster ID: last column +#' @param prefix prefix for filenames +#' @param min_seq min. number of sequences per cluster #' #' @author René Wiegandt #' @contact rene.wiegandt(at)mpi-bn.mpg.de @@ -20,14 +21,20 @@ bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") # Get last column of data.table, which refers to the cluster, as a vector. cluster_no <- as.vector(bed[[ncol(bed)]]) -clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) # <---- Cluster column +# Split data.table bed on its last column (cluster_no) into list of data.frames +clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) +# For each data.frame(cluster) in list clusters: discard <- lapply(1:length(clusters), function(i){ clust <- as.data.frame(clusters[i]) + # Filter data.tables(clusters), which are to small if (nrow(clust) >= as.numeric(min_seq) ) { - sequences <- as.list(clust[[ncol(clust) - 1]]) # <---- sequenze column + # Get second last column, which contains the nucleotide sequences + sequences <- as.list(clust[[ncol(clust) - 1]]) + # Create filename outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA") - seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column + # Write fasta file + seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) } else { print(paste0("Cluster: ",i," is to small")) } From cf9dcd84983c4825d52b38c28cc7314c67b367ae Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Wed, 19 Dec 2018 12:16:13 +0100 Subject: [PATCH 03/13] bed_to_fasta.R: Imporved parametercalling with optparse --- bin/bed_to_fasta.R | 99 +++++++++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 41 deletions(-) diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R index dce3839..56ac9dc 100644 --- a/bin/bed_to_fasta.R +++ b/bin/bed_to_fasta.R @@ -1,41 +1,58 @@ -#!/usr/bin/env Rscript - -#' Splitting BED-files depending on their cluster. -#' The Sequences of each cluster are writen as an FASTA-file. -#' @param bedInput BED-file with sequences and cluster-id as last two columns: -#' Sequence: second last column; Cluster ID: last column -#' @param prefix prefix for filenames -#' @param min_seq min. number of sequences per cluster -#' -#' @author René Wiegandt -#' @contact rene.wiegandt(at)mpi-bn.mpg.de - -args = commandArgs(trailingOnly = TRUE) - -bedInput <- args[1] -prefix <- args[2] -min_seq <- args[3] - -bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") - -# Get last column of data.table, which refers to the cluster, as a vector. -cluster_no <- as.vector(bed[[ncol(bed)]]) - -# Split data.table bed on its last column (cluster_no) into list of data.frames -clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) - -# For each data.frame(cluster) in list clusters: -discard <- lapply(1:length(clusters), function(i){ - clust <- as.data.frame(clusters[i]) - # Filter data.tables(clusters), which are to small - if (nrow(clust) >= as.numeric(min_seq) ) { - # Get second last column, which contains the nucleotide sequences - sequences <- as.list(clust[[ncol(clust) - 1]]) - # Create filename - outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA") - # Write fasta file - seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) - } else { - print(paste0("Cluster: ",i," is to small")) - } -}) +#!/usr/bin/env Rscript +library("optparse") + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"), + make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"), + make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Convert BED-file to one FASTA-file per cluster") + +opt <- parse_args(opt_parser) + +#' Splitting BED-files depending on their cluster. +#' The Sequences of each cluster are writen as an FASTA-file. +#' @param bedInput BED-file with sequences and cluster-id as last two columns: +#' Sequence: second last column; Cluster ID: last column +#' @param prefix prefix for filenames +#' @param min_seq min. number of sequences per cluster +#' +#' @author René Wiegandt +#' @contact rene.wiegandt(at)mpi-bn.mpg.de +bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ + + if(is.null(bedInput)){ + stop("ERROR: Input parameter cannot be null! Please specify the input parameter.") + } + + bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") + + # Get last column of data.table, which refers to the cluster, as a vector. + cluster_no <- as.vector(bed[[ncol(bed)]]) + + # Split data.table bed on its last column (cluster_no) into list of data.frames + clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) + + # For each data.frame(cluster) in list clusters: + discard <- lapply(1:length(clusters), function(i){ + clust <- as.data.frame(clusters[i]) + # Filter data.tables(clusters), which are to small + if (nrow(clust) >= as.numeric(min_seq) ) { + # Get second last column, which contains the nucleotide sequences + sequences <- as.list(clust[[ncol(clust) - 1]]) + # Create filename + outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA") + # Write fasta file + seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) + } else { + print(paste0("Cluster: ",i," is to small")) + } + }) +} + +# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell) +if (!interactive()) { + bed_to_fasta(opt$input, opt$prefix, opt$min_seq) +} \ No newline at end of file From 6d5c604c2950e90ab7f12bd4822f8e2e5cafa0f9 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Wed, 19 Dec 2018 12:18:08 +0100 Subject: [PATCH 04/13] adaption of pipeline.nf to changes in bed_to_fasta.R --- pipeline.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index a39616a..463a790 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -283,7 +283,7 @@ process reduce_bed { """ Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} """ -} +}String /* @@ -324,7 +324,7 @@ process bed_to_clustered_fasta { script: """ - Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq} + Rscript ${path_bin}/bed_to_fasta.R -i ${bed} -p ${name} -m ${params.min_seq} """ } From ce5287174ca4ddee0e8bc54fca3eb6a18378b8a4 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Wed, 19 Dec 2018 12:47:00 +0100 Subject: [PATCH 05/13] Refactoring --- bin/bed_to_fasta.R | 2 +- pipeline.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R index 56ac9dc..bb4d3dc 100644 --- a/bin/bed_to_fasta.R +++ b/bin/bed_to_fasta.R @@ -55,4 +55,4 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ # run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell) if (!interactive()) { bed_to_fasta(opt$input, opt$prefix, opt$min_seq) -} \ No newline at end of file +} diff --git a/pipeline.nf b/pipeline.nf index 463a790..55b7c26 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -283,7 +283,7 @@ process reduce_bed { """ Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} """ -}String +} /* From 3c4f733eb627dce9c2e8566f617dbe7dbf58a1fd Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Wed, 19 Dec 2018 14:40:59 +0100 Subject: [PATCH 06/13] get_best_motif.py: fixed bug which caused to print motif header as last line + improved documentation --- bin/get_best_motif.py | 54 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 8 deletions(-) diff --git a/bin/get_best_motif.py b/bin/get_best_motif.py index cc24949..37eccc5 100644 --- a/bin/get_best_motif.py +++ b/bin/get_best_motif.py @@ -1,26 +1,64 @@ -# parses arguments using argparse -# @return args list of all parameters +''' +parses arguments using argparse +@return args list of all parameters +''' def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("meme", help="Path to meme file") + parser = argparse.ArgumentParser(description='A script to convert from GLAM2 output to MEME-format and parsing only the [num] first motifs from file to the output.') + parser.add_argument("meme", help="Path to 'meme' file generated by GLAM2") parser.add_argument("output", help="Output file") parser.add_argument("num", help="Number of motifs parsed from file") args = parser.parse_args() return args -# write lines of file till certain line (MOTIF + [num]) +''' +The script has to functions: + 1. Writing lines of file till certain line (MOTIF + [num]) + 2. Converting GLAM2 output to minimal meme-format +@params meme STING Path to 'meme' file generated from Meme suite +@parmas output STING Output file +@params num INT Number of motifs parsed from file + +@author René Wiegandt +@contact rene.wiegandt(at)mpi-bn.mpg.de +''' def main(): + args = parse_arguments() out = open(args.output, "w+") + + ''' + Create pattern where script should stop writing + For Example: + If num == 3, which means that you want the first/best 3 Motifs, the script + should stop writing lines to output if loop reaches line 'MOTIF 4' + ''' number = int(args.num) + 1 - motif = "MOTIF " + str(number) + break_header = "MOTIF " + str(number) + + # Pattern for motif header + pattern = re.compile("^MOTIF\s{2}(\d)+") + # Init count + count = 0 + with open(args.meme) as f: for line in f: - if motif in line: + ## do not write [count] lines after each header -> needed for meme-format + if count > 0: + count-=1 + continue + if pattern.match(line): + # if line is a motif header + count = 2 + ## + + if break_header in line: + # line matches breaking_header, e.g. 'MOTIF 4' break - out.write(line) + else: + out.write(line) if __name__ == "__main__": import argparse + import re main() From 8389226abaf3c6724b2978fc38bc08c9d06196b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 19 Dec 2018 15:34:43 -0500 Subject: [PATCH 07/13] Fixed typos in get_best_motif.py --- bin/get_best_motif.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/get_best_motif.py b/bin/get_best_motif.py index 37eccc5..a506bd8 100644 --- a/bin/get_best_motif.py +++ b/bin/get_best_motif.py @@ -11,11 +11,11 @@ def parse_arguments(): return args ''' -The script has to functions: +The script has two functions: 1. Writing lines of file till certain line (MOTIF + [num]) 2. Converting GLAM2 output to minimal meme-format -@params meme STING Path to 'meme' file generated from Meme suite -@parmas output STING Output file +@params meme STRING Path to 'meme' file generated from Meme suite +@parmas output STRING Output file @params num INT Number of motifs parsed from file @author René Wiegandt @@ -25,7 +25,7 @@ def main(): args = parse_arguments() out = open(args.output, "w+") - + ''' Create pattern where script should stop writing For Example: From 2fca1581a160844d89be2fda2aea322a1740b9b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 19 Dec 2018 15:35:32 -0500 Subject: [PATCH 08/13] Reads BED-files with or without header --- bin/bed_to_fasta.R | 116 ++++++++++++++++++++++----------------------- 1 file changed, 58 insertions(+), 58 deletions(-) diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R index bb4d3dc..73ea732 100644 --- a/bin/bed_to_fasta.R +++ b/bin/bed_to_fasta.R @@ -1,58 +1,58 @@ -#!/usr/bin/env Rscript -library("optparse") - -option_list <- list( - make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"), - make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"), - make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer") -) - -opt_parser <- OptionParser(option_list = option_list, - description = "Convert BED-file to one FASTA-file per cluster") - -opt <- parse_args(opt_parser) - -#' Splitting BED-files depending on their cluster. -#' The Sequences of each cluster are writen as an FASTA-file. -#' @param bedInput BED-file with sequences and cluster-id as last two columns: -#' Sequence: second last column; Cluster ID: last column -#' @param prefix prefix for filenames -#' @param min_seq min. number of sequences per cluster -#' -#' @author René Wiegandt -#' @contact rene.wiegandt(at)mpi-bn.mpg.de -bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ - - if(is.null(bedInput)){ - stop("ERROR: Input parameter cannot be null! Please specify the input parameter.") - } - - bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") - - # Get last column of data.table, which refers to the cluster, as a vector. - cluster_no <- as.vector(bed[[ncol(bed)]]) - - # Split data.table bed on its last column (cluster_no) into list of data.frames - clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) - - # For each data.frame(cluster) in list clusters: - discard <- lapply(1:length(clusters), function(i){ - clust <- as.data.frame(clusters[i]) - # Filter data.tables(clusters), which are to small - if (nrow(clust) >= as.numeric(min_seq) ) { - # Get second last column, which contains the nucleotide sequences - sequences <- as.list(clust[[ncol(clust) - 1]]) - # Create filename - outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA") - # Write fasta file - seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) - } else { - print(paste0("Cluster: ",i," is to small")) - } - }) -} - -# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell) -if (!interactive()) { - bed_to_fasta(opt$input, opt$prefix, opt$min_seq) -} +#!/usr/bin/env Rscript +library("optparse") + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"), + make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"), + make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Convert BED-file to one FASTA-file per cluster") + +opt <- parse_args(opt_parser) + +#' Splitting BED-files depending on their cluster. +#' The Sequences of each cluster are writen as an FASTA-file. +#' @param bedInput BED-file with sequences and cluster-id as last two columns: +#' Sequence: second last column; Cluster ID: last column +#' @param prefix prefix for filenames +#' @param min_seq min. number of sequences per cluster +#' +#' @author René Wiegandt +#' @contact rene.wiegandt(at)mpi-bn.mpg.de +bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ + + if (is.null(bedInput)) { + stop("ERROR: Input parameter cannot be null! Please specify the input parameter.") + } + + bed <- data.table::fread(bedInput, sep = "\t") + + # Get last column of data.table, which refers to the cluster, as a vector. + cluster_no <- as.vector(bed[[ncol(bed)]]) + + # Split data.table bed on its last column (cluster_no) into list of data.frames + clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) + + # For each data.frame(cluster) in list clusters: + discard <- lapply(1:length(clusters), function(i){ + clust <- as.data.frame(clusters[i]) + # Filter data.tables(clusters), which are to small + if (nrow(clust) >= as.numeric(min_seq) ) { + # Get second last column, which contains the nucleotide sequences + sequences <- as.list(clust[[ncol(clust) - 1]]) + # Create filename + outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA") + # Write fasta file + seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) + } else { + print(paste0("Cluster: ",i," is to small")) + } + }) +} + +# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell) +if (!interactive()) { + bed_to_fasta(opt$input, opt$prefix, opt$min_seq) +} From 4dea8e413781953175fdef9679a408058bd27269 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 20 Dec 2018 11:51:52 +0100 Subject: [PATCH 09/13] Imporved description for installation in README.md --- README.md | 50 +++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 1aa01e6..41ee001 100644 --- a/README.md +++ b/README.md @@ -7,38 +7,16 @@ For further information read the [documentation](https://github.molgen.mpg.de/lo ## Dependencies * [conda](https://conda.io/docs/user-guide/install/linux.html) * [Nextflow](https://www.nextflow.io/) -* [MEME-Suite](http://meme-suite.org/doc/install.html?man_type=web) ## Installation -Start with installing all dependencies listed above. It is required to set the [enviroment paths for meme-suite](http://meme-suite.org/doc/install.html?man_type=web#installingtar). -this can be done with following commands: -``` -export PATH=[meme-suite instalation path]/libexec/meme-[meme-suite version]:$PATH -export PATH=[meme-suite instalation path]/bin:$PATH -``` - +Start with installing all dependencies listed above (Nextflow, conda) and downloading all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018). -Download all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018). -The Nextflow-script needs a conda enviroment to run. Nextflow can create the needed enviroment from the given yaml-file. -On some systems Nextflow exits the run with following error: -``` -Caused by: - Failed to create Conda environment - command: conda env create --prefix --file env.yml - status : 143 - message: -``` -If this error occurs you have to create the enviroment before starting the pipeline. -To create this enviroment you need the yml-file from the repository. -Run the following commands to create the enviroment: -```console -path=[Path to given masterenv.yml file] -conda env create --name masterenv -f=$path -``` -When the enviroment is created, set the variable 'path_env' in the configuration file as the path to it. +Every other dependency will be automatically installed by Nextflow using conda. For that a new conda enviroment will be created, which can be found in the from Nextflow created work directory after the first pipeline run. +It is **not** required to create and activate the enviroment from the yaml-file beforehand. **Important Note:** For conda the channel bioconda needs to be set as highest priority! This is required due to two differnt packages with the same name in different channels. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfisch package from the channel conda-forge! + ## Quick Start ```console nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file] @@ -105,6 +83,24 @@ Optional arguments: All arguments can be set in the configuration files ``` +For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki) +## Known issues +The Nextflow-script needs a conda enviroment to run. Nextflow creates the needed enviroment from the given yaml-file. +On some systems Nextflow exits the run with following error: +``` +Caused by: + Failed to create Conda environment + command: conda env create --prefix --file env.yml + status : 143 + message: +``` +If this error occurs you have to create the enviroment before starting the pipeline. +To create this enviroment you need the yml-file from the repository. +Run the following commands to create the enviroment: +```console +path=[Path to given masterenv.yml file] +conda env create --name masterenv -f $path +``` +When the enviroment is created, set the variable 'path_env' in the configuration file as the path to it. -For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki) From 1a7a812b3f667a710438e05cac3d9f2b91ba4983 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 20 Dec 2018 11:58:11 +0100 Subject: [PATCH 10/13] Removed snakemake from yaml-file --- masterenv.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/masterenv.yml b/masterenv.yml index 211f4e1..a2f13d6 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -14,7 +14,6 @@ dependencies: - r-stringr - r-optparse - bioconductor-iranges - - snakemake - meme - moods - biopython From 4844609d709c67e353c4727d0247ee0287bedf55 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 20 Dec 2018 13:14:41 +0100 Subject: [PATCH 11/13] Set parameter organism as required wihtout an default value --- pipeline.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index 55b7c26..3d51f1b 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -55,10 +55,10 @@ params.best_motif = 3 // Top n motifs per cluster //creating_gtf - params.organism="hg38" + params.organism="" params.tissue="" -if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" != "0"){ +if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" != "0"){ log.info """ Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file] @@ -70,6 +70,7 @@ Required arguments: --config Path to UROPA configuration file --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored. Path can be set as tfbs_path in next run. (Default: './') + --organism Input organism [hg38 | hg19 | mm9 | mm10] --out Output Directory (Default: './out/') Optional arguments: @@ -115,7 +116,6 @@ Optional arguments: --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) Creating GTF: - --organism [hg38 | hg19 | mm9 | mm10] Input organism --tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON config All arguments can be set in the configuration files From 46cfc59119899bb65efadcc007e924707b97cb75 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 20 Dec 2018 13:39:13 +0100 Subject: [PATCH 12/13] Added Parameter gtf_path. If path is set process create_gtf will be skipped --- pipeline.nf | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index 3d51f1b..1be18be 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -9,6 +9,7 @@ params.tfbs_path="" params.create_known_tfbs_path = "./" params.help = 0 + params.get_path="" params.out = "./out/" //peak_calling @@ -68,8 +69,6 @@ Required arguments: --genome_fasta Path to genome in FASTA-format --motif_db Path to motif-database in MEME-format --config Path to UROPA configuration file - --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored. - Path can be set as tfbs_path in next run. (Default: './') --organism Input organism [hg38 | hg19 | mm9 | mm10] --out Output Directory (Default: './out/') @@ -77,6 +76,9 @@ Optional arguments: --help [0|1] 1 to show this help message. (Default: 0) --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. + --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored. + Path can be set as tfbs_path in next run. (Default: './') + --gtf_path Path to gtf-file. If path is set the process which creats a gtf-file is skipped. Footprint extraction: --window_length INT This parameter sets the length of a sliding window. (Default: 200) @@ -578,7 +580,10 @@ process create_GTF { publishDir "${params.out}/gtf/", mode: 'copy' output: - file ('*.gtf') into gtf_for_uropa + file ('*.gtf') into gtf + + when: + gtf_path == "" script: """ @@ -586,6 +591,12 @@ process create_GTF { """ } +if (gtf_path == "") { + gtf_for_uropa = gtf +} else { + gtf_for_uropa = Channel.fromPath(params.gtf_path) +} + /* bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in} From 5e462666c3e84c1173443d5b4f93470ee6adbb19 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 20 Dec 2018 14:11:14 +0100 Subject: [PATCH 13/13] Fixed typo in bed_to_fasta.R --- bin/bed_to_fasta.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R index 73ea732..e0ade14 100644 --- a/bin/bed_to_fasta.R +++ b/bin/bed_to_fasta.R @@ -13,7 +13,7 @@ opt_parser <- OptionParser(option_list = option_list, opt <- parse_args(opt_parser) #' Splitting BED-files depending on their cluster. -#' The Sequences of each cluster are writen as an FASTA-file. +#' The Sequences of each cluster are written as an FASTA-file. #' @param bedInput BED-file with sequences and cluster-id as last two columns: #' Sequence: second last column; Cluster ID: last column #' @param prefix prefix for filenames