From fe5ab42391b9ebfde7d2cc229ea0e59a03170972 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Tue, 8 Jan 2019 08:25:56 -0500 Subject: [PATCH 01/17] get_best_motif.py: Added alternative name to best_motif file --- bin/2.2_motif_estimation/get_best_motif.py | 25 +++++++++++++--------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/bin/2.2_motif_estimation/get_best_motif.py b/bin/2.2_motif_estimation/get_best_motif.py index a506bd8..20922b9 100644 --- a/bin/2.2_motif_estimation/get_best_motif.py +++ b/bin/2.2_motif_estimation/get_best_motif.py @@ -35,6 +35,9 @@ def main(): number = int(args.num) + 1 break_header = "MOTIF " + str(number) + # Get cluster_id + cluster_id = os.path.splitext(args.meme)[0].split('/')[-2].split('_')[-1] + # Pattern for motif header pattern = re.compile("^MOTIF\s{2}(\d)+") # Init count @@ -42,23 +45,25 @@ def main(): with open(args.meme) as f: for line in f: - ## do not write [count] lines after each header -> needed for meme-format - if count > 0: - count-=1 - continue - if pattern.match(line): - # if line is a motif header - count = 2 - ## - if break_header in line: # line matches breaking_header, e.g. 'MOTIF 4' break else: - out.write(line) + ## do not write [count] lines after each header -> needed for meme-format + if count > 0: + count-=1 + continue + if pattern.match(line): + # if line is a motif header + count = 2 + out.write(line.strip('\n') + " Cluster_" + cluster_id + '\n') + ## + else: + out.write(line) if __name__ == "__main__": import argparse import re + import os main() From 79b7e2476e39ac29e1274e90bee69876b62d33e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Tue, 8 Jan 2019 14:55:39 -0500 Subject: [PATCH 02/17] get_best_motif.py: removed whitespace from motif header --- bin/2.2_motif_estimation/get_best_motif.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/2.2_motif_estimation/get_best_motif.py b/bin/2.2_motif_estimation/get_best_motif.py index 20922b9..beacb1c 100644 --- a/bin/2.2_motif_estimation/get_best_motif.py +++ b/bin/2.2_motif_estimation/get_best_motif.py @@ -56,7 +56,7 @@ def main(): if pattern.match(line): # if line is a motif header count = 2 - out.write(line.strip('\n') + " Cluster_" + cluster_id + '\n') + out.write(line.strip('\n') + " Cluster_" + cluster_id + '\n') ## else: out.write(line) From 837734031c8b2b26f0212ba0ea90ce49cb090b27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Tue, 8 Jan 2019 14:56:16 -0500 Subject: [PATCH 03/17] added script get_motif_seq.R --- bin/2.2_motif_estimation/get_motif_seq.R | 86 ++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 bin/2.2_motif_estimation/get_motif_seq.R diff --git a/bin/2.2_motif_estimation/get_motif_seq.R b/bin/2.2_motif_estimation/get_motif_seq.R new file mode 100644 index 0000000..b79ac8d --- /dev/null +++ b/bin/2.2_motif_estimation/get_motif_seq.R @@ -0,0 +1,86 @@ +#!/usr/bin/env Rscript +if (!require(optparse, quietly = T)) install.packages("optparse"); library(optparse) + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input file. Output txt-file from GLAM2.", metavar = "character"), + make_option(opt_str = c("-o", "--output"), default = "sequences.json" , help = "Output JSON-file. Default = '%default'", metavar = "character"), + make_option(opt_str = c("-n", "--num"), default = 3 , help = "Get best (num) motifs. Default = '%default'", metavar = "numeric") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Creating JSON-file with sequence ids which were used to create the best (num) motifs.", + epilogue = "Author: Rene Wiegandt ") + +opt <- parse_args(opt_parser) + +#' Reading files with fread. +#' Only read the first column. +#' @param path Path to file +#' @return first column as vector +read_data <- function(path){ + + f <- data.table::fread(path, select = 1) + return(f[[1]]) +} + + +#' Creating JSON-file with sequence ids which were used to create the best (num) motifs. +#' +#' @param input Input file.Output txt-file from GLAM2. +#' @param output Output JSON-file +#' @param num Get best (num) motifs. +#' +#' @author René Wiegandt +create_seq_json <- function(input, output, num) { + + if (!file.exists(input)) { + stop(paste0("Input file does not exists. Please check the given path: ", input)) + } + + if ( !is.numeric(num)) { + stop("Parameter num needs to be an integer") + } + + if (num > 10 || num <= 0 ) { + stop(paste0("Parameter 'num' needs to be an number between 1 and 10! Your input: ", num)) + } + + # Getting cluster id + split_path <- unlist(strsplit(dirname(input),'_')) + cluster_id <- split_path[[length(split_path)]] + + if ( !varhandle::check.numeric(cluster_id)) { + stop(paste0("CLUSTER ID could not be found. Please make sure that your file path contains _[cluster_id] at the end. Found: ", cluster_id,"\n For example: /test_cluster_1/glam.txt")) + } + + file_dir <- dirname(input) + + # Split glam.txt file on lines that start with Score: + system(paste0("csplit ", input, " '/^Score:.*/' '{*}' -f ", file_dir, "/f_id_test.pholder")) + # Only keep the lines that start with 'f' to get the lines with the sequence ids + system(paste0("for i in ", file_dir, "/*.pholder0[1-", num, "];do grep \"^f\" $i > \"${i}.done\";done")) + + # Getting the filepaths of first 3 files with sequence ids + fnames <- file.path(file_dir,dir(file_dir, pattern = "done")) + + # Running read_data on files + datalist <- lapply(fnames, read_data) + + + # Create json file + ## naming + names(datalist) <- paste0(c("Motif_", "Motif_", "Motif_"),seq(1,as.numeric(num),1) , " Cluster_", cluster_id) + ## creating json object + json <- RJSONIO::toJSON(datalist, pretty = T , .withNames = T) + ## writing file + write(json, file = output ) +} + +# run function create_seq_json with given parameteres if not in interactive context (e.g. run from shell) +if (!interactive()) { + if (length(commandArgs(trailingOnly = TRUE)) <= 0) { + print_help(opt_parser) + } else { + create_seq_json(opt$input, opt$output, opt$num) + } +} From 3f9e4bce32cf700e1e5b69b8e6bc2776596c9930 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Tue, 8 Jan 2019 15:14:28 -0500 Subject: [PATCH 04/17] pipeline.nf: implemented get_motif_seq.R --- pipeline.nf | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index 93190a3..af27a1f 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -357,6 +357,7 @@ process glam2 { set name, file("${name}/*.meme") into meme_for_tomtom set name, file("${name}/*.meme") into meme_to_search_in_merged set name, file("${name}/*.meme") into meme_for_filter + set name, file("${name}/*.txt") into glam_for_seq file ('*') script: @@ -508,7 +509,7 @@ Run GLAM2 on emrged FASTA-files */ process clustered_glam2 { - publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/glam2/${name}/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/glam2/", mode: 'copy' tag{name} conda "${path_env}" @@ -516,8 +517,9 @@ process clustered_glam2 { file (fasta) from motif_clustered_fasta_flat output: - set name, file ('*.meme') into clustered_meme_for_tomtom - set name, file ('*.meme') into clustered_meme_for_filter + set name, file ("/${name}/*.meme") into clustered_meme_for_tomtom + set name, file ("/${name}/*.meme") into clustered_meme_for_filter + set name, file("/${name}/glam2.txt") into clust_glam_for_seq file('*') when: @@ -526,7 +528,7 @@ process clustered_glam2 { script: name = fasta.getBaseName() """ - glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} + glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} """ } @@ -537,9 +539,11 @@ If motif clustering is activated or not. if(params.cluster_motif == 1){ for_tomtom = clustered_meme_for_tomtom for_filter = clustered_meme_for_filter + seq_to_json = clust_glam_for_seq } else { for_tomtom = meme_for_tomtom for_filter = meme_for_filter + seq_to_json = glam_for_seq } @@ -594,7 +598,7 @@ process check_for_unknown_motifs { } -//Get the best(first) Motif from each MEME-file +//Get the best (first) X Motifs from each MEME-file process get_best_motif { conda "${path_env}" @@ -614,6 +618,25 @@ process get_best_motif { } +process get_best_motif_seq { + + conda "${path_env}" + tag{name} + publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/motif_sequences/", mode: 'copy' + + input: + set name, file(txt) from seq_to_json + + output: + file('*.json') + + script: + """ + Rscript ${path_bin}/2.2_motif_estimation/ get_motif_seq.R -i ${txt} -o ${name}_seq.json -n ${þærams.best_motif} + """ +} + + best_motif.combine(fa_scan).set {files_for_genome_scan} /* From 49ec389e943f6d10f88daabb4a32f7ab1654094e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Tue, 8 Jan 2019 15:30:36 -0500 Subject: [PATCH 05/17] added r packages: RJSONIO, varhandle to masterenv.yml --- masterenv.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/masterenv.yml b/masterenv.yml index 751964e..bd0f2ca 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -14,6 +14,8 @@ dependencies: - r-stringi - r-stringr - r-optparse + - r-RJSONIO + - r-varhandle - bioconductor-iranges - moods - biopython From d979e16e25877207654c7269c9461082ac7914cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 9 Jan 2019 08:23:28 -0500 Subject: [PATCH 06/17] get_best_motif.py: added parameter cluster id --- bin/2.2_motif_estimation/get_best_motif.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/2.2_motif_estimation/get_best_motif.py b/bin/2.2_motif_estimation/get_best_motif.py index beacb1c..7f9589d 100644 --- a/bin/2.2_motif_estimation/get_best_motif.py +++ b/bin/2.2_motif_estimation/get_best_motif.py @@ -7,6 +7,7 @@ def parse_arguments(): parser.add_argument("meme", help="Path to 'meme' file generated by GLAM2") parser.add_argument("output", help="Output file") parser.add_argument("num", help="Number of motifs parsed from file") + parser.add_argument("id", help="Cluster ID") args = parser.parse_args() return args @@ -36,7 +37,7 @@ def main(): break_header = "MOTIF " + str(number) # Get cluster_id - cluster_id = os.path.splitext(args.meme)[0].split('/')[-2].split('_')[-1] + cluster_id = args.id # Pattern for motif header pattern = re.compile("^MOTIF\s{2}(\d)+") From ea2e1710e418e9ef61f3542b33b7fa8f11ff5f48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 9 Jan 2019 08:24:12 -0500 Subject: [PATCH 07/17] get_motif_seq.R: added parameter tmp_path and cluster_id --- bin/2.2_motif_estimation/get_motif_seq.R | 42 ++++++++++++------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/bin/2.2_motif_estimation/get_motif_seq.R b/bin/2.2_motif_estimation/get_motif_seq.R index b79ac8d..6f298b3 100644 --- a/bin/2.2_motif_estimation/get_motif_seq.R +++ b/bin/2.2_motif_estimation/get_motif_seq.R @@ -4,7 +4,9 @@ if (!require(optparse, quietly = T)) install.packages("optparse"); library(optpa option_list <- list( make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input file. Output txt-file from GLAM2.", metavar = "character"), make_option(opt_str = c("-o", "--output"), default = "sequences.json" , help = "Output JSON-file. Default = '%default'", metavar = "character"), - make_option(opt_str = c("-n", "--num"), default = 3 , help = "Get best (num) motifs. Default = '%default'", metavar = "numeric") + make_option(opt_str = c("-n", "--num"), default = 3 , help = "Get best (num) motifs. Default = '%default'", metavar = "numeric"), + make_option(opt_str = c("-c", "--cluster_id"), default = "./" , help = "Cluster ID", metavar = "numeric"), + make_option(opt_str = c("-t", "--tmp"), default = "./" , help = "Path for tmp-files. Default = '%default'", metavar = "character") ) opt_parser <- OptionParser(option_list = option_list, @@ -18,55 +20,53 @@ opt <- parse_args(opt_parser) #' @param path Path to file #' @return first column as vector read_data <- function(path){ - + f <- data.table::fread(path, select = 1) return(f[[1]]) } #' Creating JSON-file with sequence ids which were used to create the best (num) motifs. -#' +#' #' @param input Input file.Output txt-file from GLAM2. #' @param output Output JSON-file #' @param num Get best (num) motifs. -#' -#' @author René Wiegandt -create_seq_json <- function(input, output, num) { - +#' +#' @author Ren� Wiegandt +create_seq_json <- function(input, output, num, tmp_path, cluster_id) { + if (!file.exists(input)) { stop(paste0("Input file does not exists. Please check the given path: ", input)) } - + if ( !is.numeric(num)) { stop("Parameter num needs to be an integer") } - + if (num > 10 || num <= 0 ) { stop(paste0("Parameter 'num' needs to be an number between 1 and 10! Your input: ", num)) } - # Getting cluster id - split_path <- unlist(strsplit(dirname(input),'_')) - cluster_id <- split_path[[length(split_path)]] - if ( !varhandle::check.numeric(cluster_id)) { stop(paste0("CLUSTER ID could not be found. Please make sure that your file path contains _[cluster_id] at the end. Found: ", cluster_id,"\n For example: /test_cluster_1/glam.txt")) } - - file_dir <- dirname(input) - + + dir.create(tmp_path, showWarnings = FALSE) + + file_dir <- tmp_path + # Split glam.txt file on lines that start with Score: system(paste0("csplit ", input, " '/^Score:.*/' '{*}' -f ", file_dir, "/f_id_test.pholder")) # Only keep the lines that start with 'f' to get the lines with the sequence ids system(paste0("for i in ", file_dir, "/*.pholder0[1-", num, "];do grep \"^f\" $i > \"${i}.done\";done")) - + # Getting the filepaths of first 3 files with sequence ids fnames <- file.path(file_dir,dir(file_dir, pattern = "done")) - + # Running read_data on files datalist <- lapply(fnames, read_data) - - + + # Create json file ## naming names(datalist) <- paste0(c("Motif_", "Motif_", "Motif_"),seq(1,as.numeric(num),1) , " Cluster_", cluster_id) @@ -81,6 +81,6 @@ if (!interactive()) { if (length(commandArgs(trailingOnly = TRUE)) <= 0) { print_help(opt_parser) } else { - create_seq_json(opt$input, opt$output, opt$num) + create_seq_json(opt$input, opt$output, opt$num, opt$tmp, opt$cluster_id) } } From 8e5049ea4092ecc5e0c62481f02ac38852128e5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 9 Jan 2019 08:25:34 -0500 Subject: [PATCH 08/17] pipeline.nf: adjusting to new parameters required by get_best_motif; get_motif_seq --- pipeline.nf | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index af27a1f..4ebab19 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -189,7 +189,7 @@ bigwig_input.combine(bed_input).set{footprint_in} This process uses the uncontinuous score from a bigWig file to estimate footpints within peaks of interest */ process footprint_extraction { - conda "${path_env}" + //conda "${path_env}" tag{name} publishDir "${params.out}/1.1_footprint_extraction/", mode: 'copy', pattern: '*.bed' @@ -214,7 +214,7 @@ for_tfbs = fa_overlap.combine(db_for_motivscan).combine(bed_for_tfbsscan) */ process extract_known_TFBS { - conda "${path_env}" + //conda "${path_env}" publishDir "${params.out}/1.2_filter_motifs/TFBSscan/", mode: 'copy', pattern: '*.bed' @@ -250,7 +250,7 @@ if(params.tfbs_path == "") { /* */ process overlap_with_known_TFBS { - conda "${path_env}" + //conda "${path_env}" publishDir "${params.out}/1.2_filter_motifs/compareBed/", mode :'copy' input: @@ -272,8 +272,8 @@ process overlap_with_known_TFBS { Reduce each sequence to its most conserved region. */ process reduce_sequence { - conda "${path_env}" - echo true + //conda "${path_env}" + publishDir "${params.out}/2.1_clustering/reduced_bed/", mode: 'copy' input: @@ -293,7 +293,7 @@ process reduce_sequence { Cluster all sequences. Appends a column with cluster numbers to the bed-file. */ process clustering { - conda "${path_env}" + //conda "${path_env}" echo true publishDir "${params.out}/2.1_clustering/", mode: 'copy', pattern: '*.bed' @@ -316,7 +316,7 @@ Converting BED-File to one FASTA-File per cluster */ process bed_to_clustered_fasta { - conda "${path_env}" + //conda "${path_env}" publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy' tag{name} @@ -347,7 +347,7 @@ process glam2 { tag{name} publishDir "${params.out}/2.2_motif_estimation/glam2/", mode: 'copy' - conda "${path_env}" + //conda "${path_env}" input: set name, file (fasta) from fasta_for_glam2 @@ -373,7 +373,7 @@ The paths are sorted numerically depending on the cluster ID. process merge_meme { publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_meme/", mode: 'copy' - conda "${path_env}" + //conda "${path_env}" input: val (memelist) from meme_to_merge.toList() @@ -406,7 +406,7 @@ process find_similar_motifs { tag{name} publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/cluster_similarity/", mode: 'copy' - conda "${path_env}" + //conda "${path_env}" input: set name, file (meme) ,file (merged_meme) from to_find_similar_motifs @@ -429,7 +429,7 @@ Label first column of TSV-file with Cluster ID process label_cluster { tag{name} - conda "${path_env}" + //conda "${path_env}" input: set name, file (tsv) from motif_similarity @@ -480,7 +480,7 @@ Merging FASTA-files of similar clusters */ process merge_fasta { - conda "${path_env}" + //conda "${path_env}" publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_fasta/", mode: 'copy' input: @@ -511,7 +511,7 @@ process clustered_glam2 { publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/glam2/", mode: 'copy' tag{name} - conda "${path_env}" + //conda "${path_env}" input: file (fasta) from motif_clustered_fasta_flat @@ -555,7 +555,7 @@ process tomtom { tag{name} publishDir "${params.out}/2.2_motif_estimation/tomtom/", mode: 'copy' - conda "${path_env}" + //conda "${path_env}" input: set name, file (meme) from for_tomtom @@ -601,7 +601,7 @@ process check_for_unknown_motifs { //Get the best (first) X Motifs from each MEME-file process get_best_motif { - conda "${path_env}" + //conda "${path_env}" tag{name} publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/", mode: 'copy' @@ -612,27 +612,29 @@ process get_best_motif { set name, file('*_best.meme') into best_motif script: + cluster_id = name.split('_')[-1] """ - python ${path_bin}/2.2_motif_estimation/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif} + python ${path_bin}/2.2_motif_estimation/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif} ${cluster_id} """ } process get_best_motif_seq { - conda "${path_env}" + //conda "${path_env}" tag{name} publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/motif_sequences/", mode: 'copy' input: - set name, file(txt) from seq_to_json + set name, file (txt) from seq_to_json output: - file('*.json') + file("${name}_seq.json") script: + cluster_id = name.split('_')[-1] """ - Rscript ${path_bin}/2.2_motif_estimation/ get_motif_seq.R -i ${txt} -o ${name}_seq.json -n ${þærams.best_motif} + Rscript ${path_bin}/2.2_motif_estimation/get_motif_seq.R -i ${txt} -o ${name}_seq.json -n ${params.best_motif} -t ${path_bin}/2.2_motif_estimation/tmp/cluster_${cluster_id} -c ${cluster_id} """ } @@ -641,7 +643,7 @@ best_motif.combine(fa_scan).set {files_for_genome_scan} /* process genome_scan { - conda "${path_env}" + //conda "${path_env}" input: set name, file(meme), file(fasta) from files_for_genome_scan @@ -670,7 +672,7 @@ process cluster_quality { process create_GTF { - conda "${path_env}" + //conda "${path_env}" publishDir "${params.out}/3.1_create_gtf/", mode: 'copy' From 14390450c3056a6db8b9c8fb11a6586679ee0d6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 9 Jan 2019 09:39:58 -0500 Subject: [PATCH 09/17] Added log file for part 2.2_motif_estimation --- bin/2.2_motif_estimation/bed_to_fasta.R | 17 +++++++++++++++- pipeline.nf | 27 ++++++++++++++++++++++--- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/bin/2.2_motif_estimation/bed_to_fasta.R b/bin/2.2_motif_estimation/bed_to_fasta.R index 7e716b7..7dbb36f 100644 --- a/bin/2.2_motif_estimation/bed_to_fasta.R +++ b/bin/2.2_motif_estimation/bed_to_fasta.R @@ -1,5 +1,5 @@ #!/usr/bin/env Rscript -if (!require(optparse)) install.packages("optparse"); library(optparse) +if (!require(optparse, quietly = T)) install.packages("optparse"); library(optparse) option_list <- list( make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster id.", metavar = "character"), @@ -36,6 +36,9 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ # Split data.table bed on its last column (cluster_no) into list of data.frames clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) + # Get number of all clusters + num_clusters <- length(clusters) + # For each data.frame(cluster) in list clusters: discard <- lapply(seq_len(length(clusters)), function(i){ clust <- as.data.frame(clusters[i]) @@ -47,10 +50,22 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA") # Write fasta file seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) + return(TRUE) } else { print(paste0("Cluster: ",i," is to small")) + return(FALSE) } }) + + # Get number of clusters which contain enought sequences + tmp <- unlist(discard) + count_clust <- length(tmp[tmp == FALSE]) + + # Write log-file + write(paste0( "------------------------------------------------------------\nfile: ", + bedInput,"\nNumber of all clusters: ", num_clusters, "\nRemoved small clusters ( < ", + min_seq," sequences ): " ,count_clust , "\nRemaining number of clusters: ", num_clusters - count_clust), + file = "bed_to_fasta.log", append = T) } # run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell) diff --git a/pipeline.nf b/pipeline.nf index 4ebab19..48b1455 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -200,6 +200,7 @@ process footprint_extraction { output: set name, file ('*.bed') into bed_for_overlap_with_TFBS + file('*.log') script: """ @@ -317,7 +318,7 @@ Converting BED-File to one FASTA-File per cluster process bed_to_clustered_fasta { //conda "${path_env}" - publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy', pattern: '*.FASTA' tag{name} input: @@ -326,6 +327,7 @@ process bed_to_clustered_fasta { output: file ('*.FASTA') into fasta_for_glam2 file ('*.FASTA') into fasta_for_motif_cluster + file ('*.log') into 22_log script: """ @@ -572,14 +574,33 @@ process tomtom { //Joining channels with meme and tsv files. Filter joined channel on line count. //Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel. -for_filter2 = for_filter.join( tsv_for_filter ) +for_filter.join( tsv_for_filter ).into {for_filter2; for_log} for_filter2 .filter { name, meme, tsv -> long count = tsv.readLines().size() count <= 1 } - .into { meme_for_scan; check } + .into { meme_for_scan; check; num_cluster } +count_cluster = num_cluster.count() +count_cluster_before_filter = for_log.count() + +process write_log_for_motif_estimation { + + publishDir "${params.out}/2.2_motif_estimation/log/", mode: 'copy' + + input: + file log, val (after_filter), val (before_filter) from 22_log.combine(count_cluster).combine(count_cluster_before_filter = for_log.count()) + + output: + file ('*.log') + + script: + removed = before_filter - after_filter + """ + printf "\nMotifs found in Database: ${removed}\nNumber of remaining unknown motifs/cluster${after_filter}" >> log + """ +} //If channel 'check' is empty print errormessage process check_for_unknown_motifs { From 0f4bf4694c967395116e2670c4aaee549eb52c6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 9 Jan 2019 14:24:23 -0500 Subject: [PATCH 10/17] added missing shebangs to 2.2 scripts #44 --- bin/2.2_motif_estimation/get_best_motif.py | 2 ++ bin/2.2_motif_estimation/get_motif_seq.R | 2 +- bin/2.2_motif_estimation/motif_estimation.nf | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/bin/2.2_motif_estimation/get_best_motif.py b/bin/2.2_motif_estimation/get_best_motif.py index 7f9589d..532a1f3 100644 --- a/bin/2.2_motif_estimation/get_best_motif.py +++ b/bin/2.2_motif_estimation/get_best_motif.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + ''' parses arguments using argparse @return args list of all parameters diff --git a/bin/2.2_motif_estimation/get_motif_seq.R b/bin/2.2_motif_estimation/get_motif_seq.R index 6f298b3..dda4bd3 100644 --- a/bin/2.2_motif_estimation/get_motif_seq.R +++ b/bin/2.2_motif_estimation/get_motif_seq.R @@ -32,7 +32,7 @@ read_data <- function(path){ #' @param output Output JSON-file #' @param num Get best (num) motifs. #' -#' @author Ren� Wiegandt +#' @author René Wiegandt create_seq_json <- function(input, output, num, tmp_path, cluster_id) { if (!file.exists(input)) { diff --git a/bin/2.2_motif_estimation/motif_estimation.nf b/bin/2.2_motif_estimation/motif_estimation.nf index 430040e..f0eb530 100644 --- a/bin/2.2_motif_estimation/motif_estimation.nf +++ b/bin/2.2_motif_estimation/motif_estimation.nf @@ -1,3 +1,4 @@ +#!/usr/bin/env nextflow params.bed = "" params.out = "" From c0a462be625151a9d2900d3884c15128246285aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 9 Jan 2019 14:25:14 -0500 Subject: [PATCH 11/17] pipeline.nf: adjusting to new parameter required by 2.1 scripts --- pipeline.nf | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index 48b1455..4e8708c 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -20,6 +20,7 @@ //filter_unknown_motifs params.min_size_fp=10 params.max_size_fp=100 + params.tfbsscan_method = "moods" //clustering //reduce_sequence @@ -88,6 +89,7 @@ Optional arguments: Filter unknown motifs: --min_size_fp INT Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10) --max_size_fp INT Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 100) + --tfbsscan_method [moods|fimo] Method used by tfbsscan. (Default: moods) Clustering: Sequence preparation/ reduction: @@ -143,6 +145,7 @@ int_params = ["window_length", "step", "min_size_fp", "max_size_fp", "kmer", req_params = ["bigwig", "bed", "genome_fasta", "motif_db", "config"] valid_organism = ["hg38", "hg19", "mm9", "mm10"] +valid_tfbsscan_methods = ["moods","fimo"] params.each { key, value -> if(int_params.contains(key)) { @@ -163,6 +166,11 @@ if (!("${params.identity}" ==~ /^0\.[8-9][[0-9]*]?|^1(\.0)?/ )){ System.exit(2) } +if (!valid_tfbsscan_methods.contains(params.tfbsscan_method)) { + println("ERROR: Invalid Method for tfbsscan! Valid methods: " + valid_tfbsscan_methods) + System.exit(2) +} + if (!valid_organism.contains(params.organism)) { println("ERROR: Invalid organism! Valid organisms: " + valid_organism) System.exit(2) @@ -230,7 +238,7 @@ process extract_known_TFBS { script: """ - python ${path_bin}/1.2_filter_motifs/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path} -b ${bed} + python ${path_bin}/1.2_filter_motifs/tfbsscan.py --use ${params.tfbsscan_method} --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path} -b ${bed} """ } @@ -275,17 +283,19 @@ Reduce each sequence to its most conserved region. process reduce_sequence { //conda "${path_env}" - publishDir "${params.out}/2.1_clustering/reduced_bed/", mode: 'copy' + publishDir "${params.out}/2.1_clustering/reduced_bed/", mode: 'copy', pattern: '*.bed' + publishDir "${params.out}/2.1_clustering//log", mode: 'copy', pattern: '*.log' input: set name, file (bed) from bed_for_reducing output: set name, file ('*.bed') into bed_for_clustering + file ('*.log') script: """ - Rscript ${path_bin}/2.1_clustering/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} + Rscript ${path_bin}/2.1_clustering/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} --summary reduce_sequence.log """ } @@ -295,19 +305,20 @@ Cluster all sequences. Appends a column with cluster numbers to the bed-file. */ process clustering { //conda "${path_env}" - echo true publishDir "${params.out}/2.1_clustering/", mode: 'copy', pattern: '*.bed' + publishDir "${params.out}/2.1_clustering//log", mode: 'copy', pattern: '*.log' input: set name, file (bed) from bed_for_clustering output: set name, file ('*.bed') into bed_for_motif_esitmation + file ('*.log') script: """ - Rscript ${path_bin}/2.1_clustering/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} + Rscript ${path_bin}/2.1_clustering/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} --summary reduce_sequence.log """ } @@ -327,7 +338,7 @@ process bed_to_clustered_fasta { output: file ('*.FASTA') into fasta_for_glam2 file ('*.FASTA') into fasta_for_motif_cluster - file ('*.log') into 22_log + file ('*.log') into log22 script: """ @@ -585,12 +596,14 @@ for_filter2 count_cluster = num_cluster.count() count_cluster_before_filter = for_log.count() +log22.combine(count_cluster).combine(count_cluster_before_filter ).set {log22_final} + process write_log_for_motif_estimation { publishDir "${params.out}/2.2_motif_estimation/log/", mode: 'copy' input: - file log, val (after_filter), val (before_filter) from 22_log.combine(count_cluster).combine(count_cluster_before_filter = for_log.count()) + set file (logfile), after_filter, before_filter from log22_final output: file ('*.log') @@ -598,7 +611,8 @@ process write_log_for_motif_estimation { script: removed = before_filter - after_filter """ - printf "\nMotifs found in Database: ${removed}\nNumber of remaining unknown motifs/cluster${after_filter}" >> log + cat ${logfile} > motif_estimation.log + printf "\nMotifs found in Database: $removed\nNumber of remaining unknown motifs/cluster: $after_filter" >> motif_estimation.log """ } From e62bb3674ff4f328a51132467156385b33ac583c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 9 Jan 2019 19:45:28 -0500 Subject: [PATCH 12/17] pipeline.nf: fixed typos --- pipeline.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index 4e8708c..74da2e1 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -284,7 +284,7 @@ process reduce_sequence { //conda "${path_env}" publishDir "${params.out}/2.1_clustering/reduced_bed/", mode: 'copy', pattern: '*.bed' - publishDir "${params.out}/2.1_clustering//log", mode: 'copy', pattern: '*.log' + publishDir "${params.out}/2.1_clustering/log", mode: 'copy', pattern: '*.log' input: set name, file (bed) from bed_for_reducing @@ -307,7 +307,7 @@ process clustering { //conda "${path_env}" publishDir "${params.out}/2.1_clustering/", mode: 'copy', pattern: '*.bed' - publishDir "${params.out}/2.1_clustering//log", mode: 'copy', pattern: '*.log' + publishDir "${params.out}/2.1_clustering/log", mode: 'copy', pattern: '*.log' input: set name, file (bed) from bed_for_clustering @@ -318,7 +318,7 @@ process clustering { script: """ - Rscript ${path_bin}/2.1_clustering/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} --summary reduce_sequence.log + Rscript ${path_bin}/2.1_clustering/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} --summary clustering.log """ } @@ -715,7 +715,7 @@ process create_GTF { file ('*.gtf') into gtf when: - params.gtf_path == "" + params.gtf_path == "kill" script: """ From f3b3db40d86559ab3a59cf695654d1986b9425d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Thu, 10 Jan 2019 03:55:50 -0500 Subject: [PATCH 13/17] get_best_motif.py: Removed additional whitespace in motif header --- bin/2.2_motif_estimation/get_best_motif.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/2.2_motif_estimation/get_best_motif.py b/bin/2.2_motif_estimation/get_best_motif.py index 532a1f3..a4c57bf 100644 --- a/bin/2.2_motif_estimation/get_best_motif.py +++ b/bin/2.2_motif_estimation/get_best_motif.py @@ -59,7 +59,7 @@ def main(): if pattern.match(line): # if line is a motif header count = 2 - out.write(line.strip('\n') + " Cluster_" + cluster_id + '\n') + out.write(line.strip('\n').replace(" "," ") + " Cluster_" + cluster_id + '\n') ## else: out.write(line) From 2d2f8cbfb7b2d4af40b718448dbc93b64f753b3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Thu, 10 Jan 2019 03:58:16 -0500 Subject: [PATCH 14/17] pipeline.nf: removed code for debuging --- pipeline.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.nf b/pipeline.nf index 74da2e1..6499a41 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -715,7 +715,7 @@ process create_GTF { file ('*.gtf') into gtf when: - params.gtf_path == "kill" + params.gtf_path == "" script: """ From 43c55aee173630cd0270fd664546a3ff4d565ea6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Thu, 10 Jan 2019 07:34:16 -0500 Subject: [PATCH 15/17] bed_to_fasta.R: changes lapply to vapply --- bin/2.2_motif_estimation/bed_to_fasta.R | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bin/2.2_motif_estimation/bed_to_fasta.R b/bin/2.2_motif_estimation/bed_to_fasta.R index 7dbb36f..322f01f 100644 --- a/bin/2.2_motif_estimation/bed_to_fasta.R +++ b/bin/2.2_motif_estimation/bed_to_fasta.R @@ -40,7 +40,7 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ num_clusters <- length(clusters) # For each data.frame(cluster) in list clusters: - discard <- lapply(seq_len(length(clusters)), function(i){ + discard <- vapply(seq_len(length(clusters)), function(i){ clust <- as.data.frame(clusters[i]) # Filter data.tables(clusters), which are to small if (nrow(clust) >= as.numeric(min_seq) ) { @@ -55,11 +55,10 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ print(paste0("Cluster: ",i," is to small")) return(FALSE) } - }) + }, FUN.VALUE=logical(1)) # Get number of clusters which contain enought sequences - tmp <- unlist(discard) - count_clust <- length(tmp[tmp == FALSE]) + count_clust <- length(discard[discard == FALSE]) # Write log-file write(paste0( "------------------------------------------------------------\nfile: ", From 310702eaf5ff39eb60db9aba7b25402cbb3eebc5 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 10 Jan 2019 13:54:12 +0100 Subject: [PATCH 16/17] Removed import os from get_best_motif.py --- bin/2.2_motif_estimation/get_best_motif.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/2.2_motif_estimation/get_best_motif.py b/bin/2.2_motif_estimation/get_best_motif.py index a4c57bf..21ddcbb 100644 --- a/bin/2.2_motif_estimation/get_best_motif.py +++ b/bin/2.2_motif_estimation/get_best_motif.py @@ -68,5 +68,4 @@ def main(): if __name__ == "__main__": import argparse import re - import os main() From b70a38b8c3fd531bbfcd1f534169f0764ee9863c Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 10 Jan 2019 13:55:10 +0100 Subject: [PATCH 17/17] Removed newline from get_motif_seq --- bin/2.2_motif_estimation/get_motif_seq.R | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/2.2_motif_estimation/get_motif_seq.R b/bin/2.2_motif_estimation/get_motif_seq.R index dda4bd3..acce6f2 100644 --- a/bin/2.2_motif_estimation/get_motif_seq.R +++ b/bin/2.2_motif_estimation/get_motif_seq.R @@ -66,7 +66,6 @@ create_seq_json <- function(input, output, num, tmp_path, cluster_id) { # Running read_data on files datalist <- lapply(fnames, read_data) - # Create json file ## naming names(datalist) <- paste0(c("Motif_", "Motif_", "Motif_"),seq(1,as.numeric(num),1) , " Cluster_", cluster_id)