diff --git a/bin/2.2_motif_estimation/bed_to_fasta.R b/bin/2.2_motif_estimation/bed_to_fasta.R index 7e716b7..322f01f 100644 --- a/bin/2.2_motif_estimation/bed_to_fasta.R +++ b/bin/2.2_motif_estimation/bed_to_fasta.R @@ -1,5 +1,5 @@ #!/usr/bin/env Rscript -if (!require(optparse)) install.packages("optparse"); library(optparse) +if (!require(optparse, quietly = T)) install.packages("optparse"); library(optparse) option_list <- list( make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster id.", metavar = "character"), @@ -36,8 +36,11 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ # Split data.table bed on its last column (cluster_no) into list of data.frames clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) + # Get number of all clusters + num_clusters <- length(clusters) + # For each data.frame(cluster) in list clusters: - discard <- lapply(seq_len(length(clusters)), function(i){ + discard <- vapply(seq_len(length(clusters)), function(i){ clust <- as.data.frame(clusters[i]) # Filter data.tables(clusters), which are to small if (nrow(clust) >= as.numeric(min_seq) ) { @@ -47,10 +50,21 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA") # Write fasta file seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) + return(TRUE) } else { print(paste0("Cluster: ",i," is to small")) + return(FALSE) } - }) + }, FUN.VALUE=logical(1)) + + # Get number of clusters which contain enought sequences + count_clust <- length(discard[discard == FALSE]) + + # Write log-file + write(paste0( "------------------------------------------------------------\nfile: ", + bedInput,"\nNumber of all clusters: ", num_clusters, "\nRemoved small clusters ( < ", + min_seq," sequences ): " ,count_clust , "\nRemaining number of clusters: ", num_clusters - count_clust), + file = "bed_to_fasta.log", append = T) } # run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell) diff --git a/bin/2.2_motif_estimation/get_best_motif.py b/bin/2.2_motif_estimation/get_best_motif.py index a506bd8..21ddcbb 100644 --- a/bin/2.2_motif_estimation/get_best_motif.py +++ b/bin/2.2_motif_estimation/get_best_motif.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + ''' parses arguments using argparse @return args list of all parameters @@ -7,6 +9,7 @@ def parse_arguments(): parser.add_argument("meme", help="Path to 'meme' file generated by GLAM2") parser.add_argument("output", help="Output file") parser.add_argument("num", help="Number of motifs parsed from file") + parser.add_argument("id", help="Cluster ID") args = parser.parse_args() return args @@ -35,6 +38,9 @@ def main(): number = int(args.num) + 1 break_header = "MOTIF " + str(number) + # Get cluster_id + cluster_id = args.id + # Pattern for motif header pattern = re.compile("^MOTIF\s{2}(\d)+") # Init count @@ -42,20 +48,21 @@ def main(): with open(args.meme) as f: for line in f: - ## do not write [count] lines after each header -> needed for meme-format - if count > 0: - count-=1 - continue - if pattern.match(line): - # if line is a motif header - count = 2 - ## - if break_header in line: # line matches breaking_header, e.g. 'MOTIF 4' break else: - out.write(line) + ## do not write [count] lines after each header -> needed for meme-format + if count > 0: + count-=1 + continue + if pattern.match(line): + # if line is a motif header + count = 2 + out.write(line.strip('\n').replace(" "," ") + " Cluster_" + cluster_id + '\n') + ## + else: + out.write(line) if __name__ == "__main__": diff --git a/bin/2.2_motif_estimation/get_motif_seq.R b/bin/2.2_motif_estimation/get_motif_seq.R new file mode 100644 index 0000000..acce6f2 --- /dev/null +++ b/bin/2.2_motif_estimation/get_motif_seq.R @@ -0,0 +1,85 @@ +#!/usr/bin/env Rscript +if (!require(optparse, quietly = T)) install.packages("optparse"); library(optparse) + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input file. Output txt-file from GLAM2.", metavar = "character"), + make_option(opt_str = c("-o", "--output"), default = "sequences.json" , help = "Output JSON-file. Default = '%default'", metavar = "character"), + make_option(opt_str = c("-n", "--num"), default = 3 , help = "Get best (num) motifs. Default = '%default'", metavar = "numeric"), + make_option(opt_str = c("-c", "--cluster_id"), default = "./" , help = "Cluster ID", metavar = "numeric"), + make_option(opt_str = c("-t", "--tmp"), default = "./" , help = "Path for tmp-files. Default = '%default'", metavar = "character") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Creating JSON-file with sequence ids which were used to create the best (num) motifs.", + epilogue = "Author: Rene Wiegandt ") + +opt <- parse_args(opt_parser) + +#' Reading files with fread. +#' Only read the first column. +#' @param path Path to file +#' @return first column as vector +read_data <- function(path){ + + f <- data.table::fread(path, select = 1) + return(f[[1]]) +} + + +#' Creating JSON-file with sequence ids which were used to create the best (num) motifs. +#' +#' @param input Input file.Output txt-file from GLAM2. +#' @param output Output JSON-file +#' @param num Get best (num) motifs. +#' +#' @author René Wiegandt +create_seq_json <- function(input, output, num, tmp_path, cluster_id) { + + if (!file.exists(input)) { + stop(paste0("Input file does not exists. Please check the given path: ", input)) + } + + if ( !is.numeric(num)) { + stop("Parameter num needs to be an integer") + } + + if (num > 10 || num <= 0 ) { + stop(paste0("Parameter 'num' needs to be an number between 1 and 10! Your input: ", num)) + } + + if ( !varhandle::check.numeric(cluster_id)) { + stop(paste0("CLUSTER ID could not be found. Please make sure that your file path contains _[cluster_id] at the end. Found: ", cluster_id,"\n For example: /test_cluster_1/glam.txt")) + } + + dir.create(tmp_path, showWarnings = FALSE) + + file_dir <- tmp_path + + # Split glam.txt file on lines that start with Score: + system(paste0("csplit ", input, " '/^Score:.*/' '{*}' -f ", file_dir, "/f_id_test.pholder")) + # Only keep the lines that start with 'f' to get the lines with the sequence ids + system(paste0("for i in ", file_dir, "/*.pholder0[1-", num, "];do grep \"^f\" $i > \"${i}.done\";done")) + + # Getting the filepaths of first 3 files with sequence ids + fnames <- file.path(file_dir,dir(file_dir, pattern = "done")) + + # Running read_data on files + datalist <- lapply(fnames, read_data) + + # Create json file + ## naming + names(datalist) <- paste0(c("Motif_", "Motif_", "Motif_"),seq(1,as.numeric(num),1) , " Cluster_", cluster_id) + ## creating json object + json <- RJSONIO::toJSON(datalist, pretty = T , .withNames = T) + ## writing file + write(json, file = output ) +} + +# run function create_seq_json with given parameteres if not in interactive context (e.g. run from shell) +if (!interactive()) { + if (length(commandArgs(trailingOnly = TRUE)) <= 0) { + print_help(opt_parser) + } else { + create_seq_json(opt$input, opt$output, opt$num, opt$tmp, opt$cluster_id) + } +} diff --git a/bin/2.2_motif_estimation/motif_estimation.nf b/bin/2.2_motif_estimation/motif_estimation.nf index 430040e..f0eb530 100644 --- a/bin/2.2_motif_estimation/motif_estimation.nf +++ b/bin/2.2_motif_estimation/motif_estimation.nf @@ -1,3 +1,4 @@ +#!/usr/bin/env nextflow params.bed = "" params.out = "" diff --git a/masterenv.yml b/masterenv.yml index 751964e..bd0f2ca 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -14,6 +14,8 @@ dependencies: - r-stringi - r-stringr - r-optparse + - r-RJSONIO + - r-varhandle - bioconductor-iranges - moods - biopython diff --git a/pipeline.nf b/pipeline.nf index 93190a3..6499a41 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -20,6 +20,7 @@ //filter_unknown_motifs params.min_size_fp=10 params.max_size_fp=100 + params.tfbsscan_method = "moods" //clustering //reduce_sequence @@ -88,6 +89,7 @@ Optional arguments: Filter unknown motifs: --min_size_fp INT Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10) --max_size_fp INT Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 100) + --tfbsscan_method [moods|fimo] Method used by tfbsscan. (Default: moods) Clustering: Sequence preparation/ reduction: @@ -143,6 +145,7 @@ int_params = ["window_length", "step", "min_size_fp", "max_size_fp", "kmer", req_params = ["bigwig", "bed", "genome_fasta", "motif_db", "config"] valid_organism = ["hg38", "hg19", "mm9", "mm10"] +valid_tfbsscan_methods = ["moods","fimo"] params.each { key, value -> if(int_params.contains(key)) { @@ -163,6 +166,11 @@ if (!("${params.identity}" ==~ /^0\.[8-9][[0-9]*]?|^1(\.0)?/ )){ System.exit(2) } +if (!valid_tfbsscan_methods.contains(params.tfbsscan_method)) { + println("ERROR: Invalid Method for tfbsscan! Valid methods: " + valid_tfbsscan_methods) + System.exit(2) +} + if (!valid_organism.contains(params.organism)) { println("ERROR: Invalid organism! Valid organisms: " + valid_organism) System.exit(2) @@ -189,7 +197,7 @@ bigwig_input.combine(bed_input).set{footprint_in} This process uses the uncontinuous score from a bigWig file to estimate footpints within peaks of interest */ process footprint_extraction { - conda "${path_env}" + //conda "${path_env}" tag{name} publishDir "${params.out}/1.1_footprint_extraction/", mode: 'copy', pattern: '*.bed' @@ -200,6 +208,7 @@ process footprint_extraction { output: set name, file ('*.bed') into bed_for_overlap_with_TFBS + file('*.log') script: """ @@ -214,7 +223,7 @@ for_tfbs = fa_overlap.combine(db_for_motivscan).combine(bed_for_tfbsscan) */ process extract_known_TFBS { - conda "${path_env}" + //conda "${path_env}" publishDir "${params.out}/1.2_filter_motifs/TFBSscan/", mode: 'copy', pattern: '*.bed' @@ -229,7 +238,7 @@ process extract_known_TFBS { script: """ - python ${path_bin}/1.2_filter_motifs/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path} -b ${bed} + python ${path_bin}/1.2_filter_motifs/tfbsscan.py --use ${params.tfbsscan_method} --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path} -b ${bed} """ } @@ -250,7 +259,7 @@ if(params.tfbs_path == "") { /* */ process overlap_with_known_TFBS { - conda "${path_env}" + //conda "${path_env}" publishDir "${params.out}/1.2_filter_motifs/compareBed/", mode :'copy' input: @@ -272,19 +281,21 @@ process overlap_with_known_TFBS { Reduce each sequence to its most conserved region. */ process reduce_sequence { - conda "${path_env}" - echo true - publishDir "${params.out}/2.1_clustering/reduced_bed/", mode: 'copy' + //conda "${path_env}" + + publishDir "${params.out}/2.1_clustering/reduced_bed/", mode: 'copy', pattern: '*.bed' + publishDir "${params.out}/2.1_clustering/log", mode: 'copy', pattern: '*.log' input: set name, file (bed) from bed_for_reducing output: set name, file ('*.bed') into bed_for_clustering + file ('*.log') script: """ - Rscript ${path_bin}/2.1_clustering/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} + Rscript ${path_bin}/2.1_clustering/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} --summary reduce_sequence.log """ } @@ -293,20 +304,21 @@ process reduce_sequence { Cluster all sequences. Appends a column with cluster numbers to the bed-file. */ process clustering { - conda "${path_env}" - echo true + //conda "${path_env}" publishDir "${params.out}/2.1_clustering/", mode: 'copy', pattern: '*.bed' + publishDir "${params.out}/2.1_clustering/log", mode: 'copy', pattern: '*.log' input: set name, file (bed) from bed_for_clustering output: set name, file ('*.bed') into bed_for_motif_esitmation + file ('*.log') script: """ - Rscript ${path_bin}/2.1_clustering/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} + Rscript ${path_bin}/2.1_clustering/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} --summary clustering.log """ } @@ -316,8 +328,8 @@ Converting BED-File to one FASTA-File per cluster */ process bed_to_clustered_fasta { - conda "${path_env}" - publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy' + //conda "${path_env}" + publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy', pattern: '*.FASTA' tag{name} input: @@ -326,6 +338,7 @@ process bed_to_clustered_fasta { output: file ('*.FASTA') into fasta_for_glam2 file ('*.FASTA') into fasta_for_motif_cluster + file ('*.log') into log22 script: """ @@ -347,7 +360,7 @@ process glam2 { tag{name} publishDir "${params.out}/2.2_motif_estimation/glam2/", mode: 'copy' - conda "${path_env}" + //conda "${path_env}" input: set name, file (fasta) from fasta_for_glam2 @@ -357,6 +370,7 @@ process glam2 { set name, file("${name}/*.meme") into meme_for_tomtom set name, file("${name}/*.meme") into meme_to_search_in_merged set name, file("${name}/*.meme") into meme_for_filter + set name, file("${name}/*.txt") into glam_for_seq file ('*') script: @@ -372,7 +386,7 @@ The paths are sorted numerically depending on the cluster ID. process merge_meme { publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_meme/", mode: 'copy' - conda "${path_env}" + //conda "${path_env}" input: val (memelist) from meme_to_merge.toList() @@ -405,7 +419,7 @@ process find_similar_motifs { tag{name} publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/cluster_similarity/", mode: 'copy' - conda "${path_env}" + //conda "${path_env}" input: set name, file (meme) ,file (merged_meme) from to_find_similar_motifs @@ -428,7 +442,7 @@ Label first column of TSV-file with Cluster ID process label_cluster { tag{name} - conda "${path_env}" + //conda "${path_env}" input: set name, file (tsv) from motif_similarity @@ -479,7 +493,7 @@ Merging FASTA-files of similar clusters */ process merge_fasta { - conda "${path_env}" + //conda "${path_env}" publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_fasta/", mode: 'copy' input: @@ -508,16 +522,17 @@ Run GLAM2 on emrged FASTA-files */ process clustered_glam2 { - publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/glam2/${name}/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/glam2/", mode: 'copy' tag{name} - conda "${path_env}" + //conda "${path_env}" input: file (fasta) from motif_clustered_fasta_flat output: - set name, file ('*.meme') into clustered_meme_for_tomtom - set name, file ('*.meme') into clustered_meme_for_filter + set name, file ("/${name}/*.meme") into clustered_meme_for_tomtom + set name, file ("/${name}/*.meme") into clustered_meme_for_filter + set name, file("/${name}/glam2.txt") into clust_glam_for_seq file('*') when: @@ -526,7 +541,7 @@ process clustered_glam2 { script: name = fasta.getBaseName() """ - glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} + glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} """ } @@ -537,9 +552,11 @@ If motif clustering is activated or not. if(params.cluster_motif == 1){ for_tomtom = clustered_meme_for_tomtom for_filter = clustered_meme_for_filter + seq_to_json = clust_glam_for_seq } else { for_tomtom = meme_for_tomtom for_filter = meme_for_filter + seq_to_json = glam_for_seq } @@ -551,7 +568,7 @@ process tomtom { tag{name} publishDir "${params.out}/2.2_motif_estimation/tomtom/", mode: 'copy' - conda "${path_env}" + //conda "${path_env}" input: set name, file (meme) from for_tomtom @@ -568,14 +585,36 @@ process tomtom { //Joining channels with meme and tsv files. Filter joined channel on line count. //Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel. -for_filter2 = for_filter.join( tsv_for_filter ) +for_filter.join( tsv_for_filter ).into {for_filter2; for_log} for_filter2 .filter { name, meme, tsv -> long count = tsv.readLines().size() count <= 1 } - .into { meme_for_scan; check } + .into { meme_for_scan; check; num_cluster } +count_cluster = num_cluster.count() +count_cluster_before_filter = for_log.count() + +log22.combine(count_cluster).combine(count_cluster_before_filter ).set {log22_final} + +process write_log_for_motif_estimation { + + publishDir "${params.out}/2.2_motif_estimation/log/", mode: 'copy' + + input: + set file (logfile), after_filter, before_filter from log22_final + + output: + file ('*.log') + + script: + removed = before_filter - after_filter + """ + cat ${logfile} > motif_estimation.log + printf "\nMotifs found in Database: $removed\nNumber of remaining unknown motifs/cluster: $after_filter" >> motif_estimation.log + """ +} //If channel 'check' is empty print errormessage process check_for_unknown_motifs { @@ -594,10 +633,10 @@ process check_for_unknown_motifs { } -//Get the best(first) Motif from each MEME-file +//Get the best (first) X Motifs from each MEME-file process get_best_motif { - conda "${path_env}" + //conda "${path_env}" tag{name} publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/", mode: 'copy' @@ -608,8 +647,29 @@ process get_best_motif { set name, file('*_best.meme') into best_motif script: + cluster_id = name.split('_')[-1] + """ + python ${path_bin}/2.2_motif_estimation/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif} ${cluster_id} + """ +} + + +process get_best_motif_seq { + + //conda "${path_env}" + tag{name} + publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/motif_sequences/", mode: 'copy' + + input: + set name, file (txt) from seq_to_json + + output: + file("${name}_seq.json") + + script: + cluster_id = name.split('_')[-1] """ - python ${path_bin}/2.2_motif_estimation/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif} + Rscript ${path_bin}/2.2_motif_estimation/get_motif_seq.R -i ${txt} -o ${name}_seq.json -n ${params.best_motif} -t ${path_bin}/2.2_motif_estimation/tmp/cluster_${cluster_id} -c ${cluster_id} """ } @@ -618,7 +678,7 @@ best_motif.combine(fa_scan).set {files_for_genome_scan} /* process genome_scan { - conda "${path_env}" + //conda "${path_env}" input: set name, file(meme), file(fasta) from files_for_genome_scan @@ -647,7 +707,7 @@ process cluster_quality { process create_GTF { - conda "${path_env}" + //conda "${path_env}" publishDir "${params.out}/3.1_create_gtf/", mode: 'copy'