Merge pull request #12 from loosolab/motif_estiamtion

Motif estiamtion
loosolab · Dec 20, 2018 · 80963a3 · 80963a3
2 parents 8faf399 + 5e46266
commit 80963a3
Show file tree

Hide file tree

Showing 5 changed files with 145 additions and 71 deletions.
diff --git a/README.md b/README.md
@@ -7,38 +7,16 @@ For further information read the [documentation](https://github.molgen.mpg.de/lo
 ## Dependencies
 * [conda](https://conda.io/docs/user-guide/install/linux.html)
 * [Nextflow](https://www.nextflow.io/)
-* [MEME-Suite](http://meme-suite.org/doc/install.html?man_type=web)
 
 ## Installation
-Start with installing all dependencies listed above. It is required to set the [enviroment paths for meme-suite](http://meme-suite.org/doc/install.html?man_type=web#installingtar).
-this can be done with following commands:
-```
-export PATH=[meme-suite instalation path]/libexec/meme-[meme-suite version]:$PATH
-export PATH=[meme-suite instalation path]/bin:$PATH
-```
-
+Start with installing all dependencies listed above (Nextflow, conda) and downloading all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018).
 
-Download all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018).
-The Nextflow-script needs a conda enviroment to run. Nextflow can create the needed enviroment from the given yaml-file.
-On some systems Nextflow exits the run with following error:
-```
-Caused by:
-  Failed to create Conda environment
-  command: conda env create --prefix  --file env.yml
-  status : 143
-  message:
-```
-If this error occurs you have to create the enviroment before starting the pipeline.
-To create this enviroment you need the yml-file from the repository.
-Run the following commands to create the enviroment:
-```console
-path=[Path to given masterenv.yml file]
-conda env create --name masterenv -f=$path
-```
-When the enviroment is created, set the variable 'path_env' in the configuration file as the path to it.
+Every other dependency will be automatically  installed by Nextflow using conda. For that a new conda enviroment will be created, which can be found in the from Nextflow created work directory after the first pipeline run. 
+It is **not** required to create and activate the enviroment from the yaml-file beforehand.
 
 **Important Note:** For conda the channel bioconda needs to be set as highest priority! This is required due to two differnt packages with the same name in different channels. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfisch package from the channel conda-forge!
 
+
 ## Quick Start
 ```console
 nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file]
@@ -105,6 +83,24 @@ Optional arguments:
 All arguments can be set in the configuration files
  ```
 
+For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki)
 
+## Known issues
+The Nextflow-script needs a conda enviroment to run. Nextflow creates the needed enviroment from the given yaml-file.
+On some systems Nextflow exits the run with following error:
+```
+Caused by:
+  Failed to create Conda environment
+  command: conda env create --prefix  --file env.yml
+  status : 143
+  message:
+```
+If this error occurs you have to create the enviroment before starting the pipeline.
+To create this enviroment you need the yml-file from the repository.
+Run the following commands to create the enviroment:
+```console
+path=[Path to given masterenv.yml file]
+conda env create --name masterenv -f $path
+```
+When the enviroment is created, set the variable 'path_env' in the configuration file as the path to it.
 
-For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki)
diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R
@@ -1,28 +1,58 @@
-#!/usr/bin/env Rscript
-
-# Splitting BED-files depending on their cluster.
-# The Sequences of each cluster are writen as an FASTA-file.
-# @parameter bedInput <string> BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8
-# @parameter prefix <string> prefix for filenames
-# @parameter min_seq <INT> min. number of sequences per cluster
-
-args = commandArgs(trailingOnly = TRUE)
-
-bedInput <- args[1]
-prefix <- args[2]
-min_seq <- args[3]
-
-bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
-
-clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Cluster column
-discard <- lapply(1:length(clusters), function(i){
-  clust <- as.data.frame(clusters[i])
-  print(nrow(clust))
-  if (nrow(clust) >= as.numeric(min_seq) ) {
-    sequences <- as.list(clust[[10]])   # <---- sequenze column
-    outfile <- paste0(prefix,"_cluster_",i,".FASTA")
-    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column
-  } else {
-    print(paste0("Cluster: ",i," is to small"))
-  }
-})
+#!/usr/bin/env Rscript
+library("optparse")
+
+option_list <- list(
+  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"),
+  make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"),
+  make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer")
+)
+
+opt_parser <- OptionParser(option_list = option_list, 
+                           description = "Convert BED-file to one FASTA-file per cluster")
+
+opt <- parse_args(opt_parser)
+
+#' Splitting BED-files depending on their cluster.
+#' The Sequences of each cluster are written as an FASTA-file.
+#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns:
+#'                              Sequence: second last column; Cluster ID: last column
+#' @param prefix <string> prefix for filenames
+#' @param min_seq <INT> min. number of sequences per cluster
+#'
+#' @author René Wiegandt
+#' @contact rene.wiegandt(at)mpi-bn.mpg.de
+bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
+
+  if (is.null(bedInput)) {
+    stop("ERROR: Input parameter cannot be null! Please specify the input parameter.")
+  }
+
+  bed <- data.table::fread(bedInput, sep = "\t")
+
+  # Get last column of data.table, which refers to the cluster, as a vector.
+  cluster_no <- as.vector(bed[[ncol(bed)]])
+
+  # Split data.table bed on its last column (cluster_no) into list of data.frames
+  clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)
+
+  # For each data.frame(cluster) in list clusters:
+  discard <- lapply(1:length(clusters), function(i){
+    clust <- as.data.frame(clusters[i])
+    # Filter data.tables(clusters), which are to small
+    if (nrow(clust) >= as.numeric(min_seq) ) {
+      # Get second last column, which contains the nucleotide sequences
+      sequences <- as.list(clust[[ncol(clust) - 1]])
+      # Create filename
+      outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
+      # Write fasta file
+      seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
+    } else {
+      print(paste0("Cluster: ",i," is to small"))
+    }
+  })
+}
+
+# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell)
+if (!interactive()) {
+  bed_to_fasta(opt$input, opt$prefix, opt$min_seq)
+}
diff --git a/bin/get_best_motif.py b/bin/get_best_motif.py
@@ -1,26 +1,64 @@
-# parses arguments using argparse
-# @return args list of all parameters
+'''
+parses arguments using argparse
+@return args list of all parameters
+'''
 def parse_arguments():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("meme", help="Path to meme file")
+    parser = argparse.ArgumentParser(description='A script to convert from GLAM2 output to MEME-format and parsing only the [num] first motifs from file to the output.')
+    parser.add_argument("meme", help="Path to 'meme' file generated by GLAM2")
     parser.add_argument("output", help="Output file")
     parser.add_argument("num", help="Number of motifs parsed from file")
     args = parser.parse_args()
     return args
 
-# write lines of file till certain line (MOTIF + [num])
+'''
+The script has two functions:
+    1. Writing lines of file till certain line (MOTIF + [num])
+    2. Converting GLAM2 output to minimal meme-format
+@params meme STRING Path to 'meme' file generated from Meme suite
+@parmas output STRING Output file
+@params num INT Number of motifs parsed from file
+
+@author René Wiegandt
+@contact rene.wiegandt(at)mpi-bn.mpg.de
+'''
 def main():
+
     args = parse_arguments()
     out = open(args.output, "w+")
+
+    '''
+    Create pattern where script should stop writing
+    For Example:
+    If num == 3, which means that you want the first/best 3 Motifs, the script
+    should stop writing lines to output if loop reaches line 'MOTIF  4'
+    '''
     number = int(args.num) + 1
-    motif = "MOTIF " + str(number)
+    break_header = "MOTIF  " + str(number)
+
+    # Pattern for motif header
+    pattern = re.compile("^MOTIF\s{2}(\d)+")
+    # Init count
+    count = 0
+
     with open(args.meme) as f:
         for line in f:
-            if motif in line:
+            ## do not write [count] lines after each header -> needed for meme-format
+            if count > 0:
+                count-=1
+                continue
+            if pattern.match(line):
+                # if line is a motif header
+                count = 2
+            ##
+
+            if break_header in line:
+                # line matches breaking_header, e.g. 'MOTIF  4'
                 break
-            out.write(line)
+            else:
+                out.write(line)
 
 
 if __name__ == "__main__":
     import argparse
+    import re
     main()
diff --git a/masterenv.yml b/masterenv.yml
@@ -14,7 +14,6 @@ dependencies:
   - r-stringr
   - r-optparse
   - bioconductor-iranges
-  - snakemake
   - meme
   - moods
   - biopython

diff --git a/pipeline.nf b/pipeline.nf
@@ -9,6 +9,7 @@
 	params.tfbs_path=""
 	params.create_known_tfbs_path = "./"
 	params.help = 0
+	params.get_path=""
 	params.out = "./out/"
 
 //peak_calling
@@ -55,10 +56,10 @@
 	params.best_motif = 3 // Top n motifs per cluster
 
 //creating_gtf
-	params.organism="hg38"
+	params.organism=""
 	params.tissue=""
 
-if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" != "0"){
+if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" != "0"){
 log.info """
 Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file]
 
@@ -68,14 +69,16 @@ Required arguments:
 	--genome_fasta		 Path to genome in FASTA-format
 	--motif_db		 Path to motif-database in MEME-format
 	--config		 Path to UROPA configuration file
-	--create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored.
-				 Path can be set as tfbs_path in next run. (Default: './')
+ 	--organism 		 Input organism [hg38 | hg19 | mm9 | mm10]
 	--out			 Output Directory (Default: './out/')
 
 Optional arguments:
 
 	--help [0|1]		1 to show this help message. (Default: 0)
 	--tfbs_path 		Path to directory with output from tfbsscan. If given tfbsscan will not be run.
+	--create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored.
+				 Path can be set as tfbs_path in next run. (Default: './')
+	--gtf_path			Path to gtf-file. If path is set the process which creats a gtf-file is skipped.
 
 	Footprint extraction:
 	--window_length INT	This parameter sets the length of a sliding window. (Default: 200)
@@ -115,7 +118,6 @@ Optional arguments:
 	--motif_similarity_thresh FLOAT	Threshold for motif similarity score (Default: 0.00001)
 
 	Creating GTF:
-	--organism [hg38 | hg19 | mm9 | mm10]	Input organism
 	--tissues List/String 	List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON
 				config
 All arguments can be set in the configuration files
@@ -324,7 +326,7 @@ process bed_to_clustered_fasta {
 
     script:
     """
-    Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq}
+    Rscript ${path_bin}/bed_to_fasta.R -i ${bed} -p ${name} -m ${params.min_seq}
     """
 }
 
@@ -578,14 +580,23 @@ process create_GTF {
 	publishDir "${params.out}/gtf/", mode: 'copy'
 
 	output:
-	file ('*.gtf') into gtf_for_uropa
+	file ('*.gtf') into gtf
+
+	when:
+	gtf_path == ""
 
 	script:
 	"""
 	python ${path_bin}/RegGTFExtractor.py ${params.organism} --tissue ${params.tissues} --wd ${path_bin}
 	"""
 }
 
+if (gtf_path == "") {
+	gtf_for_uropa = gtf
+} else {
+	gtf_for_uropa = Channel.fromPath(params.gtf_path)
+}
+
 /*
 bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in}