Merge remote-tracking branch 'origin/dev' into cluster

# Conflicts: # config/cluster.config
loosolab · Dec 6, 2018 · 51d7efe · 51d7efe
2 parents b769d49 + a62154c
commit 51d7efe
Show file tree

Hide file tree

Showing 14 changed files with 1,062 additions and 220 deletions.
diff --git a/.gitignore b/.gitignore
diff --git a/README.md b/README.md
@@ -10,19 +10,66 @@ De novo motif discovery and evaluation based on footprints identified by TOBIAS
 ## Installation
 Start with installing all dependencies listed above.
 Download all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018). 
-The Nextflow-script needs a conda enviroment to run. To create this enviroment you need the yml-file from the repository.
+The Nextflow-script needs a conda environment to run. Nextflow can create the needed environment from the given yaml-file.
+On some systems Nextflow exits the run with the following error:
+```
+Caused by:
+  Failed to create Conda environment
+  command: conda env create --prefix  --file env.yml
+  status : 143
+  message:
+```
+If this error occurs you have to create the environment before starting the pipeline.
+To create this environment you need the yml-file from the repository.
 Run the following commands to create the environment:
 ```console
 path=[Path to given masterenv.yml file]
 conda env create --name masterenv -f=$path
-source activate masterenv
 ```
+When the environment is created, set the variable 'path_env' in the configuration file to the path of the environment.
 
 ## Quick Start
 ```console
-nextflow run pipeline.nf --input [INPUT-file] --bed [INPUT-bed] --genome_fasta [path to file] --jaspar_db [path to motif database as meme-file] --config uropa.config
+nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --jaspar_db [MEME-file]
+```
+## Parameters
 ```
-## Parameter
+Required arguments:
+	--input Path to BigWig-file
+	--bed Path to BED-file
+	--genome_fasta Path to genome in FASTA-format
+	--jaspar_db Path to motif-database in MEME-format
+
+
+Optional arguments:
+	Footprint extraction:
+	--window_length INT (Default: 200)
+	--step INT (Default: 100)
+	--percentage INT (Default: 0)
+
+	Filter unknown motifs:
+	--min_size_fp INT (Default: 10)
+	--max_size_fp INT (Default: 100)
+
+	Clustering:
+	--sequence_coverage INT (Default: 8)
+	--kmer INT (Default: 10)
+	--aprox_motif_len INT (Default: 10)
+
+	Motif estimation:
+	--motif_min_len INT	Minimum length of Motif (Default: 8)
+	--motif_max_len INT	Maximum length of Motif (Default: 20)
+	--interation INT	Number of iterations done by glam2. More iterations: better results, higher runtime. (Default: 10000)
+	--tomtom_treshold FLOAT	Threshold for similarity score. (Default: 0.01)
+
+	Creating GTF:
+	--organism [homo_sapiens | mus_musculus]
+	--tissues
+  
+ All arguments can be set in the configuration files.
+ ```
+
+
 
 For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki)
 
diff --git a/bin/Modules/SaveResults.py b/bin/Modules/SaveResults.py
@@ -4,14 +4,14 @@
 
 class ResultSaver:
 
-    def __init__(self, results, organism, tissue, wd):
+    def __init__(self, results, organism, tissue):
 
         print("Save results to File !")
         self.path = ""
         if tissue:
-            self.path = os.path.join(wd+"/"+organism+"_filtered.gtf")
+            self.path = os.path.join("./"+organism+"_filtered.gtf")
         else:
-            self.path = os.path.join(wd+"/"+organism+".gtf")
+            self.path = os.path.join("./"+organism+".gtf")
 
         with open(self.path, "w") as file:
             write_it = csv.writer(file, delimiter='\t')

diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py
@@ -6,6 +6,7 @@
 from Modules.Uniquifier import UniqueFilter
 from Modules.SaveResults import ResultSaver
 import os
+import json
 
 
 def check_for_local_folder(wd):
@@ -18,14 +19,38 @@ def check_for_local_folder(wd):
         os.mkdir(os.path.join(wd+"/UCSCData" ))
 
 
+def check_filter(tissue_cmd, org, wd):
+    path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json" )
+    tissues_config = []
+    if not tissue_cmd:
+        return False
+    with open(path_to_config) as input_file:
+        data = json.loads(input_file.read())
+        for x in data:
+            tissues_config.append(x["type"])
+
+    if any(tissue in tissues_config for tissue in tissue_cmd):
+        return True
+
+    else:
+        return False
+
+
 def main_script(org, wd, tissuetype=None):
 
     check_for_local_folder(wd)
+    if check_filter(tissuetype, org, wd):
+        tissues = tissuetype
+        print("Filter detected !")
+    else:
+        tissues = None
+        print("Filter not detected !")
+
     ucsc = UcscGtf(org, wd)
     ense = Ensembl(org, wd)
     print("Getting Unique Results")
-    unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissuetype)
-    ResultSaver(unique_filter.get_results(), org, tissuetype, wd)
+    unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissues)
+    ResultSaver(unique_filter.get_results(), org, tissues)
 
 
 if __name__ == '__main__':
@@ -34,9 +59,10 @@ def main_script(org, wd, tissuetype=None):
     parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str)
     parser.add_argument('--wd', help='Working directory. default: "."', action='store', default='.', type=str)
     args = vars(parser.parse_args())
+    print("Working Dir: " + args["wd"])
     if args["organism"]:
         print("Working Dir: " + args["wd"])
         main_script(args["organism"], args["wd"], args["tissue"])
     else:
-        print("No Arguments found -> See ./RegGTFExtractor.py -h for help.")
+        print("No Arguments found -> See python3 ./RegGTFExtractor.py -h for help.")