Merge branch 'dev' of https://github.molgen.mpg.de/loosolab/masterJLU…

…2018 into dev
loosolab · Dec 10, 2018 · 037b2bf · 037b2bf
2 parents b45f676 + ffed012
commit 037b2bf
Show file tree

Hide file tree

Showing 10 changed files with 80 additions and 56 deletions.
diff --git a/README.md b/README.md
@@ -20,7 +20,7 @@ export PATH=[meme-suite instalation path]/bin:$PATH
 
 Download all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018). 
 The Nextflow-script needs a conda environment to run. Nextflow can create the needed environment from the given yaml-file.
-On some systems Nrxtflow exits the run with following error:
+On some systems Nextflow exits the run with the following error:
 ```
 Caused by:
   Failed to create Conda environment
@@ -42,19 +42,20 @@ When the enviroment is created, set the variable 'path_env' in the configuration
 nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --jaspar_db [MEME-file]
 ```
 ## Parameters
+For a detailed overview of all parameters, follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration).
 ```
 Required arguments:
-	--input Path to BigWig-file
-	--bed Path to BED-file
+	--input Path to BigWig-file with scores on the peaks of interest
+	--bed Path to BED-file with peaks of interest corresponding to the BigWig file
 	--genome_fasta Path to genome in FASTA-format
 	--jaspar_db Path to motif-database in MEME-format
 
 
 Optional arguments:
 	Footprint extraction:
-	--window_length INT (Default: 200)
-	--step INT (Default: 100)
-	--percentage INT(Default: 0)
+	--window_length INT (Default: 200) length of the sliding window
+	--step INT (Default: 100) interval by which the window slides
+	--percentage INT (Default: 0) percentage to be added to the background while searching for footprints
 
 	Filter unknown motifs:
 	--min_size_fp INT (Default: 10)
@@ -81,7 +82,7 @@ Optional arguments:
 	--interation INT	Number of iterations done by glam2. More iterations: better results, higher runtime. (Default: 10000)
 	--tomtom_treshold float	Threshold for similarity score. (Default: 0.01)
 
-	Moitf clustering:
+	Motif clustering:
 	--edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5)
 	--motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001)
 

diff --git a/bin/Modules/Ensembl/ActivityCategorizer.py b/bin/Modules/Ensembl/ActivityCategorizer.py
@@ -4,7 +4,7 @@
 
 class ActivityCategorizer:
 
-    def __init__(self, release, organism, wd):
+    def __init__(self, release, organism, wd, data_dir):
 
         # List of all Folders with Activity Tables
 
@@ -18,7 +18,7 @@ def __init__(self, release, organism, wd):
 
         self.activity = {}
 
-        self.get_activity_data(release, organism, wd)
+        self.get_activity_data(release, organism, wd, data_dir)
 
         # Categorized Activity from Json-config
         print("Categorization: This may take a while")
@@ -41,11 +41,14 @@ def read_config(self, organism, wd):
 
         return c_dict
 
-    def get_activity_data(self, release, organism, wd):
+    def get_activity_data(self, release, organism, wd, data_dir):
 
         for folder in self.folderlist:
             # Generate path to binary File
-            file = os.path.join(wd + "/EnsemblData", release, organism, "activity", folder, "table.bin")
+            if data_dir:
+                file = os.path.join(data_dir + "/EnsemblData", release, organism, "activity", folder, "table.bin")
+            else:
+                file = os.path.join(wd + "/data/EnsemblData", release, organism, "activity", folder, "table.bin")
             with open(file, "rb") as tables:
                 self.activity[folder] = bytearray(tables.read())
 

diff --git a/bin/Modules/Ensembl/ActivityTable.py b/bin/Modules/Ensembl/ActivityTable.py
@@ -15,8 +15,11 @@ class ActivityTable:
     4, "activity=NA"
     """
 
-    def __init__(self, organism, current_release, wd):
-        self.link = os.path.join(wd + "/EnsemblData/", current_release, organism, "activity")
+    def __init__(self, organism, current_release, wd, data_dir):
+        if data_dir:
+            self.link = os.path.join(data_dir + "/EnsemblData/", current_release, organism, "activity")
+        else:
+            self.link = os.path.join(wd + "/data/EnsemblData/", current_release, organism, "activity")
         self.folders = next(os.walk(self.link))[1]
         self.generator = ATGenerator(["activity=ACTIVE",
                                "activity=POISED",

diff --git a/bin/Modules/Ensembl/Ensembl.py b/bin/Modules/Ensembl/Ensembl.py
@@ -6,15 +6,15 @@
 
 class Ensembl:
 
-    def __init__(self, organism, wd):
+    def __init__(self, organism, wd, data_dir):
         print("Starting Ensembl")
-        self.updater = FTPRetriever(organism, wd)
+        self.updater = FTPRetriever(organism, wd, data_dir)
         self.release = self.updater.get_release()
-        self.acttable = ActivityTable(organism, self.release, wd)
+        self.acttable = ActivityTable(organism, self.release, wd, data_dir)
         self.acttable.check_and_generate_activity_table()
-        self.categorizer = ActivityCategorizer(self.release, organism, wd)
+        self.categorizer = ActivityCategorizer(self.release, organism, wd, data_dir)
         print("Generating GTF")
-        self.gtf_generator = GTFGen(organism, self.release, wd)
+        self.gtf_generator = GTFGen(organism, self.release, wd, data_dir)
 
         print("Ensembl Finished !")
 

diff --git a/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py b/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py
@@ -15,8 +15,8 @@ def change_dir(self, wd):
     def get_all_entries(self):
         return self.ftp.nlst()
 
-    def get_all_entries_from_dir(self, dir):
-        self.change_dir(dir)
+    def get_all_entries_from_dir(self, dire):
+        self.change_dir(dire)
         return self.get_all_entries()
 
     def get_all_entries_as_FTPEntry(self):

diff --git a/bin/Modules/Ensembl/FTPHandling/VersionChecker.py b/bin/Modules/Ensembl/FTPHandling/VersionChecker.py
@@ -9,12 +9,12 @@ class EnsemblRegulationFTPRetriever:
     And downloading newest version if necessary
     """
 
-    def __init__(self, organism, wd):
+    def __init__(self, organism, wd, data_dir):
         self.site_ftp = FTPHandler("ftp.ensembl.org", "pub")
         self.remoteversion = self.get_current_ftp_version()
-        self.localversion = self.get_current_local_version(wd)
-        if self.check_version_difference(organism, wd):
-            self.download_currentversion_version(self.remoteversion, organism, wd)
+        self.localversion = self.get_current_local_version(wd, data_dir)
+        if self.check_version_difference(organism, wd, data_dir):
+            self.download_currentversion_version(self.remoteversion, organism, wd, data_dir)
         else:
             print("Newest Version installed, no update needed.")
 
@@ -31,15 +31,25 @@ def get_current_ftp_version(self):
         print("Current release is "+c_release)
         return c_release
 
-    def check_organism(self, organism, release, wd):
-        if organism in next(os.walk(os.path.join(wd+"/EnsemblData/"+release+"/")))[1]:
-            return False
+    def check_organism(self, organism, release, wd, data_dir):
+        if data_dir:
+            if organism in next(os.walk(os.path.join(data_dir+"/EnsemblData/"+release+"/")))[1]:
+                return False
+            else:
+                print("No Local Version for "+organism+" installed. Installing...")
+                return True
         else:
-            print("No Local Version for "+organism+" installed. Installing...")
-            return True
+            if organism in next(os.walk(os.path.join(wd+"/data/EnsemblData/"+release+"/")))[1]:
+                return False
+            else:
+                print("No Local Version for "+organism+" installed. Installing...")
+                return True
 
-    def get_current_local_version(self, wd):
-        directories = next(os.walk(os.path.join(wd+"/EnsemblData/")))[1]
+    def get_current_local_version(self, wd, data_dir):
+        if data_dir:
+            directories = next(os.walk(os.path.join(data_dir + "/EnsemblData/")))[1]
+        else:
+            directories = next(os.walk(os.path.join(wd+"/data/EnsemblData/")))[1]
         for dir in directories:
             if "release" in dir:
                 localversion = sorted(directories, reverse=True)[0]
@@ -51,7 +61,7 @@ def get_current_local_version(self, wd):
         print("No Version installed !")
         return None
 
-    def check_version_difference(self, organism, wd):
+    def check_version_difference(self, organism, wd, data_dir):
 
         local_version = self.localversion
         remote_version = self.remoteversion
@@ -64,16 +74,18 @@ def check_version_difference(self, organism, wd):
             print("Outdated Version detected ! local: " + local_version + " remote: " + remote_version)
             return True
         else:
-            if self.check_organism(organism, local_version, wd):
+            if self.check_organism(organism, local_version, wd, data_dir):
                 return True
             else:
                 return False
 
-    def download_currentversion_version(self, version, organism, wd):
+    def download_currentversion_version(self, version, organism, wd, data_dir):
 
         # Download Base File
-
-        targetfolder = os.path.join(wd+"/EnsemblData/", version, organism)
+        if data_dir:
+            targetfolder = os.path.join(data_dir + "/EnsemblData/", version, organism)
+        else:
+            targetfolder = os.path.join(wd+"/data/EnsemblData/", version, organism)
         os.makedirs(targetfolder)
         folder_url = "/pub/"+version+"/regulation/"+organism+"/"
         self.site_ftp.change_dir(folder_url)

diff --git a/bin/Modules/Ensembl/GTFGen.py b/bin/Modules/Ensembl/GTFGen.py
@@ -5,14 +5,16 @@
 
 class GTFGen:
 
-    def __init__(self, organism, release, wd):
+    def __init__(self, organism, release, wd, data_dir):
 
-        self.gff_lines = self.get_organism_as_gff(organism, release, wd)
+        self.gff_lines = self.get_organism_as_gff(organism, release, wd, data_dir)
         self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"}
 
-    def get_organism_as_gff(self, organism, release, wd):
-
-        directory = os.path.join(wd + "/EnsemblData/", release, organism)
+    def get_organism_as_gff(self, organism, release, wd, data_dir):
+        if data_dir:
+            directory = os.path.join(data_dir + "/EnsemblData/", release, organism)
+        else:
+            directory = os.path.join(wd + "/data/EnsemblData/", release, organism)
         inputfile = ""
         for file in os.listdir(directory):
             if file.endswith("gff.gz"):

diff --git a/bin/Modules/ucsc/ucsc.py b/bin/Modules/ucsc/ucsc.py
@@ -7,10 +7,13 @@
 
 class UcscGtf:
 
-    def __init__(self, org, wd):
+    def __init__(self, org, wd, data_dir):
         self.organism_id = self.get_organism_id(org)
         self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb"
-        self.output = os.path.join(wd + "/UCSCData/"+self.organism_id+".bed")
+        if data_dir:
+            self.output = os.path.join(data_dir + "/UCSCData" + self.organism_id+".bed")
+        else:
+            self.output = os.path.join(wd + "/data/UCSCData/" + self.organism_id+".bed")
         self.path_to_bin = os.path.join(wd + "/Modules/ucsc/bigBedToBed")
         print("Getting UCSC Data")
         print("Path to Bin: " + self.path_to_bin)

diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py
@@ -13,14 +13,14 @@ def check_for_local_folder(wd):
 
     if not os.path.isdir(os.path.join(wd+"/EnsemblData")):
 
-        os.mkdir(os.path.join(wd+"/EnsemblData"))
+        os.mkdir(os.path.join(wd+"/data/EnsemblData"))
 
-    if not os.path.isdir(os.path.join(wd+"/UCSCData" )):
-        os.mkdir(os.path.join(wd+"/UCSCData" ))
+    if not os.path.isdir(os.path.join(wd+"/data/UCSCData")):
+        os.mkdir(os.path.join(wd+"/data/UCSCData"))
 
 
 def check_filter(tissue_cmd, org, wd):
-    path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json" )
+    path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json")
     tissues_config = []
     if not tissue_cmd:
         return False
@@ -36,18 +36,18 @@ def check_filter(tissue_cmd, org, wd):
         return False
 
 
-def main_script(org, wd, tissuetype=None):
-
-    check_for_local_folder(wd)
+def main_script(org, wd, data_dir, tissuetype=None):
+    if not data_dir:
+        check_for_local_folder(wd)
     if check_filter(tissuetype, org, wd):
         tissues = tissuetype
         print("Filter detected !")
     else:
         tissues = None
         print("Filter not detected !")
 
-    ucsc = UcscGtf(org, wd)
-    ense = Ensembl(org, wd)
+    ucsc = UcscGtf(org, wd, data_dir)
+    ense = Ensembl(org, wd, data_dir)
     print("Getting Unique Results")
     unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissues)
     ResultSaver(unique_filter.get_results(), org, tissues)
@@ -57,12 +57,12 @@ def main_script(org, wd, tissuetype=None):
     parser = argparse.ArgumentParser(description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build' )
     parser.add_argument('organism', help='Source organism [ homo_sapiens or mus_musculus ]', action='store', nargs='?', type=str)
     parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str)
-    parser.add_argument('--wd', help='Working directory. default: "."', action='store', default='.', type=str)
+    parser.add_argument('--wd', help='Working directory. default: "."', action='store', default=os.getcwd(), type=str)
+    parser.add_argument('--dir', help='Data directory. default: "working_directory"', action='store', default="", type=str)
     args = vars(parser.parse_args())
-    print("Working Dir: " + args["wd"])
     if args["organism"]:
         print("Working Dir: " + args["wd"])
-        main_script(args["organism"], args["wd"], args["tissue"])
+        main_script(args["organism"], args["wd"], args["dir"], args["tissue"])
     else:
         print("No Arguments found -> See python3 ./RegGTFExtractor.py -h for help.")
 
diff --git a/pipeline.nf b/pipeline.nf
@@ -117,7 +117,7 @@ All arguments can be set in the configuration files.
 
 bigwig_input.combine(bed_input).set{footprint_in}
 /*
-
+This process uses the uncontinuous score from a bigWig file to estimate footprints within peaks of interest.
 */
 process footprint_extraction {
 	conda "${path_env}"