From 638133b9ccd9c61513ec36c6097f3fbb5097bc7a Mon Sep 17 00:00:00 2001 From: basti Date: Mon, 10 Dec 2018 13:27:40 +0100 Subject: [PATCH 1/7] Implemented --dir parameter for remote data directory --- bin/Modules/Ensembl/ActivityCategorizer.py | 11 +++-- bin/Modules/Ensembl/ActivityTable.py | 7 ++- bin/Modules/Ensembl/Ensembl.py | 10 ++--- .../Ensembl/FTPHandling/URLRetrieve.py | 4 +- .../Ensembl/FTPHandling/VersionChecker.py | 44 ++++++++++++------- bin/Modules/Ensembl/GTFGen.py | 12 ++--- bin/Modules/ucsc/ucsc.py | 7 ++- bin/RegGTFExtractor.py | 24 +++++----- 8 files changed, 71 insertions(+), 48 deletions(-) mode change 100644 => 100755 bin/RegGTFExtractor.py diff --git a/bin/Modules/Ensembl/ActivityCategorizer.py b/bin/Modules/Ensembl/ActivityCategorizer.py index 3ef0258..bd82a92 100644 --- a/bin/Modules/Ensembl/ActivityCategorizer.py +++ b/bin/Modules/Ensembl/ActivityCategorizer.py @@ -4,7 +4,7 @@ class ActivityCategorizer: - def __init__(self, release, organism, wd): + def __init__(self, release, organism, wd, data_dir): # List of all Folders with Activity Tables @@ -18,7 +18,7 @@ def __init__(self, release, organism, wd): self.activity = {} - self.get_activity_data(release, organism, wd) + self.get_activity_data(release, organism, wd, data_dir) # Categorized Activity from Json-config print("Categorization: This may take a while") @@ -41,11 +41,14 @@ def read_config(self, organism, wd): return c_dict - def get_activity_data(self, release, organism, wd): + def get_activity_data(self, release, organism, wd, data_dir): for folder in self.folderlist: # Generate path to binary File - file = os.path.join(wd + "/EnsemblData", release, organism, "activity", folder, "table.bin") + if data_dir: + file = os.path.join(data_dir + "/EnsemblData", release, organism, "activity", folder, "table.bin") + else: + file = os.path.join(wd + "/data/EnsemblData", release, organism, "activity", folder, "table.bin") with open(file, "rb") as tables: self.activity[folder] = 
bytearray(tables.read()) diff --git a/bin/Modules/Ensembl/ActivityTable.py b/bin/Modules/Ensembl/ActivityTable.py index 34f4719..040d15b 100644 --- a/bin/Modules/Ensembl/ActivityTable.py +++ b/bin/Modules/Ensembl/ActivityTable.py @@ -15,8 +15,11 @@ class ActivityTable: 4, "activity=NA" """ - def __init__(self, organism, current_release, wd): - self.link = os.path.join(wd + "/EnsemblData/", current_release, organism, "activity") + def __init__(self, organism, current_release, wd, data_dir): + if data_dir: + self.link = os.path.join(data_dir + "/EnsemblData/", current_release, organism, "activity") + else: + self.link = os.path.join(wd + "/data/EnsemblData/", current_release, organism, "activity") self.folders = next(os.walk(self.link))[1] self.generator = ATGenerator(["activity=ACTIVE", "activity=POISED", diff --git a/bin/Modules/Ensembl/Ensembl.py b/bin/Modules/Ensembl/Ensembl.py index f0a922c..de5839d 100644 --- a/bin/Modules/Ensembl/Ensembl.py +++ b/bin/Modules/Ensembl/Ensembl.py @@ -6,15 +6,15 @@ class Ensembl: - def __init__(self, organism, wd): + def __init__(self, organism, wd, data_dir): print("Starting Ensembl") - self.updater = FTPRetriever(organism, wd) + self.updater = FTPRetriever(organism, wd, data_dir) self.release = self.updater.get_release() - self.acttable = ActivityTable(organism, self.release, wd) + self.acttable = ActivityTable(organism, self.release, wd, data_dir) self.acttable.check_and_generate_activity_table() - self.categorizer = ActivityCategorizer(self.release, organism, wd) + self.categorizer = ActivityCategorizer(self.release, organism, wd, data_dir) print("Generating GTF") - self.gtf_generator = GTFGen(organism, self.release, wd) + self.gtf_generator = GTFGen(organism, self.release, wd, data_dir) print("Ensembl Finished !") diff --git a/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py b/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py index 598a6f5..43feb54 100644 --- a/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py +++ 
b/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py @@ -15,8 +15,8 @@ def change_dir(self, wd): def get_all_entries(self): return self.ftp.nlst() - def get_all_entries_from_dir(self, dir): - self.change_dir(dir) + def get_all_entries_from_dir(self, dire): + self.change_dir(dire) return self.get_all_entries() def get_all_entries_as_FTPEntry(self): diff --git a/bin/Modules/Ensembl/FTPHandling/VersionChecker.py b/bin/Modules/Ensembl/FTPHandling/VersionChecker.py index 9c3500d..d4c1ca6 100644 --- a/bin/Modules/Ensembl/FTPHandling/VersionChecker.py +++ b/bin/Modules/Ensembl/FTPHandling/VersionChecker.py @@ -9,12 +9,12 @@ class EnsemblRegulationFTPRetriever: And downloading newest version if necessary """ - def __init__(self, organism, wd): + def __init__(self, organism, wd, data_dir): self.site_ftp = FTPHandler("ftp.ensembl.org", "pub") self.remoteversion = self.get_current_ftp_version() - self.localversion = self.get_current_local_version(wd) - if self.check_version_difference(organism, wd): - self.download_currentversion_version(self.remoteversion, organism, wd) + self.localversion = self.get_current_local_version(wd, data_dir) + if self.check_version_difference(organism, wd, data_dir): + self.download_currentversion_version(self.remoteversion, organism, wd, data_dir) else: print("Newest Version installed, no update needed.") @@ -31,15 +31,25 @@ def get_current_ftp_version(self): print("Current release is "+c_release) return c_release - def check_organism(self, organism, release, wd): - if organism in next(os.walk(os.path.join(wd+"/EnsemblData/"+release+"/")))[1]: - return False + def check_organism(self, organism, release, wd, data_dir): + if data_dir: + if organism in next(os.walk(os.path.join(data_dir+"/EnsemblData/"+release+"/")))[1]: + return False + else: + print("No Local Version for "+organism+" installed. Installing...") + return True else: - print("No Local Version for "+organism+" installed. 
Installing...") - return True + if organism in next(os.walk(os.path.join(wd+"/data/EnsemblData/"+release+"/")))[1]: + return False + else: + print("No Local Version for "+organism+" installed. Installing...") + return True - def get_current_local_version(self, wd): - directories = next(os.walk(os.path.join(wd+"/EnsemblData/")))[1] + def get_current_local_version(self, wd, data_dir): + if data_dir: + directories = next(os.walk(os.path.join(data_dir + "/EnsemblData/")))[1] + else: + directories = next(os.walk(os.path.join(wd+"/data/EnsemblData/")))[1] for dir in directories: if "release" in dir: localversion = sorted(directories, reverse=True)[0] @@ -51,7 +61,7 @@ def get_current_local_version(self, wd): print("No Version installed !") return None - def check_version_difference(self, organism, wd): + def check_version_difference(self, organism, wd, data_dir): local_version = self.localversion remote_version = self.remoteversion @@ -64,16 +74,18 @@ def check_version_difference(self, organism, wd): print("Outdated Version detected ! 
local: " + local_version + " remote: " + remote_version) return True else: - if self.check_organism(organism, local_version, wd): + if self.check_organism(organism, local_version, wd, data_dir): return True else: return False - def download_currentversion_version(self, version, organism, wd): + def download_currentversion_version(self, version, organism, wd, data_dir): # Download Base File - - targetfolder = os.path.join(wd+"/EnsemblData/", version, organism) + if data_dir: + targetfolder = os.path.join(data_dir + "/EnsemblData/", version, organism) + else: + targetfolder = os.path.join(wd+"/data/EnsemblData/", version, organism) os.makedirs(targetfolder) folder_url = "/pub/"+version+"/regulation/"+organism+"/" self.site_ftp.change_dir(folder_url) diff --git a/bin/Modules/Ensembl/GTFGen.py b/bin/Modules/Ensembl/GTFGen.py index cd70f3b..6a1a63b 100644 --- a/bin/Modules/Ensembl/GTFGen.py +++ b/bin/Modules/Ensembl/GTFGen.py @@ -5,14 +5,16 @@ class GTFGen: - def __init__(self, organism, release, wd): + def __init__(self, organism, release, wd, data_dir): - self.gff_lines = self.get_organism_as_gff(organism, release, wd) + self.gff_lines = self.get_organism_as_gff(organism, release, wd, data_dir) self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"} - def get_organism_as_gff(self, organism, release, wd): - - directory = os.path.join(wd + "/EnsemblData/", release, organism) + def get_organism_as_gff(self, organism, release, wd, data_dir): + if data_dir: + directory = os.path.join(data_dir + "/EnsemblData/", release, organism) + else: + directory = os.path.join(wd + "/data/EnsemblData/", release, organism) inputfile = "" for file in os.listdir(directory): if file.endswith("gff.gz"): diff --git a/bin/Modules/ucsc/ucsc.py b/bin/Modules/ucsc/ucsc.py index 49e44ad..c8ad466 100644 --- a/bin/Modules/ucsc/ucsc.py +++ b/bin/Modules/ucsc/ucsc.py @@ -7,10 +7,13 @@ class UcscGtf: - def __init__(self, org, wd): + def __init__(self, org, wd, data_dir): 
self.organism_id = self.get_organism_id(org) self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb" - self.output = os.path.join(wd + "/UCSCData/"+self.organism_id+".bed") + if data_dir: + self.output = os.path.join(data_dir + "/UCSCData/" + self.organism_id+".bed") + else: + self.output = os.path.join(wd + "/data/UCSCData/" + self.organism_id+".bed") self.path_to_bin = os.path.join(wd + "/Modules/ucsc/bigBedToBed") print("Getting UCSC Data") print("Path to Bin: " + self.path_to_bin) diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py old mode 100644 new mode 100755 index e4657be..6f0e11c --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -13,14 +13,14 @@ def check_for_local_folder(wd): - if not os.path.isdir(os.path.join(wd+"/EnsemblData")): - os.mkdir(os.path.join(wd+"/EnsemblData")) + if not os.path.isdir(os.path.join(wd+"/data/EnsemblData")): + os.mkdir(os.path.join(wd+"/data/EnsemblData")) - if not os.path.isdir(os.path.join(wd+"/UCSCData" )): - os.mkdir(os.path.join(wd+"/UCSCData" )) + if not os.path.isdir(os.path.join(wd+"/data/UCSCData")): + os.mkdir(os.path.join(wd+"/data/UCSCData")) def check_filter(tissue_cmd, org, wd): - path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json" ) + path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json") tissues_config = [] if not tissue_cmd: return False @@ -36,9 +36,9 @@ def check_filter(tissue_cmd, org, wd): return False -def main_script(org, wd, tissuetype=None): - - check_for_local_folder(wd) +def main_script(org, wd, data_dir, tissuetype=None): + if not data_dir: + check_for_local_folder(wd) if check_filter(tissuetype, org, wd): tissues = tissuetype print("Filter detected !") @@ -46,8 +46,8 @@ def main_script(org, wd, tissuetype=None): tissues = None print("Filter not detected !") - ucsc = UcscGtf(org, wd) - ense = Ensembl(org, wd) + ucsc = UcscGtf(org, wd, data_dir) + ense = Ensembl(org, wd, data_dir) print("Getting Unique Results") unique_filter = UniqueFilter(ense.get_gtf(), 
ucsc.get_gtf(), tissues) ResultSaver(unique_filter.get_results(), org, tissues) @@ -57,12 +57,12 @@ def main_script(org, wd, tissuetype=None): parser = argparse.ArgumentParser(description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build' ) parser.add_argument('organism', help='Source organism [ homo_sapiens or mus_musculus ]', action='store', nargs='?', type=str) parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str) - parser.add_argument('--wd', help='Working directory. default: "."', action='store', default='.', type=str) + parser.add_argument('--wd', help='Working directory. default: "."', action='store', default=os.getcwd(), type=str) + parser.add_argument('--dir', help='Data directory. default: "working_directory"', action='store', default="", type=str) args = vars(parser.parse_args()) - print("Working Dir: " + args["wd"]) if args["organism"]: print("Working Dir: " + args["wd"]) - main_script(args["organism"], args["wd"], args["tissue"]) + main_script(args["organism"], args["wd"], args["dir"], args["tissue"]) else: print("No Arguments found -> See python3 ./RegGTFExtractor.py -h for help.") From 0a0e9b8cf35c27ab95b7693e247a60d10a772a36 Mon Sep 17 00:00:00 2001 From: anastasiia Date: Mon, 10 Dec 2018 13:56:53 +0100 Subject: [PATCH 2/7] correcting some errors --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 02f4ec4..5c93985 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ export PATH=[meme-suite instalation path]/bin:$PATH Download all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018). The Nextflow-script needs a conda enviroment to run. Nextflow can create the needed enviroment from the given yaml-file. 
-On some systems Nrxtflow exits the run with following error: +On some systems Nextflow exits the run with following error: ``` Caused by: Failed to create Conda environment @@ -81,7 +81,7 @@ Optional arguments: --interation INT Number of iterations done by glam2. More Interations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. (Default: 0.01) - Moitf clustering: + Motif clustering: --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) From c1cb8ca922c1b5a439745cd22da17765e4ba8879 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Mon, 10 Dec 2018 14:38:02 +0100 Subject: [PATCH 3/7] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 5c93985..2e16064 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,7 @@ When the enviroment is created, set the variable 'path_env' in the configuration nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --jaspar_db [MEME-file] ``` ## Parameters +For a detailed overview of all Parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration). ``` Required arguments: --input Path to BigWig-file From 42d32387b14945a46b53e234cac0dff80d63d825 Mon Sep 17 00:00:00 2001 From: anastasiia Date: Mon, 10 Dec 2018 14:43:14 +0100 Subject: [PATCH 4/7] adding comment about the footprints_extraction --- pipeline.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.nf b/pipeline.nf index b1084df..866def8 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -117,7 +117,7 @@ All arguments can be set in the configuration files. 
bigwig_input.combine(bed_input).into {footprint_in} /* - +this process uses the uncontinuous score from a bigWig file to estimate footprints within peaks of interest */ process footprint_extraction { conda "${path_env}" From 198334ab0c44ec2dc65431e2a13e351948785d38 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Mon, 10 Dec 2018 14:45:15 +0100 Subject: [PATCH 5/7] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2e16064..f53d9df 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ When the enviroment is created, set the variable 'path_env' in the configuration nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --jaspar_db [MEME-file] ``` ## Parameters -For a detailed overview of all Parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration). +For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration). ``` Required arguments: --input Path to BigWig-file From 5a7a52683b15b85946779d8e04abc792866fa586 Mon Sep 17 00:00:00 2001 From: anastasiia Date: Mon, 10 Dec 2018 14:52:53 +0100 Subject: [PATCH 6/7] improvement of parameters for my part --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f53d9df..09b7ee3 100644 --- a/README.md +++ b/README.md @@ -45,8 +45,8 @@ nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [ For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration). 
``` Required arguments: - --input Path to BigWig-file - --bed Path to BED-file + --input Path to BigWig-file with scores on the peaks of interest + --bed Path to BED-file with peaks of interest corresponding to the BigWig file --genome_fasta Path to genome in FASTA-format --jaspar_db Path to motif-database in MEME-format From ffed012ffbdfbbccf27464576a77ec4eb083c118 Mon Sep 17 00:00:00 2001 From: anastasiia Date: Mon, 10 Dec 2018 14:54:19 +0100 Subject: [PATCH 7/7] improve parameters for my part once more --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 09b7ee3..66b2b1f 100644 --- a/README.md +++ b/README.md @@ -53,9 +53,9 @@ Required arguments: Optional arguments: Footprint extraction: - --window_length INT (Default: 200) - --step INT (Default: 100) - --percentage INT(Default: 0) + --window_length INT (Default: 200) a length of a window + --step INT (Default: 100) an interval to slide the window + --percentage INT (Default: 0) a percentage to be added to background while searching for footprints Filter unknown motifs: --min_size_fp INT (Default: 10)