diff --git a/.gitignore b/.gitignore deleted file mode 100644 index bac574a..0000000 --- a/.gitignore +++ /dev/null @@ -1,168 +0,0 @@ -# Created by .ignore support plugin (hsz.mobi) -### Python template -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -### JetBrains template -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff -.idea/**/workspace.xml -.idea/**/tasks.xml -.idea/**/usage.statistics.xml -.idea/**/dictionaries -.idea/**/shelf - -# Sensitive or high-churn files -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml -.idea/**/dbnavigator.xml - -# Gradle -.idea/**/gradle.xml -.idea/**/libraries - -# Gradle and Maven with auto-import -# When using Gradle or Maven with auto-import, you should exclude module files, -# since they will be recreated, and may cause churn. Uncomment if using -# auto-import. -# .idea/modules.xml -# .idea/*.iml -# .idea/modules - -# CMake -cmake-build-*/ - -# Mongo Explorer plugin -.idea/**/mongoSettings.xml - -# File-based project format -*.iws - -# IntelliJ -out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Cursive Clojure plugin -.idea/replstate.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties -fabric.properties - -# Editor-based Rest Client -.idea/ - diff --git a/bin/Modules/Ensembl/ActivityCategorizer.py b/bin/Modules/Ensembl/ActivityCategorizer.py index fa204ae..e29ac4b 100644 --- a/bin/Modules/Ensembl/ActivityCategorizer.py +++ b/bin/Modules/Ensembl/ActivityCategorizer.py @@ -4,7 +4,7 @@ class ActivityCategorizer: - def __init__(self, release, organism): + def __init__(self, release, organism, wd): # List of all Folders with Activity Tables @@ -12,13 +12,13 @@ def __init__(self, release, organism): # Dictionary from celltypes_organism.json mit key = Kategorie und Value = [Ordner] - self.c_dict = self.read_config(organism) + self.c_dict = self.read_config(organism, wd) # Activity table from all files as dict self.activity = {} - self.get_activity_data(release, organism) + self.get_activity_data(release, organism, wd) # Categorized Activity from Json-config print("Categorization: This may take a while") @@ -29,10 +29,10 @@ def __init__(self, release, organism): def get_categorization(self): return self.categorization - def read_config(self, organism): + def read_config(self, organism, wd): c_dict = {} - path_to_config = os.path.join("../config/celltypes_"+organism+".json") + path_to_config = os.path.join(wd, "../config/celltypes_"+organism+".json") with open(path_to_config) as input: data = json.loads(input.read()) for x in data: @@ -41,11 +41,11 @@ def read_config(self, organism): return c_dict - def get_activity_data(self, release, organism): + def get_activity_data(self, release, organism, wd): for folder in self.folderlist: # Generate path to binary File - file = os.path.join("./EnsemblData", release, organism, "activity", folder, "table.bin") + file = os.path.join(wd, "/EnsemblData", release, organism, "activity", folder, "table.bin") with open(file, "rb") as tables: self.activity[folder] = bytearray(tables.read()) diff --git a/bin/Modules/Ensembl/ActivityTable.py b/bin/Modules/Ensembl/ActivityTable.py index 042421a..105c351 100644 --- a/bin/Modules/Ensembl/ActivityTable.py +++ b/bin/Modules/Ensembl/ActivityTable.py @@ -15,8 +15,8 @@ class ActivityTable: 4, "activity=NA" """ - def __init__(self, organism, current_release): - self.link = os.path.join("./EnsemblData/", current_release, organism, "activity") + def __init__(self, organism, current_release, wd): + self.link = os.path.join(wd, "/EnsemblData/", current_release, organism, "activity") self.folders = next(os.walk(self.link))[1] self.generator = ATGenerator(["activity=ACTIVE", "activity=POISED", diff --git a/bin/Modules/Ensembl/Ensembl.py b/bin/Modules/Ensembl/Ensembl.py index df9cbc0..d80ce11 100644 --- a/bin/Modules/Ensembl/Ensembl.py +++ b/bin/Modules/Ensembl/Ensembl.py @@ -1,20 +1,20 @@ -from Modules.Ensembl.ActivityTable import ActivityTable -from Modules.Ensembl.FTPHandling.VersionChecker import EnsemblRegulationFTPRetriever as FTPRetriever -from Modules.Ensembl.ActivityCategorizer import ActivityCategorizer -from Modules.Ensembl.GTFGen import GTFGen +from bin.Modules.Ensembl.ActivityTable import ActivityTable +from bin.Modules.Ensembl.FTPHandling.VersionChecker import EnsemblRegulationFTPRetriever as FTPRetriever +from bin.Modules.Ensembl.ActivityCategorizer import ActivityCategorizer +from bin.Modules.Ensembl.GTFGen import GTFGen class Ensembl: - def __init__(self, organism): + def __init__(self, organism, wd): print("Starting Ensembl") - self.updater = FTPRetriever(organism) + self.updater = FTPRetriever(organism, wd) self.release = self.updater.get_release() - self.acttable = ActivityTable(organism, self.release) + self.acttable = ActivityTable(organism, self.release, wd) self.acttable.check_and_generate_activity_table() - self.categorizer = ActivityCategorizer(self.release, organism) + self.categorizer = ActivityCategorizer(self.release, organism, wd) print("Generating GTF") - self.gtf_generator = GTFGen(organism, self.release) + self.gtf_generator = GTFGen(organism, self.release, wd) print("Ensembl Finished !") diff --git a/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py b/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py index 598a6f5..0c164d7 100644 --- a/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py +++ b/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py @@ -1,5 +1,5 @@ import ftplib -from Modules.Ensembl.FTPHandling.FTPEntry import FTPEntry +from bin.Modules.Ensembl.FTPHandling.FTPEntry import FTPEntry class FTPHandler: diff --git a/bin/Modules/Ensembl/FTPHandling/VersionChecker.py b/bin/Modules/Ensembl/FTPHandling/VersionChecker.py index 874e776..6992dde 100644 --- a/bin/Modules/Ensembl/FTPHandling/VersionChecker.py +++ b/bin/Modules/Ensembl/FTPHandling/VersionChecker.py @@ -1,4 +1,4 @@ -from Modules.Ensembl.FTPHandling.URLRetrieve import FTPHandler +from bin.Modules.Ensembl.FTPHandling.URLRetrieve import FTPHandler import os.path @@ -9,12 +9,12 @@ class EnsemblRegulationFTPRetriever: And downloading newest version if necessary """ - def __init__(self, organism): + def __init__(self, organism, wd): self.site_ftp = FTPHandler("ftp.ensembl.org", "pub") self.remoteversion = self.get_current_ftp_version() - self.localversion = self.get_current_local_version() - if self.check_version_difference(organism): - self.download_currentversion_version(self.remoteversion, organism) + self.localversion = self.get_current_local_version(wd) + if self.check_version_difference(organism, wd): + self.download_currentversion_version(self.remoteversion, organism, wd) else: print("Newest Version installed, no update needed.") @@ -31,15 +31,15 @@ def get_current_ftp_version(self): print("Current release is "+c_release) return c_release - def check_organism(self, organism, release): - if organism in next(os.walk("./EnsemblData/"+release+"/"))[1]: + def check_organism(self, organism, release, wd): + if organism in next(os.walk(os.path.join(wd, "/EnsemblData/"+release+"/")))[1]: return False else: print("No Local Version for "+organism+" installed. Installing...") return True - def get_current_local_version(self): - directories = next(os.walk("./EnsemblData/"))[1] + def get_current_local_version(self, wd): + directories = next(os.walk(os.path.join(wd, "/EnsemblData/")))[1] for dir in directories: if "release" in dir: localversion = sorted(directories, reverse=True)[0] @@ -51,7 +51,7 @@ def get_current_local_version(self): print("No Version installed !") return None - def check_version_difference(self, organism): + def check_version_difference(self, organism, wd): local_version = self.localversion remote_version = self.remoteversion @@ -64,16 +64,16 @@ def check_version_difference(self, organism): print("Outdated Version detected ! local: " + local_version + " remote: " + remote_version) return True else: - if self.check_organism(organism, local_version): + if self.check_organism(organism, local_version, wd): return True else: return False - def download_currentversion_version(self, version, organism): + def download_currentversion_version(self, version, organism, wd): # Download Base File - targetfolder = os.path.join("./EnsemblData/", version, organism) + targetfolder = os.path.join(wd, "/EnsemblData/", version, organism) os.makedirs(targetfolder) folder_url = "/pub/"+version+"/regulation/"+organism+"/" self.site_ftp.change_dir(folder_url) diff --git a/bin/Modules/Ensembl/GTFGen.py b/bin/Modules/Ensembl/GTFGen.py index a4de58a..381bef3 100644 --- a/bin/Modules/Ensembl/GTFGen.py +++ b/bin/Modules/Ensembl/GTFGen.py @@ -5,14 +5,14 @@ class GTFGen: - def __init__(self, organism, release): + def __init__(self, organism, release, wd): - self.gff_lines = self.get_organism_as_gff(organism, release) + self.gff_lines = self.get_organism_as_gff(organism, release, wd) self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"} - def get_organism_as_gff(self, organism, release): + def get_organism_as_gff(self, organism, release, wd): - directory = os.path.join("./EnsemblData/", release, organism) + directory = os.path.join(wd, "/EnsemblData/", release, organism) inputfile = "" for file in os.listdir(directory): if file.endswith("gff.gz"): diff --git a/bin/Modules/ucsc/ucsc.py b/bin/Modules/ucsc/ucsc.py index 9d10108..99788b7 100644 --- a/bin/Modules/ucsc/ucsc.py +++ b/bin/Modules/ucsc/ucsc.py @@ -7,18 +7,19 @@ class UcscGtf: - def __init__(self, org): + def __init__(self, org, wd): self.organism_id = self.get_organism_id(org) self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb" - self.output = "./UCSCData/"+self.organism_id+".bed" + self.output = os.path.join(wd,"/UCSCData/"+self.organism_id+".bed") + self.path_to_bin = os.path.join(wd, "/Modules/ucsc/bigBedToBed") print("Getting UCSC Data") self.generate_gff_file() - self.ucsc_categories = self.get_activity_categories(org) + self.ucsc_categories = self.get_activity_categories(org, wd) self.gtf_lines = self.read_gff_to_gtf() print("UCSC finished !") def generate_gff_file(self): - callstring = ["./Modules/ucsc/bigBedToBed", self.link, self.output] + callstring = [self.path_to_bin, self.link, self.output] subprocess.call(callstring) def read_gff_to_gtf(self): @@ -65,8 +66,8 @@ def get_organism_id(org): return "mm10" @staticmethod - def get_activity_categories(organism): - path_to_config = os.path.join("../config/celltypes_" + organism + ".json") + def get_activity_categories(organism, wd): + path_to_config = os.path.join(wd, "/config/celltypes_" + organism + ".json") categories = {} with open(path_to_config) as input_file: data = json.loads(input_file.read()) @@ -76,15 +77,4 @@ def get_activity_categories(organism): return categories def get_gtf(self): - return self.gtf_lines - - def test_save_to_file(self): - - with open("./results/test.gtf", "w") as file: - write_it = csv.writer(file, delimiter='\t') - for line in self.gtf_lines: - write_it.writerow(line) - - -#u = UcscGtf("homo_sapiens") -#u.test_save_to_file() \ No newline at end of file + return self.gtf_lines \ No newline at end of file diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py index 294e539..c3f669b 100644 --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -18,11 +18,11 @@ def check_for_local_folder(): os.mkdir( "./UCSCData" ) -def main_script(org, tissuetype=None): +def main_script(org, wd, tissuetype=None): check_for_local_folder() - ucsc = UcscGtf(org) - ense = Ensembl(org) + ucsc = UcscGtf(org, wd) + ense = Ensembl(org, wd) print("Getting Unique Results") unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissuetype) ResultSaver(unique_filter.get_results(), org, tissuetype) @@ -32,10 +32,11 @@ def main_script(org, tissuetype=None): parser = argparse.ArgumentParser(description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build' ) parser.add_argument('organism', help='Source organism [ homo_sapiens or mus_musculus ]', action='store', nargs='?', type=str) parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str) + parser.add_argument('--wd', help='Working directory. default: "."', default=".", action='store', type=str) args = vars(parser.parse_args()) if args["organism"]: #print(args["tissue"]) - main_script(args["organism"], args["tissue"]) + main_script(args["organism"], args["wd"], args["tissue"]) else: print("No Arguments found -> See ./RegGTFExtractor.py -h for help.")