Skip to content

Commit

Permalink
Merge branch 'dev_gtfdev' into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
basti committed Dec 4, 2018
2 parents afe9eb2 + b25f97c commit f8a339e
Show file tree
Hide file tree
Showing 9 changed files with 219 additions and 60 deletions.
168 changes: 168 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/modules.xml
# .idea/*.iml
# .idea/modules

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/

18 changes: 9 additions & 9 deletions bin/Modules/Ensembl/ActivityCategorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,21 @@

class ActivityCategorizer:

def __init__(self, release, organism):
def __init__(self, release, organism, wd):

# List of all Folders with Activity Tables

self.folderlist = []

# Dictionary from celltypes_<organism>.json with key = category and value = [folders]

self.c_dict = self.read_config(organism)
self.c_dict = self.read_config(organism, wd)

# Activity table from all files as dict

self.activity = {}

self.get_activity_data(release, organism)
self.get_activity_data(release, organism, wd)

# Categorized Activity from Json-config
print("Categorization: This may take a while")
Expand All @@ -29,23 +29,23 @@ def __init__(self, release, organism):
def get_categorization(self):
return self.categorization

def read_config(self, organism, wd):
    """Load the cell-type configuration for one organism.

    Reads ``<wd>/config/celltypes_<organism>.json``. The file is expected to
    hold a JSON list of objects, each with a ``"type"`` entry and an
    ``"alias_ensembl"`` list.

    Side effect: every ``"alias_ensembl"`` list is appended to
    ``self.folderlist``.

    :param organism: organism name used in the config file name.
    :param wd: working directory that contains the ``config`` folder.
    :return: dict mapping each ``"type"`` to its ``"alias_ensembl"`` list.
    :raises OSError: if the config file cannot be opened.
    """
    # No component after ``wd`` may start with a path separator:
    # os.path.join discards everything before an absolute component, which
    # would silently drop ``wd`` and look for the file at the filesystem root.
    path_to_config = os.path.join(wd, "config", "celltypes_" + organism + ".json")
    with open(path_to_config) as input_file:
        data = json.load(input_file)

    c_dict = {}
    for entry in data:
        c_dict[entry["type"]] = entry["alias_ensembl"]
        self.folderlist.extend(entry["alias_ensembl"])

    return c_dict

def get_activity_data(self, release, organism, wd):
    """Read the binary activity table of every folder in ``self.folderlist``.

    For each folder, the file
    ``<wd>/EnsemblData/<release>/<organism>/activity/<folder>/table.bin`` is
    read in binary mode and stored as a ``bytearray`` in
    ``self.activity[folder]``.

    :param release: release directory name below ``EnsemblData``.
    :param organism: organism directory name below the release.
    :param wd: working directory that contains ``EnsemblData``.
    :raises OSError: if a table file cannot be opened.
    """
    for folder in self.folderlist:
        # Components are kept relative on purpose: a leading "/" would make
        # os.path.join throw away ``wd`` and resolve from the filesystem root.
        table_path = os.path.join(
            wd, "EnsemblData", release, organism, "activity", folder, "table.bin"
        )
        with open(table_path, "rb") as tables:
            self.activity[folder] = bytearray(tables.read())

Expand Down
4 changes: 2 additions & 2 deletions bin/Modules/Ensembl/ActivityTable.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ class ActivityTable:
4, "activity=NA"
"""

def __init__(self, organism, current_release):
self.link = os.path.join("./EnsemblData/", current_release, organism, "activity")
def __init__(self, organism, current_release, wd):
self.link = os.path.join(wd, "/EnsemblData/", current_release, organism, "activity")
self.folders = next(os.walk(self.link))[1]
self.generator = ATGenerator(["activity=ACTIVE",
"activity=POISED",
Expand Down
18 changes: 9 additions & 9 deletions bin/Modules/Ensembl/Ensembl.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
from Modules.Ensembl.ActivityTable import ActivityTable
from Modules.Ensembl.FTPHandling.VersionChecker import EnsemblRegulationFTPRetriever as FTPRetriever
from Modules.Ensembl.ActivityCategorizer import ActivityCategorizer
from Modules.Ensembl.GTFGen import GTFGen
from bin.Modules.Ensembl.ActivityTable import ActivityTable
from bin.Modules.Ensembl.FTPHandling.VersionChecker import EnsemblRegulationFTPRetriever as FTPRetriever
from bin.Modules.Ensembl.ActivityCategorizer import ActivityCategorizer
from bin.Modules.Ensembl.GTFGen import GTFGen


class Ensembl:

def __init__(self, organism):
def __init__(self, organism, wd):
print("Starting Ensembl")
self.updater = FTPRetriever(organism)
self.updater = FTPRetriever(organism, wd)
self.release = self.updater.get_release()
self.acttable = ActivityTable(organism, self.release)
self.acttable = ActivityTable(organism, self.release, wd)
self.acttable.check_and_generate_activity_table()
self.categorizer = ActivityCategorizer(self.release, organism)
self.categorizer = ActivityCategorizer(self.release, organism, wd)
print("Generating GTF")
self.gtf_generator = GTFGen(organism, self.release)
self.gtf_generator = GTFGen(organism, self.release, wd)

print("Ensembl Finished !")

Expand Down
2 changes: 1 addition & 1 deletion bin/Modules/Ensembl/FTPHandling/URLRetrieve.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import ftplib
from Modules.Ensembl.FTPHandling.FTPEntry import FTPEntry
from bin.Modules.Ensembl.FTPHandling.FTPEntry import FTPEntry


class FTPHandler:
Expand Down
26 changes: 13 additions & 13 deletions bin/Modules/Ensembl/FTPHandling/VersionChecker.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from Modules.Ensembl.FTPHandling.URLRetrieve import FTPHandler
from bin.Modules.Ensembl.FTPHandling.URLRetrieve import FTPHandler
import os.path


Expand All @@ -9,12 +9,12 @@ class EnsemblRegulationFTPRetriever:
And downloading newest version if necessary
"""

def __init__(self, organism):
def __init__(self, organism, wd):
self.site_ftp = FTPHandler("ftp.ensembl.org", "pub")
self.remoteversion = self.get_current_ftp_version()
self.localversion = self.get_current_local_version()
if self.check_version_difference(organism):
self.download_currentversion_version(self.remoteversion, organism)
self.localversion = self.get_current_local_version(wd)
if self.check_version_difference(organism, wd):
self.download_currentversion_version(self.remoteversion, organism, wd)
else:
print("Newest Version installed, no update needed.")

Expand All @@ -31,15 +31,15 @@ def get_current_ftp_version(self):
print("Current release is "+c_release)
return c_release

def check_organism(self, organism, release, wd):
    """Tell whether data for ``organism`` still has to be installed.

    Looks at the immediate sub-directories of ``<wd>/EnsemblData/<release>``.

    :param organism: organism directory name to look for.
    :param release: release directory name below ``EnsemblData``.
    :param wd: working directory that contains ``EnsemblData``.
    :return: ``False`` if a directory named ``organism`` exists for the
        release, ``True`` (after printing a notice) otherwise, including
        when the release directory itself does not exist.
    """
    # Relative components only: os.path.join drops ``wd`` as soon as a later
    # component starts with a path separator.
    release_dir = os.path.join(wd, "EnsemblData", release)
    # os.walk yields nothing for a missing directory; the default keeps
    # next() from raising StopIteration and reports "nothing installed".
    installed = next(os.walk(release_dir), (release_dir, [], []))[1]
    if organism in installed:
        return False
    print("No Local Version for "+organism+" installed. Installing...")
    return True

def get_current_local_version(self):
directories = next(os.walk("./EnsemblData/"))[1]
def get_current_local_version(self, wd):
directories = next(os.walk(os.path.join(wd, "/EnsemblData/")))[1]
for dir in directories:
if "release" in dir:
localversion = sorted(directories, reverse=True)[0]
Expand All @@ -51,7 +51,7 @@ def get_current_local_version(self):
print("No Version installed !")
return None

def check_version_difference(self, organism):
def check_version_difference(self, organism, wd):

local_version = self.localversion
remote_version = self.remoteversion
Expand All @@ -64,16 +64,16 @@ def check_version_difference(self, organism):
print("Outdated Version detected ! local: " + local_version + " remote: " + remote_version)
return True
else:
if self.check_organism(organism, local_version):
if self.check_organism(organism, local_version, wd):
return True
else:
return False

def download_currentversion_version(self, version, organism):
def download_currentversion_version(self, version, organism, wd):

# Download Base File

targetfolder = os.path.join("./EnsemblData/", version, organism)
targetfolder = os.path.join(wd, "/EnsemblData/", version, organism)
os.makedirs(targetfolder)
folder_url = "/pub/"+version+"/regulation/"+organism+"/"
self.site_ftp.change_dir(folder_url)
Expand Down
8 changes: 4 additions & 4 deletions bin/Modules/Ensembl/GTFGen.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@

class GTFGen:

def __init__(self, organism, release):
def __init__(self, organism, release, wd):

self.gff_lines = self.get_organism_as_gff(organism, release)
self.gff_lines = self.get_organism_as_gff(organism, release, wd)
self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"}

def get_organism_as_gff(self, organism, release):
def get_organism_as_gff(self, organism, release, wd):

directory = os.path.join("./EnsemblData/", release, organism)
directory = os.path.join(wd, "/EnsemblData/", release, organism)
inputfile = ""
for file in os.listdir(directory):
if file.endswith("gff.gz"):
Expand Down
Loading

0 comments on commit f8a339e

Please sign in to comment.