merge gtf to motif

loosolab · Dec 4, 2018 · cef1ec8 · cef1ec8
2 parents ee6e0d7 + d8655fd
commit cef1ec8
Show file tree

Hide file tree

Showing 22 changed files with 1,171 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,171 @@
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+/.idea/
+/EnsemblData/release-94/
+/UCSCData/hg38.bed
+/UCSCData/mm10.bed
+/results/homo_sapiens_filtered.gtf
diff --git a/Modules/Ensembl/ActivityCategorizer.py b/Modules/Ensembl/ActivityCategorizer.py
@@ -0,0 +1,103 @@
+import json
+import os
+
+
+class ActivityCategorizer:
+
+    """
+    Merge the activity tables of single cell types into one table per category.
+
+    The categories and the folders (aliases) that belong to each of them are
+    read from ./config/celltypes_<organism>.json. For every alias the file
+    ./EnsemblData/<release>/<organism>/activity/<alias>/table.bin is loaded
+    as a bytearray with one activity code per byte. When a category has
+    several aliases, the lowest code at a position wins; code 4 (NA) is used
+    for categories without any alias.
+    """
+
+    # Activity code written for categories that have no alias (4 = NA).
+    NA = 4
+
+    def __init__(self, release, organism):
+        """
+        Read the config, load all activity tables and categorize them.
+
+        :param release: name of the release folder below ./EnsemblData
+        :param organism: organism name used in the config file name and
+                         in the data path
+        """
+
+        # List of all folders with activity tables (filled by read_config)
+
+        self.folderlist = []
+
+        # Dictionary from celltypes_<organism>.json with
+        # key = category and value = [folders]
+
+        self.c_dict = self.read_config(organism)
+
+        # Activity tables of all folders as dict {folder: bytearray}
+
+        self.activity = {}
+
+        self.get_activity_data(release, organism)
+
+        # Categorized activity according to the JSON config
+        print("Categorization: This may take a while")
+        self.categorization = self.generate_categorized_activity()
+
+        print("Categorization finished !")
+
+    def get_categorization(self):
+        """Return the dict {category: bytearray} built in __init__."""
+        return self.categorization
+
+    def read_config(self, organism):
+        """
+        Read ./config/celltypes_<organism>.json.
+
+        Side effect: the "alias_ensembl" list of every entry is appended to
+        self.folderlist, so self.folderlist has to exist before the call.
+
+        :param organism: organism name used in the config file name
+        :return: dict mapping each entry's "type" to its "alias_ensembl" list
+        """
+
+        c_dict = {}
+        path_to_config = "./config/celltypes_" + organism + ".json"
+        with open(path_to_config, encoding="utf-8") as config_file:
+            data = json.load(config_file)
+        for entry in data:
+            c_dict[entry["type"]] = entry["alias_ensembl"]
+            self.folderlist.extend(entry["alias_ensembl"])
+
+        return c_dict
+
+    def get_activity_data(self, release, organism):
+        """
+        Load table.bin of every folder in self.folderlist into self.activity.
+
+        :param release: name of the release folder below ./EnsemblData
+        :param organism: organism folder below the release folder
+        """
+
+        for folder in self.folderlist:
+            # An alias may be listed for several categories; read it only once
+            if folder in self.activity:
+                continue
+            # Generate path to binary file
+            file = os.path.join("./EnsemblData", release, organism, "activity", folder, "table.bin")
+            with open(file, "rb") as tables:
+                self.activity[folder] = bytearray(tables.read())
+
+    def generate_categorized_activity(self):
+        """
+        Build one activity table per category of self.c_dict.
+
+        :return: dict {category: bytearray}. A category with exactly one
+                 alias gets the very bytearray stored in self.activity
+                 (no copy), a category with several aliases gets the merged
+                 table and a category without alias gets a table of NA codes.
+        """
+        category_activity = {}
+
+        for category, aliases in self.c_dict.items():
+
+            if not aliases:
+                # No alias: all bytes are set to 4 = NA
+                category_activity[category] = self._na_table()
+            elif len(aliases) == 1:
+                # Exactly one alias: its table is used as it is
+                category_activity[category] = self.activity[aliases[0]]
+            else:
+                # Multiple aliases: merge their tables
+                category_activity[category] = self.activity_comparator(aliases)
+
+        return category_activity
+
+    def _na_table(self):
+        """Return a new table of NA codes, as long as the first folder's table."""
+        if not self.folderlist:
+            # Nothing configured at all, so there is no length to copy
+            return bytearray()
+        return bytearray([self.NA]) * len(self.activity[self.folderlist[0]])
+
+    def activity_comparator(self, aliaslist):
+        """
+        Merge the tables of several aliases position by position.
+
+        At every position the lowest code found in any of the tables is
+        taken, i.e. 0 beats 1 beats 2 beats 3 beats 4. The result has the
+        length of the first alias' table; surplus bytes of longer tables
+        are ignored. A position is never dropped, so the indices of the
+        result always line up with the indices of the input tables.
+
+        :param aliaslist: non-empty list of keys of self.activity
+        :return: merged table as bytearray
+        :raises IndexError: if aliaslist is empty or a table is shorter
+                            than the table of the first alias
+        """
+
+        input_arrays = [self.activity[alias] for alias in aliaslist]
+        length = len(input_arrays[0])
+
+        for alias, table in zip(aliaslist, input_arrays):
+            if len(table) < length:
+                raise IndexError("Activity table of " + str(alias) + " has " + str(len(table))
+                                 + " entries, expected at least " + str(length))
+
+        # zip stops after `length` positions because the first table is the shortest
+        return bytearray(map(min, zip(*input_arrays)))
+
+
+# Debugging
+
+# e = ActivityCategorizer("release-94", "homo_sapiens")
+# print(len(e.categorization))
+# for x in e.categorization.values():
+#     print(len(x))
diff --git a/Modules/Ensembl/ActivityTable.py b/Modules/Ensembl/ActivityTable.py
@@ -0,0 +1,48 @@
+import os.path
+from Modules.Ensembl.ActivityTableGenerator import ATGenerator
+
+
+class ActivityTable:
+
+    """
+    Class for checking activity tables and generating missing ones.
+    ActivityTable = byte representation of the activity status,
+    corresponding to the default generator schema:
+    0, "activity=ACTIVE",
+    1, "activity=POISED",
+    2, "activity=REPRESSED",
+    3, "activity=INACTIVE",
+    4, "activity=NA"
+    """
+
+    def __init__(self, organism, current_release):
+        """
+        Collect the subfolders of ./EnsemblData/<current_release>/<organism>/activity.
+
+        :param organism: organism folder below the release folder
+        :param current_release: name of the release folder below ./EnsemblData
+        :raises FileNotFoundError: if the activity folder does not exist
+        """
+        self.link = os.path.join("./EnsemblData/", current_release, organism, "activity")
+        self.folders = self._list_subfolders(self.link)
+        self.generator = ATGenerator(["activity=ACTIVE",
+                                      "activity=POISED",
+                                      "activity=REPRESSED",
+                                      "activity=INACTIVE",
+                                      "activity=NA"])
+
+    @staticmethod
+    def _list_subfolders(link):
+        """
+        Return the names of the direct subfolders of link in sorted order.
+
+        :param link: path of the folder to inspect
+        :raises FileNotFoundError: if link does not exist
+        """
+        return sorted(name for name in os.listdir(link)
+                      if os.path.isdir(os.path.join(link, name)))
+
+    def check_and_generate_activity_table(self):
+        """Generate table.bin for every subfolder that does not have one yet."""
+        for subfolder in self.folders:
+            folder_link = os.path.join(self.link, subfolder)
+            sf_link = os.path.join(folder_link, "table.bin")
+            if not os.path.isfile(sf_link):
+                print("No ActivityTable for "+subfolder+" found, generating new one.")
+                self.generate_table(folder_link)
+        print("All ActivityTables found, proceeding")
+
+    def generate_table(self, link):
+        """
+        Write a table.bin next to every *.gff.gz file found below link.
+
+        The content of table.bin is whatever self.generator.read_table
+        returns for the *.gff.gz file. If a folder holds several *.gff.gz
+        files, table.bin is overwritten for each of them. A warning is
+        printed when no *.gff.gz file exists at all.
+
+        :param link: folder that is searched recursively
+        """
+        generated = False
+        for root, _dirs, files in os.walk(link):
+            for filename in files:
+                if filename.endswith(".gff.gz"):
+                    originpath = os.path.join(root, filename)
+                    file_path = os.path.join(root, "table.bin")
+                    with open(file_path, "wb") as f:
+                        f.write(self.generator.read_table(originpath))
+                    generated = True
+                    print("New ActivityTable generated in: " + root)
+        if not generated:
+            print("Warning: no .gff.gz file found in " + link + ", no ActivityTable generated.")
+# Debug
+
+#e = ActivityTable("homo_sapiens", "release-94")
+#e.check_and_generate_activity_table()
diff --git a/Modules/Ensembl/ActivityTableGenerator.py b/Modules/Ensembl/ActivityTableGenerator.py
@@ -0,0 +1,21 @@
+import gzip
+
+
+class ATGenerator:
+
+    """
+    Translate the lines of a gzip-compressed file into a table of bytes.
+    Every line that contains one of the given representations adds one
+    byte to the table: the index of the first representation found in it.
+    """
+
+    def __init__(self, repre):
+        """
+        :param repre: list of strings to search for; the position of a
+                      string in this list is the byte written for a match
+        """
+
+        self.representation = repre
+
+    def read_table(self, file):
+        """
+        Build the table for one gzip-compressed file.
+
+        Lines that contain none of the representations add nothing to
+        the table.
+
+        :param file: path of the gzip-compressed file
+        :return: bytearray with one entry per matching line
+        :raises ValueError: if a match has an index above 255
+        """
+        # Encode once so the raw bytes of each line can be searched directly;
+        # searching str(line) would search the repr (b'...') with its escapes.
+        patterns = [pattern.encode("utf-8") if isinstance(pattern, str) else bytes(pattern)
+                    for pattern in self.representation]
+        activity_table = bytearray()
+        with gzip.open(file, 'rb') as f:
+            for line in f:
+                for index, pattern in enumerate(patterns):
+                    if pattern in line:
+                        activity_table.append(index)
+                        break
+        return activity_table
+
+
+
diff --git a/Modules/Ensembl/Ensembl.py b/Modules/Ensembl/Ensembl.py
@@ -0,0 +1,25 @@
+from Modules.Ensembl.ActivityTable import ActivityTable
+from Modules.Ensembl.FTPHandling.VersionChecker import EnsemblRegulationFTPRetriever as FTPRetriever
+from Modules.Ensembl.ActivityCategorizer import ActivityCategorizer
+from Modules.Ensembl.GTFGen import GTFGen
+
+
+class Ensembl:
+
+    """
+    Entry point of the Ensembl module.
+    Wires together the release lookup, the activity tables, the
+    categorization and the GTF generator for one organism.
+    """
+
+    def __init__(self, organism):
+        """
+        Run all preparation steps in order; each step needs the result
+        of the previous one.
+
+        :param organism: organism name handed on to all components
+        """
+        print("Starting Ensembl")
+
+        # Ask the retriever for the release to work with
+        retriever = FTPRetriever(organism)
+        self.updater = retriever
+        self.release = retriever.get_release()
+
+        # Make sure every activity folder has its table.bin
+        table = ActivityTable(organism, self.release)
+        self.acttable = table
+        table.check_and_generate_activity_table()
+
+        # Categorize the activity tables
+        self.categorizer = ActivityCategorizer(self.release, organism)
+
+        print("Generating GTF")
+        self.gtf_generator = GTFGen(organism, self.release)
+
+        print("Ensembl Finished !")
+
+    def get_gtf(self):
+        """Return the result of the GTF generator for the current release and categorization."""
+        categorization = self.categorizer.get_categorization()
+        return self.gtf_generator.get_gtf(self.release, categorization)
+
+#e = Ensembl("homo_sapiens")
+#print(len(e.categorizer.categorization))