Skip to content

Commit

Permalink
merge gtf to motif
Browse files Browse the repository at this point in the history
  • Loading branch information
renewiegandt committed Dec 4, 2018
2 parents ee6e0d7 + d8655fd commit cef1ec8
Show file tree
Hide file tree
Showing 22 changed files with 1,171 additions and 0 deletions.
171 changes: 171 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/modules.xml
# .idea/*.iml
# .idea/modules

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
/.idea/
/EnsemblData/release-94/
/UCSCData/hg38.bed
/UCSCData/mm10.bed
/results/homo_sapiens_filtered.gtf
103 changes: 103 additions & 0 deletions Modules/Ensembl/ActivityCategorizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import json
import os


class ActivityCategorizer:
    """Combine per-folder activity tables into one table per category.

    The cell-type config (``./config/celltypes_<organism>.json``) is a list of
    entries with a ``"type"`` (the category) and an ``"alias_ensembl"`` list
    (the activity folders belonging to that category).  For every listed
    folder the file ``table.bin`` is loaded as a bytearray with one status
    byte per position, and the tables of a category are merged position by
    position.

    Attributes:
        folderlist: every folder name listed under ``alias_ensembl``, in
            config order (a folder listed twice appears twice).
        c_dict: category -> list of folder names.
        activity: folder name -> bytearray read from its ``table.bin``.
        categorization: category -> merged bytearray.
    """

    # Status byte used for categories that have no alias at all (4 = NA).
    NA_STATUS = 4

    def __init__(self, release, organism):
        """Load the config and all activity tables, then build the categorization.

        Args:
            release: name of the release folder below ``./EnsemblData``.
            organism: organism name, used for the config file name and the
                data folder.
        """
        # All folders with activity tables.  Must exist before read_config()
        # is called, because read_config() appends to it.
        self.folderlist = []

        # From celltypes_<organism>.json: key = category, value = [folders].
        self.c_dict = self.read_config(organism)

        # Activity tables of all folders, keyed by folder name.
        self.activity = {}
        self.get_activity_data(release, organism)

        # Activity per category, as defined by the JSON config.
        print("Categorization: This may take a while")
        self.categorization = self.generate_categorized_activity()
        print("Categorization finished !")

    def get_categorization(self):
        """Return the mapping category -> merged activity bytearray."""
        return self.categorization

    def read_config(self, organism):
        """Read the cell-type config and return category -> folder names.

        Side effect: every folder name found is appended to
        ``self.folderlist``.

        Raises:
            OSError: if the config file cannot be opened.
            KeyError: if an entry lacks ``"type"`` or ``"alias_ensembl"``.
        """
        path_to_config = os.path.join(".", "config", "celltypes_" + organism + ".json")
        with open(path_to_config, encoding="utf-8") as config_file:
            data = json.load(config_file)

        c_dict = {}
        for entry in data:
            c_dict[entry["type"]] = entry["alias_ensembl"]
            self.folderlist.extend(entry["alias_ensembl"])
        return c_dict

    def get_activity_data(self, release, organism):
        """Load ``table.bin`` of every listed folder into ``self.activity``.

        Raises:
            OSError: if a ``table.bin`` file cannot be opened.
        """
        # dict.fromkeys() drops repeated folder names, so a folder that is
        # listed under several categories is read from disk only once.
        for folder in dict.fromkeys(self.folderlist):
            table_path = os.path.join("./EnsemblData", release, organism,
                                      "activity", folder, "table.bin")
            with open(table_path, "rb") as table_file:
                self.activity[folder] = bytearray(table_file.read())

    def generate_categorized_activity(self):
        """Build the mapping category -> activity bytearray.

        * no alias:        a table filled with ``NA_STATUS``
        * exactly one:     that folder's table itself (same object, no copy)
        * several aliases: the tables merged by ``activity_comparator``
        """
        category_activity = {}
        for category, aliases in self.c_dict.items():
            if not aliases:
                category_activity[category] = self._na_table()
            elif len(aliases) == 1:
                category_activity[category] = self.activity[aliases[0]]
            else:
                category_activity[category] = self.activity_comparator(aliases)
        return category_activity

    def _na_table(self):
        """Return a new all-NA table as long as the first listed folder's table.

        If no folder is listed at all there is no table to take a length
        from; an empty table is returned in that case.
        """
        if not self.folderlist:
            return bytearray()
        length = len(self.activity[self.folderlist[0]])
        return bytearray([self.NA_STATUS]) * length

    def activity_comparator(self, aliaslist):
        """Merge the tables of several folders position by position.

        At every position the lowest status byte among the tables wins, i.e.
        0 beats 1 beats 2 beats 3 beats 4.  The result has the length of the
        first table; extra bytes of longer tables are ignored.

        Args:
            aliaslist: non-empty list of folder names present in
                ``self.activity``.

        Raises:
            IndexError: if ``aliaslist`` is empty or one of the tables is
                shorter than the first one.
            KeyError: if a folder name is not in ``self.activity``.
        """
        tables = [self.activity[alias] for alias in aliaslist]
        length = len(tables[0])

        for alias, table in zip(aliaslist, tables):
            if len(table) < length:
                raise IndexError(
                    "activity table of '{}' has {} entries, expected at least {}"
                    .format(alias, len(table), length))

        # No table is shorter than the first one, so zip() stops exactly at
        # the first table's length.  Taking the minimum keeps one output byte
        # per position, which keeps the result aligned with its inputs.
        return bytearray(map(min, zip(*tables)))


# Debugging

# e = ActivityCategorizer("release-94", "homo_sapiens")
# print(len(e.categorization))
# for x in e.categorization.values():
# print(len(x))
48 changes: 48 additions & 0 deletions Modules/Ensembl/ActivityTable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import os.path
from Modules.Ensembl.ActivityTableGenerator import ATGenerator


class ActivityTable:

    """
    Class for checking activity tables and generating missing ones.
    ActivityTable = byte representation of the activity status,
    corresponding to the default generator schema:
    0, "activity=ACTIVE",
    1, "activity=POISED",
    2, "activity=REPRESSED",
    3, "activity=INACTIVE",
    4, "activity=NA"

    Attributes:
        link: path of the "activity" directory of the given release/organism.
        folders: names of the sub-directories found directly below ``link``.
        generator: ATGenerator used to build a table from a ".gff.gz" file.
    """

    def __init__(self, organism, current_release):
        """Locate the activity directory and list its sub-folders.

        Raises:
            FileNotFoundError: if the activity directory does not exist.
        """
        self.link = os.path.join("./EnsemblData/", current_release, organism, "activity")
        self.folders = self._list_subfolders(self.link)
        self.generator = ATGenerator(["activity=ACTIVE",
                                      "activity=POISED",
                                      "activity=REPRESSED",
                                      "activity=INACTIVE",
                                      "activity=NA"])

    @staticmethod
    def _list_subfolders(path):
        """Return the sorted names of the directories directly inside *path*."""
        # os.listdir() raises a descriptive OSError for a missing directory;
        # next(os.walk(path)) would only leak a bare StopIteration instead.
        return sorted(name for name in os.listdir(path)
                      if os.path.isdir(os.path.join(path, name)))

    def check_and_generate_activity_table(self):
        """Generate a table for every sub-folder that has no "table.bin" yet."""
        for subfolder in self.folders:
            folder_link = os.path.join(self.link, subfolder)
            sf_link = os.path.join(folder_link, "table.bin")
            if not os.path.isfile(sf_link):
                print("No ActivityTable for "+subfolder+" found, generating new one.")
                self.generate_table(folder_link)
        print("All ActivityTables found, proceeding")

    def generate_table(self, link):
        """Write "table.bin" next to every ".gff.gz" file found below *link*.

        The table is built completely before "table.bin" is touched and is
        then moved into place, so a failed or interrupted run cannot leave an
        empty or truncated "table.bin" that a later run would accept as valid.
        """
        for root, _dirs, files in os.walk(link):
            for file_name in files:
                if not file_name.endswith(".gff.gz"):
                    continue
                originpath = os.path.join(root, file_name)
                file_path = os.path.join(root, "table.bin")

                # May raise (e.g. on a damaged archive) - nothing is written then.
                table = self.generator.read_table(originpath)

                partial_path = file_path + ".tmp"
                with open(partial_path, "wb") as f:
                    f.write(table)
                os.replace(partial_path, file_path)
                print("New ActivityTable generated in: " + root)
# Debug

#e = ActivityTable("homo_sapiens", "release-94")
#e.check_and_generate_activity_table()
21 changes: 21 additions & 0 deletions Modules/Ensembl/ActivityTableGenerator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import gzip


class ATGenerator:
    """Translate the lines of a gzip-compressed file into a table of status bytes.

    ``representation`` is an ordered list of marker strings; the index of a
    marker is the byte stored for a line that contains it.
    """

    def __init__(self, repre):
        """Store the ordered list of marker strings."""
        self.representation = repre

    def read_table(self, file):
        """Return a bytearray with one byte per line that contains a marker.

        For every line of the gzip file the markers are tried in order and the
        index of the first marker found in the line is appended.  Lines that
        contain none of the markers add nothing to the table.

        Args:
            file: path of the gzip-compressed input file.

        Raises:
            OSError: if the file cannot be opened or is not valid gzip data.
            ValueError: if a matching marker has an index above 255.
        """
        # Compare bytes with bytes.  Matching against str(line) would search
        # the repr of the line ("b'...'", with tabs and non-ASCII escaped), so
        # markers containing such characters could never be found.
        markers = [
            marker.encode("utf-8") if isinstance(marker, str) else bytes(marker)
            for marker in self.representation
        ]

        activity_table = bytearray()
        with gzip.open(file, 'rb') as f:
            for line in f:
                for index, marker in enumerate(markers):
                    if marker in line:
                        activity_table.append(index)
                        break
        return activity_table



25 changes: 25 additions & 0 deletions Modules/Ensembl/Ensembl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from Modules.Ensembl.ActivityTable import ActivityTable
from Modules.Ensembl.FTPHandling.VersionChecker import EnsemblRegulationFTPRetriever as FTPRetriever
from Modules.Ensembl.ActivityCategorizer import ActivityCategorizer
from Modules.Ensembl.GTFGen import GTFGen


class Ensembl:
    """Entry point of the Ensembl part: wires the helper classes together.

    Constructing an instance runs the steps in a fixed order: determine the
    release via the FTP retriever, make sure all activity tables exist, build
    the categorization and create the GTF generator.

    Attributes:
        updater: the FTPRetriever created for the organism.
        release: the value returned by ``updater.get_release()``.
        acttable: the ActivityTable for organism and release.
        categorizer: the ActivityCategorizer for release and organism.
        gtf_generator: the GTFGen for organism and release.
    """

    def __init__(self, organism):
        """Run all preparation steps for *organism* (see class docstring)."""
        print("Starting Ensembl")

        # Step 1: the retriever tells us which release to work with.
        retriever = FTPRetriever(organism)
        self.updater = retriever
        self.release = retriever.get_release()

        # Step 2: create any activity table that is still missing.
        tables = ActivityTable(organism, self.release)
        self.acttable = tables
        tables.check_and_generate_activity_table()

        # Step 3: merge the tables into categories.
        self.categorizer = ActivityCategorizer(self.release, organism)

        # Step 4: set up the GTF generator.
        print("Generating GTF")
        self.gtf_generator = GTFGen(organism, self.release)

        print("Ensembl Finished !")

    def get_gtf(self):
        """Return what the GTF generator produces for the release and categorization."""
        categorization = self.categorizer.get_categorization()
        return self.gtf_generator.get_gtf(self.release, categorization)

#e = Ensembl("homo_sapiens")
#print(len(e.categorizer.categorization))
Loading

0 comments on commit cef1ec8

Please sign in to comment.