Skip to content

Gtf creation #40

Merged
merged 21 commits into from
Jan 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
208 changes: 208 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
# Created by .ignore support plugin (hsz.mobi)
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/modules.xml
# .idea/*.iml
# .idea/modules

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests
### R template
# History files
.Rhistory
.Rapp.history

# Session Data files
.RData

# Example code in package build process
*-Ex.R

# Output files from R CMD build
/*.tar.gz

# Output files from R CMD check
/*.Rcheck/

# RStudio files
.Rproj.user/

# produced vignettes
vignettes/*.html
vignettes/*.pdf

# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth

# knitr and R markdown default cache directories
/*_cache/
/cache/

# Temporary files created by R markdown
*.utf8.md
*.knit.md

# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
rsconnect/
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/


# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# Project data directories
bin/3.1_create_gtf/data/
data/
37 changes: 30 additions & 7 deletions bin/3.1_create_gtf/Modules/CrossMapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,26 +10,49 @@ class CrossMapper:
Class to download chain files for cross-mapping hg38 or mm10 to older assembly versions.
Utilizes CrossMap.py. See the wiki for more information.
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de
"""

def __init__(self, org, wd, out, is_dir):
self.org = org

# Constructor for CrossMapper class
# input_parameter: org = input organism
# wd = working directory
# out = path to output-file -> Parameter
# is_dir = boolean if wd is data_dir or just working directory

# Get path to tempfile / outputfile and chain-file

if is_dir:
self.infile = os.path.join( wd + "/temp/" + org + ".gtf")
self.infile = os.path.join(wd + "/temp/" + org + ".gtf")
else:
self.infile = os.path.join(wd+"/data/temp/"+org+".gtf")
self.outfile = os.path.join(out+"/" + org + "_mapped.gtf")
self.outfile = os.path.join(out)
self.chainfile = self.get_chain_file(org, wd, is_dir)
# Execute Crossmapper for gff/gtf files

# Execute Crossmap for gff/gtf files

(mapTree, targetChromSizes, sourceChromSizes) = CrossMap.read_chain_file(self.chainfile)

# Map results and save output to self.outfile

CrossMap.crossmap_gff_file(mapTree, self.infile, self.outfile)

def get_chain_file(self, org, wd, isdir):
def get_chain_file(self, org, wd, is_data_dir):

# Defines the Chain files for different conversions.
# input_parameter: org = organism
# wd = working directory
# is_data_dir = is wd data_dir or not

# return_value: Link to chain-file for conversion.
# Custom chain-files and chain-files for additional organisms can be specified in this section

if org == "hg19":
if isdir:
if is_data_dir:
file_link = os.path.join(wd+"temp/hg38tohg19.over.chain.gz")
else:
file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" )
Expand All @@ -39,7 +62,7 @@ def get_chain_file(self, org, wd, isdir):
return file_link

elif org == "mm9":
if isdir:
if is_data_dir:
file_link = os.path.join(wd+"temp/mm10ToMm9.over.chain.gz")
else:
file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" )
Expand Down
47 changes: 43 additions & 4 deletions bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,27 @@

class ActivityCategorizer:

"""
Class that categorizes activity data based on the JSON config and the binary activity data (table.bin).
SebastianBeyvers marked this conversation as resolved.
Show resolved Hide resolved
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de
"""

def __init__(self, release, organism, wd, data_dir):

# List of all Folders with Activity Tables
# Constructor for ActivityCategorizer
# input_parameter: organism = input organism
# release = current used Ensembl release
# wd = working dir (default working directory, data_dir is used if specified)
# data_dir = data directory (this is used as directory if specified)

# List of all folders with activity Tables

self.folderlist = []

# Dictionary from celltypes_organism.json mit key = Kategorie und Value = [Ordner]
# Dictionary from celltypes_organism.json with key = category and value = [directories]

self.c_dict = self.read_config(organism, wd)

Expand All @@ -27,10 +41,19 @@ def __init__(self, release, organism, wd, data_dir):
print("Categorization finished !")

def get_categorization(self):

# Getter: returns the object stored in self.categorization as-is (no copy is made).

return self.categorization

def read_config(self, organism, wd):

# Method to read the celltypes_organism.json config file
# input_parameter: organism = input organism
# wd = working directory to find the config files.
# return_value: Dictionary with ensembl aliases based on config
# -> Key = type (from config), value = list of ensembl aliases

c_dict = {}
path_to_config = os.path.join(wd +"/config/celltypes_"+organism+".json")
with open(path_to_config) as input_file:
Expand All @@ -43,6 +66,13 @@ def read_config(self, organism, wd):

def get_activity_data(self, release, organism, wd, data_dir):

# Method to read the binary table.bin file and return its content as bytearray
# input_parameter: organism = input organism
# release = current used Ensembl release
# wd = working dir (default working directory, data_dir is used if specified)
# data_dir = data directory (this is used as directory if specified)
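# result: no value is returned in the visible code; for each folder in self.folderlist the
# content of its table.bin is stored as a bytearray in self.activity[folder]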

for folder in self.folderlist:
# Generate path to binary File
if data_dir:
Expand All @@ -53,6 +83,9 @@ def get_activity_data(self, release, organism, wd, data_dir):
self.activity[folder] = bytearray(tables.read())

def generate_categorized_activity(self):

# Categorizes the activity by config defined categories.

category_activity = {}

for category, aliases in self.c_dict.items():
Expand Down Expand Up @@ -80,10 +113,16 @@ def generate_categorized_activity(self):

def activity_comparator(self, aliaslist):

# Method to determine the resulting activity status if an entry has
# differing activity statuses across its aliases,
# e.g. if one alias is ACTIVE and one INACTIVE the result will be ACTIVE -> see wiki for more detailed info
# input_parameter: aliaslist = list of aliases (keys into self.activity)
# return_value: array of activity statuses by category (type in config)

concatenated_array = bytearray([])

length = len(self.activity[aliaslist[0]])
input_arrays = [self.activity[x] for x in aliaslist]
input_arrays = [self.activity[index] for index in aliaslist]
for x in range(length):
if any(y[x] == 0 for y in input_arrays):
concatenated_array.append(0)
Expand All @@ -103,4 +142,4 @@ def activity_comparator(self, aliaslist):
# e = ActivityCategorizer("../../config/celltypes_human.json", "release-94", "homo_sapiens")
# print(len(e.categorization))
# for x in e.categorization.values():
# print(len(x))
SebastianBeyvers marked this conversation as resolved.
Show resolved Hide resolved
# print(len(x))