Skip to content

Commit

Permalink
Merge pull request #40 from loosolab/gtf_creation
Browse files Browse the repository at this point in the history
Gtf creation
  • Loading branch information
SebastianBeyvers authored Jan 8, 2019
2 parents eb36ef7 + b97841f commit e4d5c5c
Show file tree
Hide file tree
Showing 41 changed files with 638 additions and 5,848 deletions.
208 changes: 208 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
# Created by .ignore support plugin (hsz.mobi)
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/modules.xml
# .idea/*.iml
# .idea/modules

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests
### R template
# History files
.Rhistory
.Rapp.history

# Session Data files
.RData

# Example code in package build process
*-Ex.R

# Output files from R CMD build
/*.tar.gz

# Output files from R CMD check
/*.Rcheck/

# RStudio files
.Rproj.user/

# produced vignettes
vignettes/*.html
vignettes/*.pdf

# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth

# knitr and R markdown default cache directories
/*_cache/
/cache/

# Temporary files created by R markdown
*.utf8.md
*.knit.md

# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
rsconnect/
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/


# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

bin/3.1_create_gtf/data/
data/
37 changes: 30 additions & 7 deletions bin/3.1_create_gtf/Modules/CrossMapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,26 +10,49 @@ class CrossMapper:
Class to download chain_files for crossmapping hg38 or mm10 to older assembly versions.
Utilizes CrossMap.py. see wiki for more information.
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de
"""

def __init__(self, org, wd, out, is_dir):
self.org = org

# Constructor for CrossMapper class
# input_parameter: org = input organism
# wd = working directory
# out = path to output-file -> Parameter
# is_dir = boolean if wd is data_dir or just working directory

# Get path to tempfile / outputfile and chain-file

if is_dir:
self.infile = os.path.join( wd + "/temp/" + org + ".gtf")
self.infile = os.path.join(wd + "/temp/" + org + ".gtf")
else:
self.infile = os.path.join(wd+"/data/temp/"+org+".gtf")
self.outfile = os.path.join(out+"/" + org + "_mapped.gtf")
self.outfile = os.path.join(out)
self.chainfile = self.get_chain_file(org, wd, is_dir)
# Execute Crossmapper for gff/gtf files

# Execute Crossmap for gff/gtf files

(mapTree, targetChromSizes, sourceChromSizes) = CrossMap.read_chain_file(self.chainfile)

# Map results and save output to self.outfile

CrossMap.crossmap_gff_file(mapTree, self.infile, self.outfile)

def get_chain_file(self, org, wd, isdir):
def get_chain_file(self, org, wd, is_data_dir):

# Defines the Chain files for different conversions.
# input_parameter: org = organism
# wd = working directory
# is_data_dir = is wd data_dir or not

# return_value: Link to chain-file for conversion.
# Custom chain-files and chain-files for more organisms can be specified in this section

if org == "hg19":
if isdir:
if is_data_dir:
file_link = os.path.join(wd+"temp/hg38tohg19.over.chain.gz")
else:
file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" )
Expand All @@ -39,7 +62,7 @@ def get_chain_file(self, org, wd, isdir):
return file_link

elif org == "mm9":
if isdir:
if is_data_dir:
file_link = os.path.join(wd+"temp/mm10ToMm9.over.chain.gz")
else:
file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" )
Expand Down
47 changes: 43 additions & 4 deletions bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,27 @@

class ActivityCategorizer:

"""
Class that categorizes activitydata based on json config and binary activitydata (table.bin).
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de
"""

def __init__(self, release, organism, wd, data_dir):

# List of all Folders with Activity Tables
# Constructor for ActivityCategorizer
# input_parameter: organism = input organism
# release = currently used Ensembl release
# wd = working dir (default working directory, data_dir is used if specified)
# data_dir = data directory (this is used as directory if specified)

# List of all folders with activity Tables

self.folderlist = []

# Dictionary from celltypes_organism.json mit key = Kategorie und Value = [Ordner]
# Dictionary from celltypes_organism.json with key = category and value = [directory]

self.c_dict = self.read_config(organism, wd)

Expand All @@ -27,10 +41,19 @@ def __init__(self, release, organism, wd, data_dir):
print("Categorization finished !")

def get_categorization(self):

# Getter method to return the self.categorization variable

return self.categorization

def read_config(self, organism, wd):

# Method to read the celltypes_organism.json config file
# input_parameter: organism = input organism
# wd = working directory to find the config files.
# return_value: Dictionary with ensembl aliases based on config
# -> Key = type (from config), value = list of ensembl aliases

c_dict = {}
path_to_config = os.path.join(wd +"/config/celltypes_"+organism+".json")
with open(path_to_config) as input_file:
Expand All @@ -43,6 +66,13 @@ def read_config(self, organism, wd):

def get_activity_data(self, release, organism, wd, data_dir):

# Method to read the binary table.bin file and return its content as bytearray
# input_parameter: organism = input organism
# release = currently used Ensembl release
# wd = working dir (default working directory, data_dir is used if specified)
# data_dir = data directory (this is used as directory if specified)
# return_value: bytearray with activitystatus

for folder in self.folderlist:
# Generate path to binary File
if data_dir:
Expand All @@ -53,6 +83,9 @@ def get_activity_data(self, release, organism, wd, data_dir):
self.activity[folder] = bytearray(tables.read())

def generate_categorized_activity(self):

# Categorizes the activity by config defined categories.

category_activity = {}

for category, aliases in self.c_dict.items():
Expand Down Expand Up @@ -80,10 +113,16 @@ def generate_categorized_activity(self):

def activity_comparator(self, aliaslist):

# Method to determine the resulting activitystatus if the entry contains
# multiple differing activitystatus from aliases
# e.g. if one alias is ACTIVE and one INACTIVE the result will be ACTIVE -> see wiki for more detailed info
# input_parameter: aliaslist = list of aliases for activity_data
# return_value: Array of Activitystatus by category (type in config)

concatenated_array = bytearray([])

length = len(self.activity[aliaslist[0]])
input_arrays = [self.activity[x] for x in aliaslist]
input_arrays = [self.activity[index] for index in aliaslist]
for x in range(length):
if any(y[x] == 0 for y in input_arrays):
concatenated_array.append(0)
Expand All @@ -103,4 +142,4 @@ def activity_comparator(self, aliaslist):
# e = ActivityCategorizer("../../config/celltypes_human.json", "release-94", "homo_sapiens")
# print(len(e.categorization))
# for x in e.categorization.values():
# print(len(x))
# print(len(x))
Loading

0 comments on commit e4d5c5c

Please sign in to comment.