Merge pull request #40 from loosolab/gtf_creation

Gtf creation
loosolab · Jan 8, 2019 · e4d5c5c · e4d5c5c
2 parents eb36ef7 + b97841f
commit e4d5c5c
Show file tree

Hide file tree

Showing 41 changed files with 638 additions and 5,848 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,208 @@
+# Created by .ignore support plugin (hsz.mobi)
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+### R template
+# History files
+.Rhistory
+.Rapp.history
+
+# Session Data files
+.RData
+
+# Example code in package build process
+*-Ex.R
+
+# Output files from R CMD build
+/*.tar.gz
+
+# Output files from R CMD check
+/*.Rcheck/
+
+# RStudio files
+.Rproj.user/
+
+# produced vignettes
+vignettes/*.html
+vignettes/*.pdf
+
+# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
+.httr-oauth
+
+# knitr and R markdown default cache directories
+/*_cache/
+/cache/
+
+# Temporary files created by R markdown
+*.utf8.md
+*.knit.md
+
+# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
+rsconnect/
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+Würde bin/3.1_create_gtf/data/ löschen
+Würde data/ löschen
diff --git a/bin/3.1_create_gtf/Modules/CrossMapper.py b/bin/3.1_create_gtf/Modules/CrossMapper.py
@@ -10,26 +10,49 @@ class CrossMapper:
     Class to download chain_files for cross-mapping hg38 or mm10 to older assembly versions.
     Utilizes CrossMap.py. See the wiki for more information.
 
+    @author: Sebastian Beyvers
+    @contact: sebastian.beyvers@med.uni-giessen.de
+
+
     """
 
     def __init__(self, org, wd, out, is_dir):
-        self.org = org
+
+        # Constructor for CrossMapper class
+        # input_parameter: org    = input organism
+        #                  wd     = working directory
+        #                  out    = path to output-file -> Parameter
+        #                  is_dir = boolean if wd is data_dir or just working directory
+
+        # Get path to tempfile / outputfile and chain-file
+
         if is_dir:
-            self.infile = os.path.join( wd + "/temp/" + org + ".gtf")
+            self.infile = os.path.join(wd + "/temp/" + org + ".gtf")
         else:
             self.infile = os.path.join(wd+"/data/temp/"+org+".gtf")
-        self.outfile = os.path.join(out+"/" + org + "_mapped.gtf")
+        self.outfile = os.path.join(out)
         self.chainfile = self.get_chain_file(org, wd, is_dir)
-        # Execute Crossmapper for gff/gtf files
+
+        # Execute Crossmap for gff/gtf files
+
         (mapTree, targetChromSizes, sourceChromSizes) = CrossMap.read_chain_file(self.chainfile)
+
+        # Map results and save output to self.outfile
+
         CrossMap.crossmap_gff_file(mapTree, self.infile, self.outfile)
 
-    def get_chain_file(self, org, wd, isdir):
+    def get_chain_file(self, org, wd, is_data_dir):
 
         # Defines the Chain files for different conversions.
+        # input_parameter: org         = organism
+        #                  wd          = working directory
+        #                  is_data_dir = is wd data_dir or not
+
+        # return_value: Link to chain-file for conversion.
+        # Custom chain-files and chain-files for more organism can be specified in this section
 
         if org == "hg19":
-            if isdir:
+            if is_data_dir:
                 file_link = os.path.join(wd+"temp/hg38tohg19.over.chain.gz")
             else:
                 file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" )
@@ -39,7 +62,7 @@ def get_chain_file(self, org, wd, isdir):
             return file_link
 
         elif org == "mm9":
-            if isdir:
+            if is_data_dir:
                 file_link = os.path.join(wd+"temp/mm10ToMm9.over.chain.gz")
             else:
                 file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" )

diff --git a/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py b/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py
@@ -4,13 +4,27 @@
 
 class ActivityCategorizer:
 
+    """
+
+        Class that categorizes activitydata based on json config and binary activitydata (table.bin).
+        @author: Sebastian Beyvers
+        @contact: sebastian.beyvers@med.uni-giessen.de
+
+     """
+
     def __init__(self, release, organism, wd, data_dir):
 
-        # List of all Folders with Activity Tables
+        # Constructor for ActivityCategorizer
+        # input_parameter: organism        = input organism
+        #                  release         = current used Ensembl release
+        #                  wd              = working dir (default working directory, data_dir is used if specified)
+        #                  data_dir        = data directory (this is used as directory if specified)
+
+        # List of all folders with activity Tables
 
         self.folderlist = []
 
-        # Dictionary from celltypes_organism.json mit key = Kategorie und Value = [Ordner]
+        # Dictionary from celltypes_organism.json with key = category and value = [directory]
 
         self.c_dict = self.read_config(organism, wd)
 
@@ -27,10 +41,19 @@ def __init__(self, release, organism, wd, data_dir):
         print("Categorization finished !")
 
     def get_categorization(self):
+
+        # Getter method to return the self.categorization variable
+
         return self.categorization
 
     def read_config(self, organism, wd):
 
+        # Method to read the celltypes_organism.json config file
+        # input_parameter: organism = input organism
+        #                        wd = working directory to find the config files.
+        # return_value: Dictionary with ensembl aliases based on config
+        # -> Key = type (from config), value = list of ensembl aliases
+
         c_dict = {}
         path_to_config = os.path.join(wd +"/config/celltypes_"+organism+".json")
         with open(path_to_config) as input_file:
@@ -43,6 +66,13 @@ def read_config(self, organism, wd):
 
     def get_activity_data(self, release, organism, wd, data_dir):
 
+        # Method to read the binary table.bin file and return its content as bytearray
+        # input_parameter: organism        = input organism
+        #                  release         = current used Ensembl release
+        #                  wd              = working dir (default working directory, data_dir is used if specified)
+        #                  data_dir        = data directory (this is used as directory if specified)
+        # return_value: bytearray with activitystatus
+
         for folder in self.folderlist:
             # Generate path to binary File
             if data_dir:
@@ -53,6 +83,9 @@ def get_activity_data(self, release, organism, wd, data_dir):
                 self.activity[folder] = bytearray(tables.read())
 
     def generate_categorized_activity(self):
+
+        # Categorizes the activity by config defined categories.
+
         category_activity = {}
 
         for category, aliases in self.c_dict.items():
@@ -80,10 +113,16 @@ def generate_categorized_activity(self):
 
     def activity_comparator(self, aliaslist):
 
+        # Method to determine the resulting activitystatus if the entry contains
+        # multiple differing activitystatus from aliases
+        # e.g. if one alias is ACTIVE and one INACTIVE the result will be ACTIVE -> see wiki for more detailed info
+        # input_parameter: aliaslist = list of aliases for activity_data
+        # return_value: Array of Activitystatus by category (type in config)
+
         concatenated_array = bytearray([])
 
         length = len(self.activity[aliaslist[0]])
-        input_arrays = [self.activity[x] for x in aliaslist]
+        input_arrays = [self.activity[index] for index in aliaslist]
         for x in range(length):
             if any(y[x] == 0 for y in input_arrays):
                 concatenated_array.append(0)
@@ -103,4 +142,4 @@ def activity_comparator(self, aliaslist):
 # e = ActivityCategorizer("../../config/celltypes_human.json", "release-94", "homo_sapiens")
 # print(len(e.categorization))
 # for x in e.categorization.values():
-#     print(len(x))
+#     print(len(x))