loosolab · SebastianBeyvers · Jan 11, 2019 · Jan 10, 2019 · Jan 10, 2019 · Jan 11, 2019
diff --git a/README.md b/README.md
@@ -21,7 +21,7 @@ export PATH=[meme-suite instalation path]/bin:$PATH
 Every other dependency will be automatically  installed by Nextflow using conda. For that a new conda enviroment will be created, which can be found in the from Nextflow created work directory after the first pipeline run.
 It is **not** required to create and activate the enviroment from the yaml-file beforehand.
 
-**Important Note:** For conda the channel bioconda needs to be set as highest priority! This is required due to two differnt packages with the same name in different channels. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfisch package from the channel conda-forge!
+**Important Note:** For conda the channel bioconda needs to be set as highest priority! This is required due to two different packages with the same name in different channels. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfish package from the channel conda-forge!
 
 
 ## Quick Start
@@ -94,7 +94,7 @@ All arguments can be set in the configuration files
 For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki).
 
 ## Known issues
-The Nextflow-script needs a conda enviroment to run. Nextflow creates the needed enviroment from the given yaml-file.
+The Nextflow-script needs a conda environment to run. Nextflow creates the needed environment from the given yaml-file.
 On some systems Nextflow exits the run with following error:
 ```
 Caused by:
@@ -103,11 +103,11 @@ Caused by:
   status : 143
   message:
 ```
-If this error occurs you have to create the enviroment before starting the pipeline.
-To create this enviroment you need the yml-file from the repository.
-Run the following commands to create the enviroment:
+If this error occurs you have to create the environment before starting the pipeline.
+To create this environment you need the yml-file from the repository.
+Run the following commands to create the environment:
 ```console
 path=[Path to given masterenv.yml file]
 conda env create --name masterenv -f $path
 ```
-When the enviroment is created, set the variable 'path_env' in the configuration file as the path to it.
+When the environment is created, set the variable 'path_env' in the configuration file as the path to it.
diff --git a/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py b/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py
@@ -120,20 +120,24 @@ def activity_comparator(self, aliaslist):
         # return_value: Array of Activitystatus by category (type in config)
 
         concatenated_array = bytearray([])
-
         length = len(self.activity[aliaslist[0]])
         input_arrays = [self.activity[index] for index in aliaslist]
         for x in range(length):
-            if any(y[x] == 0 for y in input_arrays):
-                concatenated_array.append(0)
-            elif any(y[x] == 1 for y in input_arrays):
-                concatenated_array.append(1)
-            elif any(y[x] == 2 for y in input_arrays):
-                concatenated_array.append(2)
-            elif any(y[x] == 3 for y in input_arrays):
-                concatenated_array.append(3)
-            elif any(y[x] == 4 for y in input_arrays):
+            # This try-except block is needed because the activity files in Ensembl release 95 have inconsistent lengths
+            try:
+                if any(y[x] == 0 for y in input_arrays):
+                    concatenated_array.append(0)
+                elif any(y[x] == 1 for y in input_arrays):
+                    concatenated_array.append(1)
+                elif any(y[x] == 2 for y in input_arrays):
+                    concatenated_array.append(2)
+                elif any(y[x] == 3 for y in input_arrays):
+                    concatenated_array.append(3)
+                elif any(y[x] == 4 for y in input_arrays):
+                    concatenated_array.append(4)
+            except IndexError:
                 concatenated_array.append(4)
+
         return concatenated_array
 
 

diff --git a/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py b/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py
@@ -89,9 +89,9 @@ def generate_additional_information(gene_id, activity):
         #                  activity = List of activity-data for specified gene
         # return_value: String for attributes (column 9) in gtf-format
 
-        if gene_id.startswith("ID=regulatory_region:"):
+        if not gene_id.startswith("ID=E"):
             gene_id = 'gene_id "'+gene_id.split(':')[1]+'"'
-        elif gene_id.startswith("ID=E"):
+        else:
             gene_id = 'gene_id "'+gene_id.split('=')[1]+'"'
 
         activity_string = 'activity "'+', '.join(activity)+'"'
@@ -107,7 +107,13 @@ def generate_activity_list(self, activity, index):
 
         activity_list = []
         for key, value in activity.items():
-            activity_list.append(key+">"+self.value_map[value[index]])
+            #  if the index is out of range for a key, that key is annotated as "NA"
+            #  this is needed due to an inconsistency in ensembl-release-95:
+            #  some activity-files contain fewer entries than others
+            try:
+                activity_list.append(key+">"+self.value_map[value[index]])
+            except IndexError:
+                activity_list.append(key+">NA")
         return activity_list
 
     def get_gtf(self, release, activity):

diff --git a/bin/3.1_create_gtf/Modules/Validator.py b/bin/3.1_create_gtf/Modules/Validator.py
@@ -1,7 +1,10 @@
+import os
+
+
 class Validator:
 
     """
-        Class to validate the gtf-output-file.
+        Class to validate and sort the gtf-output-file.
         @author: Sebastian Beyvers
         @contact: sebastian.beyvers@med.uni-giessen.de
 
@@ -13,8 +16,17 @@ def __init__(self, out_file):
         # input_parameter: out_file = path to output file
 
         self.out_file = out_file
+        self.sort_file()
         self.test_read_file()
 
+    def sort_file(self):
+
+        # Sorts the output file in place with the Linux sort command: version order on column 1
+        # (chromosome), then numeric order on column 4 (start); this improves handling in tools like IGV.
+        # The path is shell-quoted so that spaces or shell metacharacters in it cannot break the command.
+        import shlex
+        os.system("sort -V -k1,1 -k4,4n -o {0} {0}".format(shlex.quote(self.out_file)))
+
     def test_read_file(self):
 
         # Method to test the output file-format

diff --git a/bin/3.1_create_gtf/RegGTFExtractor.py b/bin/3.1_create_gtf/RegGTFExtractor.py
diff --git a/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json b/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json
@@ -2,7 +2,7 @@
     {
       "type": "A549",
       "alias_ucsc": [],
-      "alias_ensembl": ["A549"]
+      "alias_ensembl": ["A549", "A673"]
     },
     {
       "type": "Aorta",
@@ -17,22 +17,32 @@
     {
       "type": "B-Cells",
       "alias_ucsc": [],
-      "alias_ensembl": ["B_cells_PB_Roadmap", "naive_B_cell_VB", "GM12878"]
+      "alias_ensembl": ["B_cells_PB_Roadmap", "naive_B_cell_VB", "GM12878", "B_cell_ENCSR682AXR", "CD38__naive_B_cell_VB", "naive_B_cell_To"]
     },
     {
       "type": "T-Cell",
       "alias_ucsc": [],
-      "alias_ensembl": ["CD4_ab_T_cell_VB", "CM_CD4_ab_T_cell_VB", "CD8_ab_T_cell_CB", "T_cells_PB_Roadmap"]
+      "alias_ensembl": ["CD4_ab_T_cell_VB",
+                        "CM_CD4_ab_T_cell_VB",
+                        "CD8_ab_T_cell_CB",
+                        "T_cells_PB_Roadmap",
+                        "CD4_ab_T_cell_CB",
+                        "CD4_positive__alpha_beta_memory_T_cell",
+                        "CD4_positive__alpha_beta_T_cell",
+                        "CD4_positive__alpha_beta_T_cell_ENCSR948ZKZ",
+                        "CD4_positive__CD25_positive__alpha_beta_regulatory_T_cell",
+                        "effector_memory_CD4_positive__alpha_beta_T_cell", "EM_CD8_ab_T_cell_VB",
+                        "naive_thymus_derived_CD4_positive__alpha_beta_T_cell", "T_helper_17_cell"]
     },
     {
       "type": "Monocyte",
       "alias_ucsc": [],
-      "alias_ensembl": ["CD14CD16__monocyte_CB", "CD14CD16__monocyte_VB", "Monocytes_CD14", "Monocytes_CD14_PB_Roadmap"]
+      "alias_ensembl": ["CD14CD16__monocyte_CB", "CD14CD16__monocyte_VB", "Monocytes_CD14", "Monocytes_CD14_PB_Roadmap", "CD14_positive_monocyte"]
     },
     {
       "type": "Neutrophil",
       "alias_ucsc": [],
-      "alias_ensembl": ["neutrophil_CB", "neutrophil_myelocyte_BM", "neutrophil_VB"]
+      "alias_ensembl": ["neutrophil_CB", "neutrophil_myelocyte_BM", "neutrophil_VB", "neutrophil"]
     },
      {
       "type": "Eosinophil",
@@ -59,7 +69,7 @@
     {
       "type": "Intestine",
       "alias_ucsc": [],
-      "alias_ensembl": ["Fetal_Intestine_Large", "Fetal_Intestine_Small", "Small_Intestine"]
+      "alias_ensembl": ["Fetal_Intestine_Large", "Fetal_Intestine_Small", "Small_Intestine", "sigmoid_colon"]
     },
     {
       "type": "AdrenalGland",
@@ -69,7 +79,7 @@
     {
       "type": "Muscle",
       "alias_ucsc": ["limb"],
-      "alias_ensembl": ["Fetal_Muscle_Leg", "Fetal_Muscle_Trunk", "Psoas_Muscle", "HSMM", "HSMMtube"]
+      "alias_ensembl": ["Fetal_Muscle_Leg", "Fetal_Muscle_Trunk", "Psoas_Muscle", "HSMM", "HSMMtube", "skeletal_muscle_myoblast"]
     },
     {
       "type": "Gastric",
@@ -79,17 +89,17 @@
     {
       "type": "Endothelial",
       "alias_ucsc": ["blood vessels"],
-      "alias_ensembl": ["EPC_VB", "HMEC", "HUVEC", "HUVEC_prol_CB", "NHEK"]
+      "alias_ensembl": ["EPC_VB", "HMEC", "HUVEC", "HUVEC_prol_CB", "NHEK", "endothelial_cell_of_umbilical_vein"]
     },
     {
       "type": "StemCells",
       "alias_ucsc": [],
-      "alias_ensembl": ["H1ESC", "H1_mesenchymal", "H1_neuronal_progenitor", "H1_trophoblast", "H9", "MSC_VB", "iPS_20b", "iPS_DF_6_9", "iPS_DF_19_11"]
+      "alias_ensembl": ["H1ESC", "HUES48", "HUES6", "HUES64",  "H1_hESC", "H1_hESC_ENCSR820QMS", "H9_ENCSR323FKB", "H1_mesenchymal", "H1_neuronal_progenitor", "H1_trophoblast", "H9", "MSC_VB", "iPS_20b", "iPS_15b", "iPS_DF_6_9", "iPS_DF_19_11", "common_myeloid_progenitor__CD34_positive", "common_myeloid_progenitor__CD34_positive_ENCSR337XXD_1", "common_myeloid_progenitor__CD34_positive_ENCSR722JRY"]
     },
     {
       "type": "Lung",
       "alias_ucsc": [],
-      "alias_ensembl": ["Lung", "IMR90", "NHLF"]
+      "alias_ensembl": ["Lung", "IMR90", "NHLF", "lung_ENCSR465WKM"]
     },
     {
       "type": "Pancreas",
@@ -99,7 +109,7 @@
     {
       "type": "Liver",
       "alias_ucsc": ["liver"],
-      "alias_ensembl": []
+      "alias_ensembl": ["hepatocyte"]
     },
     {
       "type": "Ovary",
@@ -119,7 +129,7 @@
     {
       "type": "Heart",
       "alias_ucsc": ["heart"],
-      "alias_ensembl": ["Right_Atrium", "Left_Ventricle"]
+      "alias_ensembl": ["Right_Atrium", "Left_Ventricle", "cardiac_muscle_cell", "heart_right_ventricle", "heart"]
     },
     {
       "type": "Osteoblast",
@@ -129,7 +139,7 @@
     {
       "type": "Fibroblast",
       "alias_ucsc": [],
-      "alias_ensembl": ["NHDF_AD"]
+      "alias_ensembl": ["NHDF_AD", "fibroblast_of_dermis", "fibroblast_of_lung", "IMR_90"]
     },
     {
       "type": "NK-Cells",
@@ -144,7 +154,7 @@
     {
       "type": "Brain",
       "alias_ucsc": ["midbrain (mesencephalon)", "trigeminal V (ganglion, cranial)", "forebrain", "neural tube", "hindbrain (rhombencephalon)", "dorsal root ganglion", "cranial nerve"],
-      "alias_ensembl": ["NH_A"]
+      "alias_ensembl": ["NH_A", "astrocyte", "bipolar_neuron", "brain", "neural_progenitor_cell", "neural_stem_progenitor_cell", "neuron"]
     },
     {
       "type": "Mesenchym",
@@ -175,5 +185,15 @@
       "type": "Melanocytes",
       "alias_ucsc": ["melanocytes"],
       "alias_ensembl": []
+    },
+    {
+      "type": "Miscelanious",
+      "alias_ucsc": [],
+      "alias_ensembl": ["endodermal_cell", "esophagus", "HCT116", "Karpas_422", "keratinocyte", "mammary_epithelial_cell", "MCF_7", "MM_1S", "myotube", "PC_3", "PC_9", "SK_N_SH"]
+    },
+    {
+      "type": "Kidney",
+      "alias_ucsc": [],
+      "alias_ensembl": ["kidney"]
     }
 ]