From a161ed03711d055fc90f5a913e354b58e1d5aa93 Mon Sep 17 00:00:00 2001 From: basti Date: Thu, 10 Jan 2019 03:58:25 +0100 Subject: [PATCH 1/4] Updated config, updated skript for changes in Ensembl Release 95 temporary fix for #51. needs more improvement of config handling --- .../Modules/Ensembl/ActivityCategorizer.py | 26 +++++----- bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py | 9 ++-- bin/3.1_create_gtf/RegGTFExtractor.py | 0 .../config/celltypes_homo_sapiens.json | 48 +++++++++++++------ 4 files changed, 54 insertions(+), 29 deletions(-) mode change 100644 => 100755 bin/3.1_create_gtf/RegGTFExtractor.py diff --git a/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py b/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py index 93b3558..78f1826 100644 --- a/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py @@ -120,20 +120,22 @@ def activity_comparator(self, aliaslist): # return_value: Array of Activitystatus by category (type in config) concatenated_array = bytearray([]) - length = len(self.activity[aliaslist[0]]) input_arrays = [self.activity[index] for index in aliaslist] - for x in range(length): - if any(y[x] == 0 for y in input_arrays): - concatenated_array.append(0) - elif any(y[x] == 1 for y in input_arrays): - concatenated_array.append(1) - elif any(y[x] == 2 for y in input_arrays): - concatenated_array.append(2) - elif any(y[x] == 3 for y in input_arrays): - concatenated_array.append(3) - elif any(y[x] == 4 for y in input_arrays): - concatenated_array.append(4) + try: + for x in range(length): + if any(y[x] == 0 for y in input_arrays): + concatenated_array.append(0) + elif any(y[x] == 1 for y in input_arrays): + concatenated_array.append(1) + elif any(y[x] == 2 for y in input_arrays): + concatenated_array.append(2) + elif any(y[x] == 3 for y in input_arrays): + concatenated_array.append(3) + elif any(y[x] == 4 for y in input_arrays): + concatenated_array.append(4) + except IndexError: + print("Indexerror occured") return concatenated_array diff --git a/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py b/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py index eaf88f6..cad1451 100644 --- a/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py @@ -89,9 +89,9 @@ def generate_additional_information(gene_id, activity): # activity = List of activity-data for specified gene # return_value: String for attributes (column 9) in gtf-format - if gene_id.startswith("ID=regulatory_region:"): + if not gene_id.startswith("ID=E"): gene_id = 'gene_id "'+gene_id.split(':')[1]+'"' - elif gene_id.startswith("ID=E"): + else: gene_id = 'gene_id "'+gene_id.split('=')[1]+'"' activity_string = 'activity "'+', '.join(activity)+'"' @@ -107,7 +107,10 @@ def generate_activity_list(self, activity, index): activity_list = [] for key, value in activity.items(): - activity_list.append(key+">"+self.value_map[value[index]]) + try: + activity_list.append(key+">"+self.value_map[value[index]]) + except IndexError: + pass return activity_list def get_gtf(self, release, activity): diff --git a/bin/3.1_create_gtf/RegGTFExtractor.py b/bin/3.1_create_gtf/RegGTFExtractor.py old mode 100644 new mode 100755 diff --git a/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json b/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json index 0f02cde..d10135b 100644 --- a/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json +++ b/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json @@ -2,7 +2,7 @@ { "type": "A549", "alias_ucsc": [], - "alias_ensembl": ["A549"] + "alias_ensembl": ["A549", "A673"] }, { "type": "Aorta", @@ -17,22 +17,32 @@ { "type": "B-Cells", "alias_ucsc": [], - "alias_ensembl": ["B_cells_PB_Roadmap", "naive_B_cell_VB", "GM12878"] + "alias_ensembl": ["B_cells_PB_Roadmap", "naive_B_cell_VB", "GM12878", "B_cell_ENCSR682AXR", "CD38__naive_B_cell_VB", "naive_B_cell_To"] }, { "type": "T-Cell", "alias_ucsc": [], - "alias_ensembl": ["CD4_ab_T_cell_VB", "CM_CD4_ab_T_cell_VB", "CD8_ab_T_cell_CB", "T_cells_PB_Roadmap"] + "alias_ensembl": ["CD4_ab_T_cell_VB", + "CM_CD4_ab_T_cell_VB", + "CD8_ab_T_cell_CB", + "T_cells_PB_Roadmap", + "CD4_ab_T_cell_CB", + "CD4_positive__alpha_beta_memory_T_cell", + "CD4_positive__alpha_beta_T_cell", + "CD4_positive__alpha_beta_T_cell_ENCSR948ZKZ", + "CD4_positive__CD25_positive__alpha_beta_regulatory_T_cell", + "effector_memory_CD4_positive__alpha_beta_T_cell", "EM_CD8_ab_T_cell_VB", + "naive_thymus_derived_CD4_positive__alpha_beta_T_cell", "T_helper_17_cell"] }, { "type": "Monocyte", "alias_ucsc": [], - "alias_ensembl": ["CD14CD16__monocyte_CB", "CD14CD16__monocyte_VB", "Monocytes_CD14", "Monocytes_CD14_PB_Roadmap"] + "alias_ensembl": ["CD14CD16__monocyte_CB", "CD14CD16__monocyte_VB", "Monocytes_CD14", "Monocytes_CD14_PB_Roadmap", "CD14_positive_monocyte"] }, { "type": "Neutrophil", "alias_ucsc": [], - "alias_ensembl": ["neutrophil_CB", "neutrophil_myelocyte_BM", "neutrophil_VB"] + "alias_ensembl": ["neutrophil_CB", "neutrophil_myelocyte_BM", "neutrophil_VB", "neutrophil"] }, { "type": "Eosinophil", @@ -59,7 +69,7 @@ { "type": "Intestine", "alias_ucsc": [], - "alias_ensembl": ["Fetal_Intestine_Large", "Fetal_Intestine_Small", "Small_Intestine"] + "alias_ensembl": ["Fetal_Intestine_Large", "Fetal_Intestine_Small", "Small_Intestine", "sigmoid_colon"] }, { "type": "AdrenalGland", @@ -69,7 +79,7 @@ { "type": "Muscle", "alias_ucsc": ["limb"], - "alias_ensembl": ["Fetal_Muscle_Leg", "Fetal_Muscle_Trunk", "Psoas_Muscle", "HSMM", "HSMMtube"] + "alias_ensembl": ["Fetal_Muscle_Leg", "Fetal_Muscle_Trunk", "Psoas_Muscle", "HSMM", "HSMMtube", "skeletal_muscle_myoblast"] }, { "type": "Gastric", @@ -79,17 +89,17 @@ { "type": "Endothelial", "alias_ucsc": ["blood vessels"], - "alias_ensembl": ["EPC_VB", "HMEC", "HUVEC", "HUVEC_prol_CB", "NHEK"] + "alias_ensembl": ["EPC_VB", "HMEC", "HUVEC", "HUVEC_prol_CB", "NHEK", "endothelial_cell_of_umbilical_vein"] }, { "type": "StemCells", "alias_ucsc": [], - "alias_ensembl": ["H1ESC", "H1_mesenchymal", "H1_neuronal_progenitor", "H1_trophoblast", "H9", "MSC_VB", "iPS_20b", "iPS_DF_6_9", "iPS_DF_19_11"] + "alias_ensembl": ["H1ESC", "HUES48", "HUES6", "HUES64", "H1_hESC", "H1_hESC_ENCSR820QMS", "H9_ENCSR323FKB", "H1_mesenchymal", "H1_neuronal_progenitor", "H1_trophoblast", "H9", "MSC_VB", "iPS_20b", "iPS_15b", "iPS_DF_6_9", "iPS_DF_19_11", "common_myeloid_progenitor__CD34_positive", "common_myeloid_progenitor__CD34_positive_ENCSR337XXD_1", "common_myeloid_progenitor__CD34_positive_ENCSR722JRY"] }, { "type": "Lung", "alias_ucsc": [], - "alias_ensembl": ["Lung", "IMR90", "NHLF"] + "alias_ensembl": ["Lung", "IMR90", "NHLF", "lung_ENCSR465WKM"] }, { "type": "Pancreas", @@ -99,7 +109,7 @@ { "type": "Liver", "alias_ucsc": ["liver"], - "alias_ensembl": [] + "alias_ensembl": ["hepatocyte"] }, { "type": "Ovary", @@ -119,7 +129,7 @@ { "type": "Heart", "alias_ucsc": ["heart"], - "alias_ensembl": ["Right_Atrium", "Left_Ventricle"] + "alias_ensembl": ["Right_Atrium", "Left_Ventricle", "cardiac_muscle_cell", "heart_right_ventricle"] }, { "type": "Osteoblast", @@ -129,7 +139,7 @@ { "type": "Fibroblast", "alias_ucsc": [], - "alias_ensembl": ["NHDF_AD"] + "alias_ensembl": ["NHDF_AD", "fibroblast_of_dermis", "fibroblast_of_lung", "IMR_90"] }, { "type": "NK-Cells", @@ -144,7 +154,7 @@ { "type": "Brain", "alias_ucsc": ["midbrain (mesencephalon)", "trigeminal V (ganglion, cranial)", "forebrain", "neural tube", "hindbrain (rhombencephalon)", "dorsal root ganglion", "cranial nerve"], - "alias_ensembl": ["NH_A"] + "alias_ensembl": ["NH_A", "astrocyte", "bipolar_neuron", "brain", "neural_progenitor_cell", "neural_stem_progenitor_cell", "neuron"] }, { "type": "Mesenchym", @@ -175,5 +185,15 @@ "type": "Melanocytes", "alias_ucsc": ["melanocytes"], "alias_ensembl": [] + }, + { + "type": "Miscelanious", + "alias_ucsc": [], + "alias_ensembl": ["endodermal_cell", "esophagus", "HCT116", "Karpas_422", "keratinocyte", "mammary_epithelial_cell", "MCF_7", "MM_1S", "myotube", "PC_3", "PC_9", "SK_N_SH"] + }, + { + "type": "Kidney", + "alias_ucsc": [], + "alias_ensembl": ["kidney"] } ] From 97c526d60f039f3c10c29e7f395c6851852b5352 Mon Sep 17 00:00:00 2001 From: basti Date: Thu, 10 Jan 2019 17:51:01 +0100 Subject: [PATCH 2/4] Fix for #51 --- .../Modules/Ensembl/ActivityCategorizer.py | 10 ++++++---- bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py | 5 ++++- bin/3.1_create_gtf/config/celltypes_homo_sapiens.json | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py b/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py index 78f1826..eb201dd 100644 --- a/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py @@ -122,8 +122,9 @@ def activity_comparator(self, aliaslist): concatenated_array = bytearray([]) length = len(self.activity[aliaslist[0]]) input_arrays = [self.activity[index] for index in aliaslist] - try: - for x in range(length): + for x in range(length): + # This try-catch block is needed because of inconsistency in file-lengths in Ensembl-release-95 + try: if any(y[x] == 0 for y in input_arrays): concatenated_array.append(0) elif any(y[x] == 1 for y in input_arrays): @@ -134,8 +135,9 @@ def activity_comparator(self, aliaslist): concatenated_array.append(3) elif any(y[x] == 4 for y in input_arrays): concatenated_array.append(4) - except IndexError: - print("Indexerror occured") + except IndexError: + concatenated_array.append(4) + return concatenated_array diff --git a/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py b/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py index cad1451..22914b6 100644 --- a/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py @@ -107,10 +107,13 @@ def generate_activity_list(self, activity, index): activity_list = [] for key, value in activity.items(): + # if no index is found the key will be annotated as "NA" + # this is needed due to an inconsistency in ensembl-release-95 + # some activity-files contain less entries than others try: activity_list.append(key+">"+self.value_map[value[index]]) except IndexError: - pass + activity_list.append(key+">NA") return activity_list def get_gtf(self, release, activity): diff --git a/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json b/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json index d10135b..776d83e 100644 --- a/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json +++ b/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json @@ -129,7 +129,7 @@ { "type": "Heart", "alias_ucsc": ["heart"], - "alias_ensembl": ["Right_Atrium", "Left_Ventricle", "cardiac_muscle_cell", "heart_right_ventricle"] + "alias_ensembl": ["Right_Atrium", "Left_Ventricle", "cardiac_muscle_cell", "heart_right_ventricle", "heart"] }, { "type": "Osteoblast", From 6ed7d989766323e6e834268171c50f89199babef Mon Sep 17 00:00:00 2001 From: basti Date: Fri, 11 Jan 2019 13:18:05 +0100 Subject: [PATCH 3/4] Added sorted outputs to improve handling in IGV -> Fix #55 --- bin/3.1_create_gtf/Modules/Validator.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/bin/3.1_create_gtf/Modules/Validator.py b/bin/3.1_create_gtf/Modules/Validator.py index ee34471..ae66f75 100644 --- a/bin/3.1_create_gtf/Modules/Validator.py +++ b/bin/3.1_create_gtf/Modules/Validator.py @@ -1,7 +1,10 @@ +import os + + class Validator: """ - Class to validate the gtf-output-file. + Class to validate and sort the gtf-output-file. @author: Sebastian Beyvers @contact: sebastian.beyvers@med.uni-giessen.de @@ -13,8 +16,17 @@ def __init__(self, out_file): # input_parameter: out_file = path to output file self.out_file = out_file + self.sort_file() self.test_read_file() + def sort_file(self): + + # function that utilizes linux sort to sort the output by chromosome and start coordinate, + # this improves handling in tools like IGV + + command = "sort -V -k1,1 -k4,4n -o "+self.out_file+" "+self.out_file + os.system(command) + def test_read_file(self): # Method to test the output file-format From 6c912f8859acc9d667598e6643a6e5b15b75ffdd Mon Sep 17 00:00:00 2001 From: basti Date: Fri, 11 Jan 2019 13:44:49 +0100 Subject: [PATCH 4/4] Removed some typos in Readme --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 75279af..e6a58bf 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ export PATH=[meme-suite instalation path]/bin:$PATH Every other dependency will be automatically installed by Nextflow using conda. For that a new conda enviroment will be created, which can be found in the from Nextflow created work directory after the first pipeline run. It is **not** required to create and activate the enviroment from the yaml-file beforehand. -**Important Note:** For conda the channel bioconda needs to be set as highest priority! This is required due to two differnt packages with the same name in different channels. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfisch package from the channel conda-forge! +**Important Note:** For conda the channel bioconda needs to be set as highest priority! This is required due to two different packages with the same name in different channels. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfish package from the channel conda-forge! ## Quick Start @@ -94,7 +94,7 @@ All arguments can be set in the configuration files For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki). ## Known issues -The Nextflow-script needs a conda enviroment to run. Nextflow creates the needed enviroment from the given yaml-file. +The Nextflow-script needs a conda environment to run. Nextflow creates the needed environment from the given yaml-file. On some systems Nextflow exits the run with following error: ``` Caused by: @@ -103,11 +103,11 @@ Caused by: status : 143 message: ``` -If this error occurs you have to create the enviroment before starting the pipeline. -To create this enviroment you need the yml-file from the repository. -Run the following commands to create the enviroment: +If this error occurs you have to create the environment before starting the pipeline. +To create this environment you need the yml-file from the repository. +Run the following commands to create the environment: ```console path=[Path to given masterenv.yml file] conda env create --name masterenv -f $path ``` -When the enviroment is created, set the variable 'path_env' in the configuration file as the path to it. +When the environment is created, set the variable 'path_env' in the configuration file as the path to it.