diff --git a/README.md b/README.md index 75279af..e6a58bf 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ export PATH=[meme-suite instalation path]/bin:$PATH Every other dependency will be automatically installed by Nextflow using conda. For that a new conda enviroment will be created, which can be found in the from Nextflow created work directory after the first pipeline run. It is **not** required to create and activate the enviroment from the yaml-file beforehand. -**Important Note:** For conda the channel bioconda needs to be set as highest priority! This is required due to two differnt packages with the same name in different channels. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfisch package from the channel conda-forge! +**Important Note:** For conda the channel bioconda needs to be set as highest priority! This is required due to two different packages with the same name in different channels. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfish package from the channel conda-forge! ## Quick Start @@ -94,7 +94,7 @@ All arguments can be set in the configuration files For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki). ## Known issues -The Nextflow-script needs a conda enviroment to run. Nextflow creates the needed enviroment from the given yaml-file. +The Nextflow-script needs a conda environment to run. Nextflow creates the needed environment from the given yaml-file. On some systems Nextflow exits the run with following error: ``` Caused by: @@ -103,11 +103,11 @@ Caused by: status : 143 message: ``` -If this error occurs you have to create the enviroment before starting the pipeline. -To create this enviroment you need the yml-file from the repository. -Run the following commands to create the enviroment: +If this error occurs you have to create the environment before starting the pipeline. 
+To create this environment you need the yml-file from the repository. +Run the following commands to create the environment: ```console path=[Path to given masterenv.yml file] conda env create --name masterenv -f $path ``` -When the enviroment is created, set the variable 'path_env' in the configuration file as the path to it. +When the environment is created, set the variable 'path_env' in the configuration file as the path to it. diff --git a/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py b/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py index 93b3558..eb201dd 100644 --- a/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py @@ -120,20 +120,24 @@ def activity_comparator(self, aliaslist): # return_value: Array of Activitystatus by category (type in config) concatenated_array = bytearray([]) - length = len(self.activity[aliaslist[0]]) input_arrays = [self.activity[index] for index in aliaslist] for x in range(length): - if any(y[x] == 0 for y in input_arrays): - concatenated_array.append(0) - elif any(y[x] == 1 for y in input_arrays): - concatenated_array.append(1) - elif any(y[x] == 2 for y in input_arrays): - concatenated_array.append(2) - elif any(y[x] == 3 for y in input_arrays): - concatenated_array.append(3) - elif any(y[x] == 4 for y in input_arrays): + # This try-except block is needed because of inconsistent file lengths in Ensembl release 95 + try: + if any(y[x] == 0 for y in input_arrays): + concatenated_array.append(0) + elif any(y[x] == 1 for y in input_arrays): + concatenated_array.append(1) + elif any(y[x] == 2 for y in input_arrays): + concatenated_array.append(2) + elif any(y[x] == 3 for y in input_arrays): + concatenated_array.append(3) + elif any(y[x] == 4 for y in input_arrays): + concatenated_array.append(4) + except IndexError: concatenated_array.append(4) + return concatenated_array diff --git a/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py
b/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py index eaf88f6..22914b6 100644 --- a/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py @@ -89,9 +89,9 @@ def generate_additional_information(gene_id, activity): # activity = List of activity-data for specified gene # return_value: String for attributes (column 9) in gtf-format - if gene_id.startswith("ID=regulatory_region:"): + if not gene_id.startswith("ID=E"): gene_id = 'gene_id "'+gene_id.split(':')[1]+'"' - elif gene_id.startswith("ID=E"): + else: gene_id = 'gene_id "'+gene_id.split('=')[1]+'"' activity_string = 'activity "'+', '.join(activity)+'"' @@ -107,7 +107,13 @@ def generate_activity_list(self, activity, index): activity_list = [] for key, value in activity.items(): - activity_list.append(key+">"+self.value_map[value[index]]) + # if the index is out of range for a key, that key is annotated as "NA" + # this is needed due to an inconsistency in ensembl-release-95: + # some activity-files contain fewer entries than others + try: + activity_list.append(key+">"+self.value_map[value[index]]) + except IndexError: + activity_list.append(key+">NA") return activity_list def get_gtf(self, release, activity): diff --git a/bin/3.1_create_gtf/Modules/Validator.py b/bin/3.1_create_gtf/Modules/Validator.py index ee34471..ae66f75 100644 --- a/bin/3.1_create_gtf/Modules/Validator.py +++ b/bin/3.1_create_gtf/Modules/Validator.py @@ -1,7 +1,10 @@ +import os + + class Validator: """ - Class to validate the gtf-output-file. + Class to validate and sort the gtf-output-file.
@author: Sebastian Beyvers @contact: sebastian.beyvers@med.uni-giessen.de @@ -13,8 +16,17 @@ def __init__(self, out_file): # input_parameter: out_file = path to output file self.out_file = out_file + self.sort_file() self.test_read_file() + def sort_file(self): + + # function that utilizes linux sort to sort the output by chromosome and start coordinate, + # this improves handling in tools like IGV + + command = "sort -V -k1,1 -k4,4n -o "+self.out_file+" "+self.out_file + os.system(command) + def test_read_file(self): # Method to test the output file-format diff --git a/bin/3.1_create_gtf/RegGTFExtractor.py b/bin/3.1_create_gtf/RegGTFExtractor.py old mode 100644 new mode 100755 diff --git a/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json b/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json index 0f02cde..776d83e 100644 --- a/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json +++ b/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json @@ -2,7 +2,7 @@ { "type": "A549", "alias_ucsc": [], - "alias_ensembl": ["A549"] + "alias_ensembl": ["A549", "A673"] }, { "type": "Aorta", @@ -17,22 +17,32 @@ { "type": "B-Cells", "alias_ucsc": [], - "alias_ensembl": ["B_cells_PB_Roadmap", "naive_B_cell_VB", "GM12878"] + "alias_ensembl": ["B_cells_PB_Roadmap", "naive_B_cell_VB", "GM12878", "B_cell_ENCSR682AXR", "CD38__naive_B_cell_VB", "naive_B_cell_To"] }, { "type": "T-Cell", "alias_ucsc": [], - "alias_ensembl": ["CD4_ab_T_cell_VB", "CM_CD4_ab_T_cell_VB", "CD8_ab_T_cell_CB", "T_cells_PB_Roadmap"] + "alias_ensembl": ["CD4_ab_T_cell_VB", + "CM_CD4_ab_T_cell_VB", + "CD8_ab_T_cell_CB", + "T_cells_PB_Roadmap", + "CD4_ab_T_cell_CB", + "CD4_positive__alpha_beta_memory_T_cell", + "CD4_positive__alpha_beta_T_cell", + "CD4_positive__alpha_beta_T_cell_ENCSR948ZKZ", + "CD4_positive__CD25_positive__alpha_beta_regulatory_T_cell", + "effector_memory_CD4_positive__alpha_beta_T_cell", "EM_CD8_ab_T_cell_VB", + "naive_thymus_derived_CD4_positive__alpha_beta_T_cell", "T_helper_17_cell"] }, { 
"type": "Monocyte", "alias_ucsc": [], - "alias_ensembl": ["CD14CD16__monocyte_CB", "CD14CD16__monocyte_VB", "Monocytes_CD14", "Monocytes_CD14_PB_Roadmap"] + "alias_ensembl": ["CD14CD16__monocyte_CB", "CD14CD16__monocyte_VB", "Monocytes_CD14", "Monocytes_CD14_PB_Roadmap", "CD14_positive_monocyte"] }, { "type": "Neutrophil", "alias_ucsc": [], - "alias_ensembl": ["neutrophil_CB", "neutrophil_myelocyte_BM", "neutrophil_VB"] + "alias_ensembl": ["neutrophil_CB", "neutrophil_myelocyte_BM", "neutrophil_VB", "neutrophil"] }, { "type": "Eosinophil", @@ -59,7 +69,7 @@ { "type": "Intestine", "alias_ucsc": [], - "alias_ensembl": ["Fetal_Intestine_Large", "Fetal_Intestine_Small", "Small_Intestine"] + "alias_ensembl": ["Fetal_Intestine_Large", "Fetal_Intestine_Small", "Small_Intestine", "sigmoid_colon"] }, { "type": "AdrenalGland", @@ -69,7 +79,7 @@ { "type": "Muscle", "alias_ucsc": ["limb"], - "alias_ensembl": ["Fetal_Muscle_Leg", "Fetal_Muscle_Trunk", "Psoas_Muscle", "HSMM", "HSMMtube"] + "alias_ensembl": ["Fetal_Muscle_Leg", "Fetal_Muscle_Trunk", "Psoas_Muscle", "HSMM", "HSMMtube", "skeletal_muscle_myoblast"] }, { "type": "Gastric", @@ -79,17 +89,17 @@ { "type": "Endothelial", "alias_ucsc": ["blood vessels"], - "alias_ensembl": ["EPC_VB", "HMEC", "HUVEC", "HUVEC_prol_CB", "NHEK"] + "alias_ensembl": ["EPC_VB", "HMEC", "HUVEC", "HUVEC_prol_CB", "NHEK", "endothelial_cell_of_umbilical_vein"] }, { "type": "StemCells", "alias_ucsc": [], - "alias_ensembl": ["H1ESC", "H1_mesenchymal", "H1_neuronal_progenitor", "H1_trophoblast", "H9", "MSC_VB", "iPS_20b", "iPS_DF_6_9", "iPS_DF_19_11"] + "alias_ensembl": ["H1ESC", "HUES48", "HUES6", "HUES64", "H1_hESC", "H1_hESC_ENCSR820QMS", "H9_ENCSR323FKB", "H1_mesenchymal", "H1_neuronal_progenitor", "H1_trophoblast", "H9", "MSC_VB", "iPS_20b", "iPS_15b", "iPS_DF_6_9", "iPS_DF_19_11", "common_myeloid_progenitor__CD34_positive", "common_myeloid_progenitor__CD34_positive_ENCSR337XXD_1", "common_myeloid_progenitor__CD34_positive_ENCSR722JRY"] }, { 
"type": "Lung", "alias_ucsc": [], - "alias_ensembl": ["Lung", "IMR90", "NHLF"] + "alias_ensembl": ["Lung", "IMR90", "NHLF", "lung_ENCSR465WKM"] }, { "type": "Pancreas", @@ -99,7 +109,7 @@ { "type": "Liver", "alias_ucsc": ["liver"], - "alias_ensembl": [] + "alias_ensembl": ["hepatocyte"] }, { "type": "Ovary", @@ -119,7 +129,7 @@ { "type": "Heart", "alias_ucsc": ["heart"], - "alias_ensembl": ["Right_Atrium", "Left_Ventricle"] + "alias_ensembl": ["Right_Atrium", "Left_Ventricle", "cardiac_muscle_cell", "heart_right_ventricle", "heart"] }, { "type": "Osteoblast", @@ -129,7 +139,7 @@ { "type": "Fibroblast", "alias_ucsc": [], - "alias_ensembl": ["NHDF_AD"] + "alias_ensembl": ["NHDF_AD", "fibroblast_of_dermis", "fibroblast_of_lung", "IMR_90"] }, { "type": "NK-Cells", @@ -144,7 +154,7 @@ { "type": "Brain", "alias_ucsc": ["midbrain (mesencephalon)", "trigeminal V (ganglion, cranial)", "forebrain", "neural tube", "hindbrain (rhombencephalon)", "dorsal root ganglion", "cranial nerve"], - "alias_ensembl": ["NH_A"] + "alias_ensembl": ["NH_A", "astrocyte", "bipolar_neuron", "brain", "neural_progenitor_cell", "neural_stem_progenitor_cell", "neuron"] }, { "type": "Mesenchym", @@ -175,5 +185,15 @@ "type": "Melanocytes", "alias_ucsc": ["melanocytes"], "alias_ensembl": [] + }, + { + "type": "Miscelanious", + "alias_ucsc": [], + "alias_ensembl": ["endodermal_cell", "esophagus", "HCT116", "Karpas_422", "keratinocyte", "mammary_epithelial_cell", "MCF_7", "MM_1S", "myotube", "PC_3", "PC_9", "SK_N_SH"] + }, + { + "type": "Kidney", + "alias_ucsc": [], + "alias_ensembl": ["kidney"] } ]