diff --git a/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py b/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py index 93b3558..78f1826 100644 --- a/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py @@ -120,20 +120,22 @@ def activity_comparator(self, aliaslist): # return_value: Array of Activitystatus by category (type in config) concatenated_array = bytearray([]) - length = len(self.activity[aliaslist[0]]) input_arrays = [self.activity[index] for index in aliaslist] - for x in range(length): - if any(y[x] == 0 for y in input_arrays): - concatenated_array.append(0) - elif any(y[x] == 1 for y in input_arrays): - concatenated_array.append(1) - elif any(y[x] == 2 for y in input_arrays): - concatenated_array.append(2) - elif any(y[x] == 3 for y in input_arrays): - concatenated_array.append(3) - elif any(y[x] == 4 for y in input_arrays): - concatenated_array.append(4) + try: + for x in range(length): + if any(y[x] == 0 for y in input_arrays): + concatenated_array.append(0) + elif any(y[x] == 1 for y in input_arrays): + concatenated_array.append(1) + elif any(y[x] == 2 for y in input_arrays): + concatenated_array.append(2) + elif any(y[x] == 3 for y in input_arrays): + concatenated_array.append(3) + elif any(y[x] == 4 for y in input_arrays): + concatenated_array.append(4) + except IndexError: + print("Indexerror occured") return concatenated_array diff --git a/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py b/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py index eaf88f6..cad1451 100644 --- a/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py @@ -89,9 +89,9 @@ def generate_additional_information(gene_id, activity): # activity = List of activity-data for specified gene # return_value: String for attributes (column 9) in gtf-format - if gene_id.startswith("ID=regulatory_region:"): + if not gene_id.startswith("ID=E"): gene_id = 'gene_id "'+gene_id.split(':')[1]+'"' - elif gene_id.startswith("ID=E"): + else: gene_id = 'gene_id "'+gene_id.split('=')[1]+'"' activity_string = 'activity "'+', '.join(activity)+'"' @@ -107,7 +107,10 @@ def generate_activity_list(self, activity, index): activity_list = [] for key, value in activity.items(): - activity_list.append(key+">"+self.value_map[value[index]]) + try: + activity_list.append(key+">"+self.value_map[value[index]]) + except IndexError: + pass return activity_list def get_gtf(self, release, activity): diff --git a/bin/3.1_create_gtf/RegGTFExtractor.py b/bin/3.1_create_gtf/RegGTFExtractor.py old mode 100644 new mode 100755 diff --git a/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json b/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json index 0f02cde..d10135b 100644 --- a/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json +++ b/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json @@ -2,7 +2,7 @@ { "type": "A549", "alias_ucsc": [], - "alias_ensembl": ["A549"] + "alias_ensembl": ["A549", "A673"] }, { "type": "Aorta", @@ -17,22 +17,32 @@ { "type": "B-Cells", "alias_ucsc": [], - "alias_ensembl": ["B_cells_PB_Roadmap", "naive_B_cell_VB", "GM12878"] + "alias_ensembl": ["B_cells_PB_Roadmap", "naive_B_cell_VB", "GM12878", "B_cell_ENCSR682AXR", "CD38__naive_B_cell_VB", "naive_B_cell_To"] }, { "type": "T-Cell", "alias_ucsc": [], - "alias_ensembl": ["CD4_ab_T_cell_VB", "CM_CD4_ab_T_cell_VB", "CD8_ab_T_cell_CB", "T_cells_PB_Roadmap"] + "alias_ensembl": ["CD4_ab_T_cell_VB", + "CM_CD4_ab_T_cell_VB", + "CD8_ab_T_cell_CB", + "T_cells_PB_Roadmap", + "CD4_ab_T_cell_CB", + "CD4_positive__alpha_beta_memory_T_cell", + "CD4_positive__alpha_beta_T_cell", + "CD4_positive__alpha_beta_T_cell_ENCSR948ZKZ", + "CD4_positive__CD25_positive__alpha_beta_regulatory_T_cell", + "effector_memory_CD4_positive__alpha_beta_T_cell", "EM_CD8_ab_T_cell_VB", + "naive_thymus_derived_CD4_positive__alpha_beta_T_cell", "T_helper_17_cell"] }, { "type": "Monocyte", "alias_ucsc": [], - "alias_ensembl": ["CD14CD16__monocyte_CB", "CD14CD16__monocyte_VB", "Monocytes_CD14", "Monocytes_CD14_PB_Roadmap"] + "alias_ensembl": ["CD14CD16__monocyte_CB", "CD14CD16__monocyte_VB", "Monocytes_CD14", "Monocytes_CD14_PB_Roadmap", "CD14_positive_monocyte"] }, { "type": "Neutrophil", "alias_ucsc": [], - "alias_ensembl": ["neutrophil_CB", "neutrophil_myelocyte_BM", "neutrophil_VB"] + "alias_ensembl": ["neutrophil_CB", "neutrophil_myelocyte_BM", "neutrophil_VB", "neutrophil"] }, { "type": "Eosinophil", @@ -59,7 +69,7 @@ { "type": "Intestine", "alias_ucsc": [], - "alias_ensembl": ["Fetal_Intestine_Large", "Fetal_Intestine_Small", "Small_Intestine"] + "alias_ensembl": ["Fetal_Intestine_Large", "Fetal_Intestine_Small", "Small_Intestine", "sigmoid_colon"] }, { "type": "AdrenalGland", @@ -69,7 +79,7 @@ { "type": "Muscle", "alias_ucsc": ["limb"], - "alias_ensembl": ["Fetal_Muscle_Leg", "Fetal_Muscle_Trunk", "Psoas_Muscle", "HSMM", "HSMMtube"] + "alias_ensembl": ["Fetal_Muscle_Leg", "Fetal_Muscle_Trunk", "Psoas_Muscle", "HSMM", "HSMMtube", "skeletal_muscle_myoblast"] }, { "type": "Gastric", @@ -79,17 +89,17 @@ { "type": "Endothelial", "alias_ucsc": ["blood vessels"], - "alias_ensembl": ["EPC_VB", "HMEC", "HUVEC", "HUVEC_prol_CB", "NHEK"] + "alias_ensembl": ["EPC_VB", "HMEC", "HUVEC", "HUVEC_prol_CB", "NHEK", "endothelial_cell_of_umbilical_vein"] }, { "type": "StemCells", "alias_ucsc": [], - "alias_ensembl": ["H1ESC", "H1_mesenchymal", "H1_neuronal_progenitor", "H1_trophoblast", "H9", "MSC_VB", "iPS_20b", "iPS_DF_6_9", "iPS_DF_19_11"] + "alias_ensembl": ["H1ESC", "HUES48", "HUES6", "HUES64", "H1_hESC", "H1_hESC_ENCSR820QMS", "H9_ENCSR323FKB", "H1_mesenchymal", "H1_neuronal_progenitor", "H1_trophoblast", "H9", "MSC_VB", "iPS_20b", "iPS_15b", "iPS_DF_6_9", "iPS_DF_19_11", "common_myeloid_progenitor__CD34_positive", "common_myeloid_progenitor__CD34_positive_ENCSR337XXD_1", "common_myeloid_progenitor__CD34_positive_ENCSR722JRY"] }, { "type": "Lung", "alias_ucsc": [], - "alias_ensembl": ["Lung", "IMR90", "NHLF"] + "alias_ensembl": ["Lung", "IMR90", "NHLF", "lung_ENCSR465WKM"] }, { "type": "Pancreas", @@ -99,7 +109,7 @@ { "type": "Liver", "alias_ucsc": ["liver"], - "alias_ensembl": [] + "alias_ensembl": ["hepatocyte"] }, { "type": "Ovary", @@ -119,7 +129,7 @@ { "type": "Heart", "alias_ucsc": ["heart"], - "alias_ensembl": ["Right_Atrium", "Left_Ventricle"] + "alias_ensembl": ["Right_Atrium", "Left_Ventricle", "cardiac_muscle_cell", "heart_right_ventricle"] }, { "type": "Osteoblast", @@ -129,7 +139,7 @@ { "type": "Fibroblast", "alias_ucsc": [], - "alias_ensembl": ["NHDF_AD"] + "alias_ensembl": ["NHDF_AD", "fibroblast_of_dermis", "fibroblast_of_lung", "IMR_90"] }, { "type": "NK-Cells", @@ -144,7 +154,7 @@ { "type": "Brain", "alias_ucsc": ["midbrain (mesencephalon)", "trigeminal V (ganglion, cranial)", "forebrain", "neural tube", "hindbrain (rhombencephalon)", "dorsal root ganglion", "cranial nerve"], - "alias_ensembl": ["NH_A"] + "alias_ensembl": ["NH_A", "astrocyte", "bipolar_neuron", "brain", "neural_progenitor_cell", "neural_stem_progenitor_cell", "neuron"] }, { "type": "Mesenchym", @@ -175,5 +185,15 @@ "type": "Melanocytes", "alias_ucsc": ["melanocytes"], "alias_ensembl": [] + }, + { + "type": "Miscelanious", + "alias_ucsc": [], + "alias_ensembl": ["endodermal_cell", "esophagus", "HCT116", "Karpas_422", "keratinocyte", "mammary_epithelial_cell", "MCF_7", "MM_1S", "myotube", "PC_3", "PC_9", "SK_N_SH"] + }, + { + "type": "Kidney", + "alias_ucsc": [], + "alias_ensembl": ["kidney"] } ]