Skip to content

Gtf creation #56

Merged
merged 4 commits into from
Jan 11, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ export PATH=[meme-suite instalation path]/bin:$PATH
Every other dependency will be automatically installed by Nextflow using conda. For this, a new conda environment will be created, which can be found in the work directory created by Nextflow after the first pipeline run.
It is **not** required to create and activate the environment from the yaml-file beforehand.

**Important Note:** For conda the channel bioconda needs to be set as highest priority! This is required due to two differnt packages with the same name in different channels. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfisch package from the channel conda-forge!
**Important Note:** For conda the channel bioconda needs to be set as highest priority! This is required due to two different packages with the same name in different channels. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfish package from the channel conda-forge!


## Quick Start
Expand Down Expand Up @@ -94,7 +94,7 @@ All arguments can be set in the configuration files
For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki).

## Known issues
The Nextflow-script needs a conda enviroment to run. Nextflow creates the needed enviroment from the given yaml-file.
The Nextflow-script needs a conda environment to run. Nextflow creates the needed environment from the given yaml-file.
On some systems Nextflow exits the run with the following error:
```
Caused by:
Expand All @@ -103,11 +103,11 @@ Caused by:
status : 143
message:
```
If this error occurs you have to create the enviroment before starting the pipeline.
To create this enviroment you need the yml-file from the repository.
Run the following commands to create the enviroment:
If this error occurs you have to create the environment before starting the pipeline.
To create this environment you need the yml-file from the repository.
Run the following commands to create the environment:
```console
path=[Path to given masterenv.yml file]
conda env create --name masterenv -f $path
```
When the enviroment is created, set the variable 'path_env' in the configuration file as the path to it.
When the environment is created, set the variable 'path_env' in the configuration file as the path to it.
24 changes: 14 additions & 10 deletions bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,20 +120,24 @@ def activity_comparator(self, aliaslist):
# return_value: Array of Activitystatus by category (type in config)

concatenated_array = bytearray([])

length = len(self.activity[aliaslist[0]])
input_arrays = [self.activity[index] for index in aliaslist]
for x in range(length):
if any(y[x] == 0 for y in input_arrays):
concatenated_array.append(0)
elif any(y[x] == 1 for y in input_arrays):
concatenated_array.append(1)
elif any(y[x] == 2 for y in input_arrays):
concatenated_array.append(2)
elif any(y[x] == 3 for y in input_arrays):
concatenated_array.append(3)
elif any(y[x] == 4 for y in input_arrays):
# This try-catch block is needed because of inconsistency in file-lengths in Ensembl-release-95
try:
if any(y[x] == 0 for y in input_arrays):
concatenated_array.append(0)
elif any(y[x] == 1 for y in input_arrays):
concatenated_array.append(1)
elif any(y[x] == 2 for y in input_arrays):
concatenated_array.append(2)
elif any(y[x] == 3 for y in input_arrays):
concatenated_array.append(3)
elif any(y[x] == 4 for y in input_arrays):
concatenated_array.append(4)
except IndexError:
concatenated_array.append(4)

return concatenated_array


Expand Down
12 changes: 9 additions & 3 deletions bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ def generate_additional_information(gene_id, activity):
# activity = List of activity-data for specified gene
# return_value: String for attributes (column 9) in gtf-format

if gene_id.startswith("ID=regulatory_region:"):
if not gene_id.startswith("ID=E"):
gene_id = 'gene_id "'+gene_id.split(':')[1]+'"'
elif gene_id.startswith("ID=E"):
else:
gene_id = 'gene_id "'+gene_id.split('=')[1]+'"'

activity_string = 'activity "'+', '.join(activity)+'"'
Expand All @@ -107,7 +107,13 @@ def generate_activity_list(self, activity, index):

activity_list = []
for key, value in activity.items():
activity_list.append(key+">"+self.value_map[value[index]])
# if no index is found the key will be annotated as "NA"
# this is needed due to an inconsistency in ensembl-release-95
# some activity-files contain less entries than others
try:
activity_list.append(key+">"+self.value_map[value[index]])
except IndexError:
activity_list.append(key+">NA")
return activity_list

def get_gtf(self, release, activity):
Expand Down
14 changes: 13 additions & 1 deletion bin/3.1_create_gtf/Modules/Validator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import os


class Validator:

"""
Class to validate the gtf-output-file.
Class to validate and sort the gtf-output-file.
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de

Expand All @@ -13,8 +16,17 @@ def __init__(self, out_file):
# input_parameter: out_file = path to output file

self.out_file = out_file
self.sort_file()
self.test_read_file()

def sort_file(self):
    """Sort the gtf output file in place by chromosome and start coordinate.

    Runs the external ``sort`` utility on ``self.out_file`` with the same
    options as before: ``-V`` (version ordering), column 1 as first key,
    column 4 as numeric second key, and ``-o`` to write the result back to
    the same file. Sorted output improves handling in tools like IGV.

    The command is passed as an argument list instead of a concatenated
    shell string, so a path containing spaces or shell metacharacters is
    handed to ``sort`` literally and is never interpreted by a shell.

    This stays best-effort: a non-zero exit status of ``sort`` is not
    raised, and a missing ``sort`` executable is only reported on stderr.
    """
    # Function-scope imports keep the module's import block untouched.
    import subprocess
    import sys

    command = [
        "sort", "-V", "-k1,1", "-k4,4n",
        "-o", self.out_file,
        # "--" ends option parsing so a path starting with "-" is still
        # treated as the input file.
        "--", self.out_file,
    ]
    try:
        subprocess.run(command, check=False)
    except OSError as err:
        # e.g. no "sort" executable on PATH; the file is left unsorted.
        print("sort_file: could not run sort: {}".format(err), file=sys.stderr)

def test_read_file(self):

# Method to test the output file-format
Expand Down
Empty file modified bin/3.1_create_gtf/RegGTFExtractor.py
100644 → 100755
Empty file.
48 changes: 34 additions & 14 deletions bin/3.1_create_gtf/config/celltypes_homo_sapiens.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
{
"type": "A549",
"alias_ucsc": [],
"alias_ensembl": ["A549"]
"alias_ensembl": ["A549", "A673"]
},
{
"type": "Aorta",
Expand All @@ -17,22 +17,32 @@
{
"type": "B-Cells",
"alias_ucsc": [],
"alias_ensembl": ["B_cells_PB_Roadmap", "naive_B_cell_VB", "GM12878"]
"alias_ensembl": ["B_cells_PB_Roadmap", "naive_B_cell_VB", "GM12878", "B_cell_ENCSR682AXR", "CD38__naive_B_cell_VB", "naive_B_cell_To"]
},
{
"type": "T-Cell",
"alias_ucsc": [],
"alias_ensembl": ["CD4_ab_T_cell_VB", "CM_CD4_ab_T_cell_VB", "CD8_ab_T_cell_CB", "T_cells_PB_Roadmap"]
"alias_ensembl": ["CD4_ab_T_cell_VB",
"CM_CD4_ab_T_cell_VB",
"CD8_ab_T_cell_CB",
"T_cells_PB_Roadmap",
"CD4_ab_T_cell_CB",
"CD4_positive__alpha_beta_memory_T_cell",
"CD4_positive__alpha_beta_T_cell",
"CD4_positive__alpha_beta_T_cell_ENCSR948ZKZ",
"CD4_positive__CD25_positive__alpha_beta_regulatory_T_cell",
"effector_memory_CD4_positive__alpha_beta_T_cell", "EM_CD8_ab_T_cell_VB",
"naive_thymus_derived_CD4_positive__alpha_beta_T_cell", "T_helper_17_cell"]
},
{
"type": "Monocyte",
"alias_ucsc": [],
"alias_ensembl": ["CD14CD16__monocyte_CB", "CD14CD16__monocyte_VB", "Monocytes_CD14", "Monocytes_CD14_PB_Roadmap"]
"alias_ensembl": ["CD14CD16__monocyte_CB", "CD14CD16__monocyte_VB", "Monocytes_CD14", "Monocytes_CD14_PB_Roadmap", "CD14_positive_monocyte"]
},
{
"type": "Neutrophil",
"alias_ucsc": [],
"alias_ensembl": ["neutrophil_CB", "neutrophil_myelocyte_BM", "neutrophil_VB"]
"alias_ensembl": ["neutrophil_CB", "neutrophil_myelocyte_BM", "neutrophil_VB", "neutrophil"]
},
{
"type": "Eosinophil",
Expand All @@ -59,7 +69,7 @@
{
"type": "Intestine",
"alias_ucsc": [],
"alias_ensembl": ["Fetal_Intestine_Large", "Fetal_Intestine_Small", "Small_Intestine"]
"alias_ensembl": ["Fetal_Intestine_Large", "Fetal_Intestine_Small", "Small_Intestine", "sigmoid_colon"]
},
{
"type": "AdrenalGland",
Expand All @@ -69,7 +79,7 @@
{
"type": "Muscle",
"alias_ucsc": ["limb"],
"alias_ensembl": ["Fetal_Muscle_Leg", "Fetal_Muscle_Trunk", "Psoas_Muscle", "HSMM", "HSMMtube"]
"alias_ensembl": ["Fetal_Muscle_Leg", "Fetal_Muscle_Trunk", "Psoas_Muscle", "HSMM", "HSMMtube", "skeletal_muscle_myoblast"]
},
{
"type": "Gastric",
Expand All @@ -79,17 +89,17 @@
{
"type": "Endothelial",
"alias_ucsc": ["blood vessels"],
"alias_ensembl": ["EPC_VB", "HMEC", "HUVEC", "HUVEC_prol_CB", "NHEK"]
"alias_ensembl": ["EPC_VB", "HMEC", "HUVEC", "HUVEC_prol_CB", "NHEK", "endothelial_cell_of_umbilical_vein"]
},
{
"type": "StemCells",
"alias_ucsc": [],
"alias_ensembl": ["H1ESC", "H1_mesenchymal", "H1_neuronal_progenitor", "H1_trophoblast", "H9", "MSC_VB", "iPS_20b", "iPS_DF_6_9", "iPS_DF_19_11"]
"alias_ensembl": ["H1ESC", "HUES48", "HUES6", "HUES64", "H1_hESC", "H1_hESC_ENCSR820QMS", "H9_ENCSR323FKB", "H1_mesenchymal", "H1_neuronal_progenitor", "H1_trophoblast", "H9", "MSC_VB", "iPS_20b", "iPS_15b", "iPS_DF_6_9", "iPS_DF_19_11", "common_myeloid_progenitor__CD34_positive", "common_myeloid_progenitor__CD34_positive_ENCSR337XXD_1", "common_myeloid_progenitor__CD34_positive_ENCSR722JRY"]
},
{
"type": "Lung",
"alias_ucsc": [],
"alias_ensembl": ["Lung", "IMR90", "NHLF"]
"alias_ensembl": ["Lung", "IMR90", "NHLF", "lung_ENCSR465WKM"]
},
{
"type": "Pancreas",
Expand All @@ -99,7 +109,7 @@
{
"type": "Liver",
"alias_ucsc": ["liver"],
"alias_ensembl": []
"alias_ensembl": ["hepatocyte"]
},
{
"type": "Ovary",
Expand All @@ -119,7 +129,7 @@
{
"type": "Heart",
"alias_ucsc": ["heart"],
"alias_ensembl": ["Right_Atrium", "Left_Ventricle"]
"alias_ensembl": ["Right_Atrium", "Left_Ventricle", "cardiac_muscle_cell", "heart_right_ventricle", "heart"]
},
{
"type": "Osteoblast",
Expand All @@ -129,7 +139,7 @@
{
"type": "Fibroblast",
"alias_ucsc": [],
"alias_ensembl": ["NHDF_AD"]
"alias_ensembl": ["NHDF_AD", "fibroblast_of_dermis", "fibroblast_of_lung", "IMR_90"]
},
{
"type": "NK-Cells",
Expand All @@ -144,7 +154,7 @@
{
"type": "Brain",
"alias_ucsc": ["midbrain (mesencephalon)", "trigeminal V (ganglion, cranial)", "forebrain", "neural tube", "hindbrain (rhombencephalon)", "dorsal root ganglion", "cranial nerve"],
"alias_ensembl": ["NH_A"]
"alias_ensembl": ["NH_A", "astrocyte", "bipolar_neuron", "brain", "neural_progenitor_cell", "neural_stem_progenitor_cell", "neuron"]
},
{
"type": "Mesenchym",
Expand Down Expand Up @@ -175,5 +185,15 @@
"type": "Melanocytes",
"alias_ucsc": ["melanocytes"],
"alias_ensembl": []
},
{
"type": "Miscelanious",
"alias_ucsc": [],
"alias_ensembl": ["endodermal_cell", "esophagus", "HCT116", "Karpas_422", "keratinocyte", "mammary_epithelial_cell", "MCF_7", "MM_1S", "myotube", "PC_3", "PC_9", "SK_N_SH"]
},
{
"type": "Kidney",
"alias_ucsc": [],
"alias_ensembl": ["kidney"]
}
]