Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
master_project_JLU2018/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
122 lines (86 sloc)
4.45 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gzip
import os
class GTFGen:
    """
    Class to generate Ensembl GTF-data with activity

    The original (gzip-compressed) GFF file of an organism is read once on
    construction; get_gtf() / reformat_to_gtf() turn its lines into
    GTF-style column lists and append the activity status per entry.

    @author: Sebastian Beyvers
    @contact: sebastian.beyvers@med.uni-giessen.de
    """

    def __init__(self, organism, release, wd, data_dir):
        """
        Constructor for GTFGen.

        :param organism: input organism (name of the sub-directory to read)
        :param release: used Ensembl release (string, also a directory name)
        :param wd: working directory, only used if data_dir is not specified
        :param data_dir: data directory (if specified this is used)
        :raises FileNotFoundError: if no "*gff.gz" file exists for the organism
        """
        # Map to assign numbers from Activitytable-binary to activity status
        self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"}
        self.gff_lines = self.get_organism_as_gff(organism, release, wd, data_dir)

    def get_organism_as_gff(self, organism, release, wd, data_dir):
        """
        Read the original gff file for an organism.

        The file is searched in <data_dir>/EnsemblData/<release>/<organism>
        or, if data_dir is empty, in <wd>/data/EnsemblData/<release>/<organism>.

        :param organism: input organism
        :param release: used Ensembl release
        :param wd: working directory (fallback if data_dir is not specified)
        :param data_dir: data directory
        :return: list of gff-entries as undecoded byte strings, one per line
        :raises FileNotFoundError: if the directory holds no "*gff.gz" file
        """
        if data_dir:
            base_dir = os.path.join(data_dir, "EnsemblData")
        else:
            base_dir = os.path.join(wd, "data", "EnsemblData")
        directory = os.path.join(base_dir, release, organism)

        # os.listdir() returns entries in arbitrary order; sorting makes the
        # choice deterministic if more than one archive is present.
        candidates = sorted(
            name for name in os.listdir(directory) if name.endswith("gff.gz")
        )
        if not candidates:
            # Fail with the searched location instead of letting gzip try to
            # open an empty path.
            raise FileNotFoundError(
                'No "*gff.gz" file found in directory: ' + directory
            )

        inputfile = os.path.join(directory, candidates[-1])
        with gzip.open(inputfile, "rb") as original_file:
            return original_file.readlines()

    def reformat_to_gtf(self, activity, release):
        """
        Reformat gff to gtf and append activity-data for the celltype-categories.

        :param activity: mapping of category name -> sequence of activity
                         codes, indexed by the position of the gff line
        :param release: current ensembl release (string)
        :return: list of gtf-formatted entries, each a list of nine columns
        """
        source = "RegBuild_" + release
        gtf_return = []
        for index, line in enumerate(self.gff_lines):
            # Split line by tab, and the last field (attributes) by ";"
            columns = line.decode("UTF-8").split("\t")
            attributes = columns[-1].strip().split(";")

            # Feature name is taken from the value of the fifth attribute,
            # lower-cased and with blanks replaced by underscores.
            # NOTE(review): relies on a fixed attribute order in the gff - confirm.
            feature = attributes[4].split("=")[1].lower().replace(" ", "_")

            attribute_column = self.generate_additional_information(
                attributes[0], self.generate_activity_list(activity, index)
            )

            gtf_return.append(
                ["chr" + columns[0], source, feature]  # chromosome, source, feature
                + columns[3:5]                         # start / end from original
                + [".", "+", "."]                      # score, strand, frame
                + [attribute_column]                   # gtf attributes (column 9)
            )
        return gtf_return

    @staticmethod
    def generate_additional_information(gene_id, activity):
        """
        Concat activity information to a string and reformat from gff to gtf-style.

        Accepts 'ID=<identifier>' as well as 'ID=<type>:<identifier>' (for
        example 'ID=regulatory_region:...'); in both cases only the identifier
        is kept. A value that does not start with 'ID=' is passed through
        unchanged.

        :param gene_id: gene_id formatted in gff format
        :param activity: list of activity-data strings for the specified gene
        :return: string for attributes (column 9) in gtf-format
        """
        id_prefix = "ID="
        if gene_id.startswith(id_prefix):
            value = gene_id[len(id_prefix):]
            _, separator, qualified = value.partition(":")
            identifier = qualified if separator else value
            gene_id = 'gene_id "' + identifier + '"'
        activity_string = 'activity "' + ", ".join(activity) + '"'
        return gene_id + "; " + activity_string + ";"

    def generate_activity_list(self, activity, index):
        """
        Generate the activity list for a specified index.

        :param activity: mapping of category name -> sequence of activity codes
        :param index: index for a specified gene
        :return: list of '<category>><STATUS>' strings for the gene at index
        :raises KeyError: if an activity code is not part of self.value_map
        """
        return [
            key + ">" + self.value_map[value[index]]
            for key, value in activity.items()
        ]

    def get_gtf(self, release, activity):
        """
        Getter function for the resulting gtf-formatted-list.

        :param release: as in reformat_to_gtf()
        :param activity: as in reformat_to_gtf()
        :return: list of GTF-entries
        """
        return self.reformat_to_gtf(activity, release)