Skip to content
Permalink
4f40b11ca2
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
122 lines (86 sloc) 4.45 KB
import os
import gzip
class GTFGen:
    """
    Class to generate Ensembl GTF-data with activity

    Reads the gzipped GFF file of one organism/release from disk and converts
    its lines into GTF-style column lists, attaching an activity string to
    each entry.

    @author: Sebastian Beyvers
    @contact: sebastian.beyvers@med.uni-giessen.de
    """

    def __init__(self, organism, release, wd=".", data_dir=None):
        """
        Load the GFF lines for the given organism and release.

        :param organism: input organism (sub-directory name)
        :param release: used Ensembl release (sub-directory name, string)
        :param wd: working directory (default is "."), this is used if
            data_dir is not specified
        :param data_dir: data directory (if specified this is used)
        :raises FileNotFoundError: if no file ending in "gff.gz" is found
        """
        # Raw (undecoded) lines of the gzipped GFF file
        self.gff_lines = self.get_organism_as_gff(organism, release, wd, data_dir)
        # Map to assign numbers from Activitytable-binary to activity status
        self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"}

    def get_organism_as_gff(self, organism, release, wd=".", data_dir=None):
        """
        Read the original gff file for organism.

        Parameters as described in __init__.

        :return: list of gff lines as bytes (gzip file is opened in binary mode)
        :raises FileNotFoundError: if the directory holds no file ending in
            "gff.gz"
        """
        if data_dir:
            directory = os.path.join(data_dir + "/EnsemblData/", release, organism)
        else:
            directory = os.path.join(wd + "/data/EnsemblData/", release, organism)

        # Sorting makes the choice deterministic when several files match;
        # os.listdir() returns entries in arbitrary order.
        candidates = sorted(name for name in os.listdir(directory) if name.endswith("gff.gz"))
        if not candidates:
            # Fail with a message that names the problem instead of letting
            # gzip.open("") raise an error about an empty path.
            raise FileNotFoundError(
                "No file ending in 'gff.gz' found in {!r}".format(directory)
            )

        inputfile = os.path.join(directory, candidates[-1])
        with gzip.open(inputfile) as original_file:
            return original_file.readlines()

    def reformat_to_gtf(self, activity, release):
        """
        Reformat gff to gtf and append activity-data for every entry.

        Blank lines and lines starting with "#" are skipped. The activity
        lookup still uses the position of the line within self.gff_lines, so
        skipping does not shift the index of the remaining entries.

        :param activity: mapping of a name to a sequence of activity codes
            (keys of self.value_map), indexed by line position
        :param release: current ensembl release (string)
        :return: list of gtf entries, each a list of nine column strings
        """
        gtf_return = []
        source = "RegBuild_" + release
        for index, line in enumerate(self.gff_lines):
            decoded_line = line.decode("UTF-8")
            content = decoded_line.strip()
            if not content or content.startswith("#"):
                continue

            # Split line by tab, and the last field (attributes) by ";"
            splitted = decoded_line.split("\t")
            splitted_additional = splitted[-1].strip().split(";")

            # NOTE(review): the feature name is taken from the fifth attribute
            # by position; this presumably relies on a fixed attribute order
            # in the input files - confirm against the data.
            feature = splitted_additional[4].split("=")[1].lower().replace(' ', '_')

            templist = [
                "chr" + splitted[0],  # chromosome name, format = chr+Name
                source,               # RegBuild_ + release
                feature,
            ]
            # Start / end columns from the original line
            templist.extend(splitted[3:5])
            # Score, strand and frame are fixed values
            templist.extend([".", "+", "."])
            # Attribute column: gene id plus activity information
            templist.append(self.generate_additional_information(
                splitted_additional[0],
                self.generate_activity_list(activity, index)))
            gtf_return.append(templist)
        return gtf_return

    @staticmethod
    def generate_additional_information(gene_id, activity):
        """
        Concat activity information to a string and reformat the id from gff
        to gtf-style.

        :param gene_id: first gff attribute, e.g. "ID=regulatory_region:X",
            "ID=EX" or "ID=<type>:X"
        :param activity: list of activity strings for the entry
        :return: string for attributes (column 9) in gtf-format
        """
        if gene_id.startswith("ID=regulatory_region:"):
            gene_id = 'gene_id "' + gene_id.split(':')[1] + '"'
        elif gene_id.startswith("ID=E"):
            gene_id = 'gene_id "' + gene_id.split('=')[1] + '"'
        elif gene_id.startswith("ID="):
            # Any other "ID=<type>:<id>" (or plain "ID=<id>") would otherwise
            # be emitted in gff style, which is not a valid gtf attribute.
            gene_id = 'gene_id "' + gene_id[len("ID="):].split(':')[-1] + '"'
        activity_string = 'activity "' + ', '.join(activity) + '"'
        return gene_id + '; ' + activity_string + ';'

    def generate_activity_list(self, activity, index):
        """
        Generate the activity list for a specified index.

        :param activity: mapping of a name to a sequence of activity codes
        :param index: position of the entry within each sequence
        :return: list of "<name>><STATUS>" strings, one per key of activity
        :raises KeyError: if a code is not a key of self.value_map
        """
        return [
            key + ">" + self.value_map[value[index]]
            for key, value in activity.items()
        ]

    def get_gtf(self, release, activity):
        """
        Getter function for the resulting gtf-formatted list.

        :param release: as in reformat_to_gtf()
        :param activity: as in reformat_to_gtf()
        :return: list of GTF entries
        """
        return self.reformat_to_gtf(activity, release)