Skip to content
Permalink
ab6f883dd1
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
91 lines (64 sloc) 2.96 KB
import os
import gzip
import csv
class GTFGen:
    """
    Class to generate Ensembl GTF-data with activity.

    On construction the gzip-compressed GFF file of one organism/release is
    read into ``self.gff_lines`` (a list of undecoded ``bytes`` lines).
    ``get_gtf`` then converts every line into a GTF-style row (a list of nine
    strings) whose last field carries the feature ID and an activity summary.
    """

    def __init__(self, organism, release, wd, data_dir):
        """
        Load the GFF lines for *organism* / *release* and set up the code map.

        :param organism: name of the organism sub-directory
        :param release: name of the release sub-directory
        :param wd: working directory, used only when *data_dir* is falsy
        :param data_dir: directory containing ``EnsemblData``; takes
            precedence over *wd* when truthy
        :raises FileNotFoundError: if no ``*gff.gz`` file can be located
        """
        self.gff_lines = self.get_organism_as_gff(organism, release, wd, data_dir)
        # Integer activity codes found in the activity data -> label in the output.
        self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"}

    def get_organism_as_gff(self, organism, release, wd, data_dir):
        """
        Read the original gzip-compressed GFF file for an organism.

        The file is searched in ``<data_dir>/EnsemblData/<release>/<organism>``
        or, when *data_dir* is falsy, in
        ``<wd>/data/EnsemblData/<release>/<organism>``.

        :return: all lines of the file as ``bytes`` (line endings kept)
        :raises FileNotFoundError: if the directory does not exist or holds
            no file whose name ends with ``gff.gz``
        """
        # Base paths are concatenated exactly as before so that existing
        # callers (including an empty wd) resolve to the same location.
        if data_dir:
            directory = os.path.join(data_dir + "/EnsemblData/", release, organism)
        else:
            directory = os.path.join(wd + "/data/EnsemblData/", release, organism)

        # os.listdir returns names in arbitrary order; sorting makes the
        # choice reproducible when several archives are present.
        candidates = sorted(
            name for name in os.listdir(directory) if name.endswith("gff.gz")
        )
        if not candidates:
            # Previously this fell through to gzip.open(""), which reported
            # a missing file named '' without mentioning the directory.
            raise FileNotFoundError(
                "No file ending with 'gff.gz' found in {}".format(directory)
            )

        # Keep the "last match wins" rule of the original loop.
        inputfile = os.path.join(directory, candidates[-1])
        with gzip.open(inputfile) as original_file:
            return original_file.readlines()

    def reformat_to_gff(self, activity, release):
        """
        Reformat the loaded GFF lines to GTF rows and append activity data.

        Every entry of ``self.gff_lines`` must be a tab-separated feature
        line whose last column holds at least five ``;``-separated
        ``key=value`` attributes; any other line (e.g. a header or a blank
        line) raises ``IndexError``.

        :param activity: mapping of a name (cell-type category, per the
            original comment) to a sequence of integer activity codes; the
            sequence is indexed with the position of the line in
            ``self.gff_lines``
        :param release: release string, emitted as ``RegBuild_<release>``
        :return: list of rows, each a list of nine strings:
            ``chr<seqid>``, ``RegBuild_<release>``, lower-cased value of the
            fifth attribute, start, end, ``"."``, ``"+"``, ``"."`` and the
            ID/activity field
        """
        gtf_return = []
        for index, line in enumerate(self.gff_lines):
            columns = line.decode("UTF-8").split("\t")
            attributes = columns[-1].strip().split(";")

            # Value of the attribute at position 4; maxsplit=1 keeps values
            # that themselves contain '=' intact.
            feature = attributes[4].split("=", 1)[1].lower()

            additional = self.generate_additional_information(
                attributes[0],
                self.generate_activity_list(activity, index),
            )

            row = ["chr" + columns[0], "RegBuild_" + release, feature]
            # Start / end are copied unchanged from the original columns.
            row.extend(columns[3:5])
            # Score, strand and frame are fixed placeholders.
            row.extend([".", "+", "."])
            row.append(additional)
            gtf_return.append(row)
        return gtf_return

    @staticmethod
    def generate_additional_information(gene_id, activity):
        """
        Build the final GTF field from the ID attribute and activity entries.

        :param gene_id: first attribute of the GFF line; when it starts with
            ``ID=regulatory_region:`` it is rewritten to ``ID "<id>"``,
            otherwise it is emitted unchanged
        :param activity: iterable of strings, joined with ``", "``
        :return: ``<id part>; activity "<joined activity>"``
        """
        if gene_id.startswith("ID=regulatory_region:"):
            gene_id = 'ID "' + gene_id.split(':')[1] + '"'
        activity_string = 'activity "' + ', '.join(activity) + '"'
        return gene_id + '; ' + activity_string

    def generate_activity_list(self, activity, index):
        """
        Generate the ``<name>><LABEL>`` activity entries for one line.

        :param activity: mapping of name to a sequence of activity codes
        :param index: position of the line whose codes are looked up
        :return: list of strings such as ``"liver>ACTIVE"``, in the
            iteration order of *activity*
        :raises KeyError: if a code is not a key of ``self.value_map``
        """
        return [
            key + ">" + self.value_map[value[index]]
            for key, value in activity.items()
        ]

    def get_gtf(self, release, activity):
        """
        Return the resulting GTF-formatted list.

        Thin wrapper around ``reformat_to_gff`` with the argument order
        ``(release, activity)``.
        """
        return self.reformat_to_gff(activity, release)