import os
import gzip
import csv


class GTFGen:
    """
    Class to generate Ensembl GTF-data with activity.

    On construction the gzipped GFF file of the requested organism/release is
    read into ``self.gff_lines`` (a list of raw ``bytes`` lines). ``get_gtf``
    then turns every line into a GTF-style row carrying an activity
    annotation per cell-type category.
    """

    def __init__(self, organism, release, wd, data_dir):
        """
        Load the GFF lines for ``organism``/``release`` and set up the
        activity-code lookup table.

        :param organism: organism sub-directory name below the release folder
        :param release: release sub-directory name below ``EnsemblData``
        :param wd: working directory; used as ``<wd>/data/EnsemblData`` when
            ``data_dir`` is falsy
        :param data_dir: optional data directory; used as
            ``<data_dir>/EnsemblData`` when truthy
        :raises FileNotFoundError: if the directory or the GFF archive is
            missing
        """
        self.gff_lines = self.get_organism_as_gff(organism, release, wd, data_dir)
        # Maps the numeric activity codes found in the activity lists to the
        # labels written into the output.
        self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"}

    def get_organism_as_gff(self, organism, release, wd, data_dir):
        """
        Read the original gzipped GFF file for an organism.

        The file is looked up in ``<data_dir>/EnsemblData/<release>/<organism>``
        or, when ``data_dir`` is falsy, in
        ``<wd>/data/EnsemblData/<release>/<organism>``. If several entries end
        in ``gff.gz`` the last one returned by ``os.listdir`` is used.

        :return: list of undecoded (``bytes``) lines of the GFF file
        :raises FileNotFoundError: if the directory does not exist or holds no
            entry ending in ``gff.gz``
        """
        if data_dir:
            directory = os.path.join(data_dir + "/EnsemblData/", release, organism)
        else:
            directory = os.path.join(wd + "/data/EnsemblData/", release, organism)

        inputfile = ""
        for entry in os.listdir(directory):
            if entry.endswith("gff.gz"):
                inputfile = os.path.join(directory, entry)

        if not inputfile:
            # Previously this fell through to gzip.open(""), which raised a
            # FileNotFoundError for the empty path without naming the
            # directory. Keep the exception type and errno, improve the text.
            import errno
            raise FileNotFoundError(
                errno.ENOENT,
                "No file ending in 'gff.gz' found in directory",
                directory,
            )

        with gzip.open(inputfile) as original_file:
            return original_file.readlines()

    def reformat_to_gff(self, activity, release):
        """
        Reformat the loaded GFF lines to GTF-style rows and append the
        activity data for the configured cell-type categories.

        Each returned row is a list of strings:
        ``["chr" + <column 0>, "RegBuild_" + release, <feature>,
        <column 3>, <column 4>, ".", "+", ".", <attributes>]``
        where ``<feature>`` is the lower-cased value of the fifth
        ``;``-separated attribute of the last column and ``<attributes>`` is
        built by ``generate_additional_information``.

        :param activity: mapping of category name to a sequence of activity
            codes; the code at position ``i`` is used for GFF line ``i``
        :param release: release label appended to ``"RegBuild_"``
        :return: list of rows, one per GFF line
        :raises IndexError: if a line has fewer than five ``;``-separated
            attributes or an attribute without ``=``
        """
        source_label = "RegBuild_" + release
        gtf_return = []
        for index, line in enumerate(self.gff_lines):
            columns = line.decode("UTF-8").split("\t")
            attributes = columns[-1].strip().split(";")

            # NOTE(review): the original comment calls this the "Description",
            # but the code takes the attribute at index 4 -- confirm which
            # attribute sits there in the input files.
            feature = attributes[4].split("=")[1].lower()

            additional = self.generate_additional_information(
                attributes[0], self.generate_activity_list(activity, index)
            )

            # Chromosome, source, feature, start/end from the original line,
            # then fixed score/strand/frame placeholders and the attributes.
            row = ["chr" + columns[0], source_label, feature]
            row += columns[3:5]
            row += [".", "+", ".", additional]
            gtf_return.append(row)
        return gtf_return

    @staticmethod
    def generate_additional_information(gene_id, activity):
        """
        Helper method to concat id and activity information to one string.

        A ``gene_id`` starting with ``ID=regulatory_region:`` is rewritten to
        ``ID "<identifier>"``; any other value is passed through unchanged.

        :param gene_id: first attribute of the GFF line
        :param activity: list of ``"<category>><label>"`` strings
        :return: ``'<id>; activity "<entry>, <entry>, ..."'``
        """
        if gene_id.startswith("ID=regulatory_region:"):
            gene_id = 'ID "' + gene_id.split(':')[1] + '"'
        activity_string = 'activity "' + ', '.join(activity) + '"'
        return gene_id + '; ' + activity_string

    def generate_activity_list(self, activity, index):
        """
        Generate the activity list for one GFF line.

        :param activity: mapping of category name to a sequence of activity
            codes
        :param index: position of the GFF line whose codes are looked up
        :return: list of ``"<category>><label>"`` strings in the iteration
            order of ``activity``
        :raises KeyError: if a code is not a key of ``self.value_map``
        """
        return [
            key + ">" + self.value_map[value[index]]
            for key, value in activity.items()
        ]

    def get_gtf(self, release, activity):
        """
        Return the resulting gtf-formatted list.

        Thin wrapper around ``reformat_to_gff``; note the swapped argument
        order.
        """
        return self.reformat_to_gff(activity, release)