Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
master_project_JLU2018/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
91 lines (64 sloc)
2.96 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import gzip
import os
class GTFGen:
    """Build GTF-style records with activity annotations from an Ensembl GFF file.

    On construction the gzip-compressed GFF file for the requested organism
    and release is read into memory (``self.gff_lines``, a list of ``bytes``
    lines). ``get_gtf`` then turns every line into a 9-element list laid out
    like a GTF row, whose last field carries the region ID and the per-key
    activity states.
    """

    def __init__(self, organism, release, wd, data_dir):
        """Load the GFF lines for *organism* / *release*.

        Args:
            organism: Name of the organism sub-directory.
            release: Name of the release sub-directory (a string).
            wd: Working directory; used as ``<wd>/data/EnsemblData`` when
                *data_dir* is empty or ``None``.
            data_dir: Optional data directory; used as
                ``<data_dir>/EnsemblData`` when truthy.

        Raises:
            FileNotFoundError: If the directory does not exist or contains no
                file ending in ``gff.gz``.
        """
        # Numeric activity code -> label written into the output.
        # Assigned before the file is read so it never depends on the I/O.
        self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"}
        self.gff_lines = self.get_organism_as_gff(organism, release, wd, data_dir)

    def get_organism_as_gff(self, organism, release, wd, data_dir):
        """Read the original gzip-compressed GFF file for *organism*.

        The file is searched in ``<data_dir>/EnsemblData/<release>/<organism>``
        when *data_dir* is truthy, otherwise in
        ``<wd>/data/EnsemblData/<release>/<organism>``.

        Returns:
            list[bytes]: The raw (undecoded) lines of the GFF file.

        Raises:
            FileNotFoundError: If no file ending in ``gff.gz`` is present.
        """
        if data_dir:
            directory = os.path.join(data_dir, "EnsemblData", release, organism)
        else:
            directory = os.path.join(wd, "data", "EnsemblData", release, organism)

        # os.listdir() order is arbitrary; sort so that the file chosen when
        # several match is the same on every run and every platform.
        candidates = sorted(
            name for name in os.listdir(directory) if name.endswith("gff.gz")
        )
        if not candidates:
            # Fail with a message that names the directory instead of letting
            # gzip.open("") raise an error about an empty file name.
            raise FileNotFoundError(
                "No file ending in 'gff.gz' found in directory: {}".format(directory)
            )

        inputfile = os.path.join(directory, candidates[-1])
        # gzip.open defaults to binary mode, so the lines are bytes.
        with gzip.open(inputfile) as original_file:
            return original_file.readlines()

    def reformat_to_gff(self, activity, release):
        """Reformat the loaded GFF lines to GTF-style rows with activity data.

        Despite its name, this method produces the GTF-style output; the name
        is kept for backward compatibility.

        Args:
            activity: Mapping of a key (string) to a sequence of activity
                codes, indexed by the line number of the GFF file. Each code
                must be a key of ``self.value_map``.
            release: Release string, appended to ``"RegBuild_"`` to form the
                second output field.

        Returns:
            list[list[str]]: One 9-element list per GFF line:
            ``["chr" + seqid, "RegBuild_" + release, feature, start, end,
            ".", "+", ".", attributes]``.
        """
        source = "RegBuild_" + release  # identical for every row
        gtf_return = []
        for index, line in enumerate(self.gff_lines):
            # Tab-separated columns; the last one holds ";"-separated attributes.
            columns = line.decode("UTF-8").split("\t")
            attributes = columns[-1].strip().split(";")

            record = [
                # Chromosome name in the form "chr" + name.
                "chr" + columns[0],
                source,
                # Lower-cased value of the fifth attribute ("key=value").
                # NOTE(review): relies on attribute order in the input file;
                # confirm it is the intended field for the release in use.
                attributes[4].split("=")[1].lower(),
            ]
            # Start / end taken unchanged from the original line.
            record.extend(columns[3:5])
            # Fixed score, strand and frame.
            record.extend([".", "+", "."])
            # Region ID (first attribute) plus the activity of this line.
            record.append(
                self.generate_additional_information(
                    attributes[0], self.generate_activity_list(activity, index)
                )
            )
            gtf_return.append(record)
        return gtf_return

    @staticmethod
    def generate_additional_information(gene_id, activity):
        """Concatenate the region ID and activity entries into one string.

        Args:
            gene_id: First attribute of the GFF line. When it starts with
                ``ID=regulatory_region:`` it is rewritten to ``ID "<id>"``;
                otherwise it is used unchanged.
            activity: Iterable of activity strings, joined with ``", "``.

        Returns:
            str: ``<gene_id>; activity "<entries>"``.
        """
        if gene_id.startswith("ID=regulatory_region:"):
            gene_id = 'ID "' + gene_id.split(":")[1] + '"'
        activity_string = 'activity "' + ", ".join(activity) + '"'
        return gene_id + "; " + activity_string

    def generate_activity_list(self, activity, index):
        """Return ``"<key>><LABEL>"`` for every key of *activity* at *index*.

        Raises:
            KeyError: If an activity code is not a key of ``self.value_map``.
        """
        return [
            key + ">" + self.value_map[value[index]]
            for key, value in activity.items()
        ]

    def get_gtf(self, release, activity):
        """Return the resulting GTF-formatted list (see ``reformat_to_gff``)."""
        return self.reformat_to_gff(activity, release)