Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
master_project_JLU2018/bin/Modules/Ensembl/GTFGen.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
71 lines (50 sloc)
2.29 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import gzip | |
import csv | |
class GTFGen:
    """Turn a gzipped Ensembl GFF file into a list of GTF-style rows.

    On construction the raw lines of the first suitable ``*gff.gz`` file
    below ``<wd>/EnsemblData/<release>/<organism>`` are read into
    ``self.gff_lines``.  ``get_gtf`` / ``reformat_to_gff`` then convert every
    data line into a nine-element list of strings.

    Attributes:
        gff_lines: Raw lines of the GFF file as returned by ``readlines()``
            on the gzip handle (bytes, since the file is opened in the
            default binary mode).
        value_map: Translation from the integer activity codes found in the
            ``activity`` argument to their textual labels.
    """

    def __init__(self, organism, release, wd):
        """Load the GFF lines for *organism* / *release* found below *wd*.

        Raises:
            FileNotFoundError: If the data directory does not exist or
                contains no file whose name ends in ``gff.gz``.
        """
        self.gff_lines = self.get_organism_as_gff(organism, release, wd)
        self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"}

    def get_organism_as_gff(self, organism, release, wd):
        """Return all lines of the ``*gff.gz`` file for the given organism.

        Args:
            organism: Name of the organism sub-directory.
            release: Name of the release sub-directory.
            wd: Working directory that contains the ``EnsemblData`` folder.

        Returns:
            The list produced by ``readlines()`` on the opened gzip file
            (one ``bytes`` object per line, newline included).

        Raises:
            FileNotFoundError: If no file ending in ``gff.gz`` is present in
                the directory (or the directory itself is missing).
        """
        directory = os.path.join(wd, "EnsemblData", release, organism)
        # os.listdir() returns names in arbitrary order; sort so that the
        # selected file is deterministic when several candidates exist.
        candidates = sorted(
            name for name in os.listdir(directory) if name.endswith("gff.gz")
        )
        if not candidates:
            # Previously this fell through to gzip.open("") and produced an
            # unhelpful "No such file or directory: ''" error.
            raise FileNotFoundError(
                "No file ending in 'gff.gz' found in " + directory
            )
        inputfile = os.path.join(directory, candidates[-1])
        with gzip.open(inputfile) as original_file:
            return original_file.readlines()

    def reformat_to_gff(self, activity, release):
        """Convert the loaded GFF lines into GTF-style rows.

        Each row is a list of nine strings: ``"chr" + <column 1>``,
        ``"RegBuild_" + release``, the value of the fifth ``;``-separated
        attribute, start, end, ``"."``, ``"+"``, ``"."`` and an attribute
        string built from the first attribute plus the activity labels.

        Blank lines and lines starting with ``#`` (GFF headers/comments) are
        skipped.  The position used to look up ``activity`` is still the
        index of the line within ``self.gff_lines`` (skipped lines count), so
        the alignment for data lines is the same as before.

        Args:
            activity: Mapping of a string label to a sequence of integer
                activity codes, indexed by line position in the GFF file
                (presumably one entry per cell type/epigenome -- confirm
                against the caller).
            release: Release identifier (string) appended to ``RegBuild_``.

        Returns:
            A list of rows, one per data line.
        """
        gtf_return = []
        for index, line in enumerate(self.gff_lines):
            # gzip yields bytes; tolerate already-decoded text as well.
            decoded_line = line.decode("UTF-8") if isinstance(line, bytes) else line
            stripped = decoded_line.strip()
            if not stripped or stripped.startswith("#"):
                continue
            # Tab-separated GFF columns; the last one holds ';'-separated
            # key=value attributes.
            columns = decoded_line.split("\t")
            attributes = columns[-1].strip().split(";")
            row = [
                # Chromosome name in "chr<name>" format.
                "chr" + columns[0],
                "RegBuild_" + release,
                # Value of the fifth attribute (the original code comment
                # calls this the description).
                attributes[4].split("=")[1],
            ]
            # Start / end coordinates taken over unchanged.
            row.extend(columns[3:5])
            # Fixed score, strand and frame.
            row.extend([".", "+", "."])
            row.append(
                self.generate_additional_information(
                    attributes[0],
                    self.generate_activity_list(activity, index),
                )
            )
            gtf_return.append(row)
        return gtf_return

    @staticmethod
    def generate_additional_information(id, activity):
        """Build the attribute column: ``"<id>; activity=<a>, <b>, ..."``.

        Args:
            id: The identifier attribute, used verbatim.  (The name shadows
                the builtin but is kept because it is part of the signature.)
            activity: Iterable of activity strings to be joined with ", ".
        """
        return "; ".join([id, "activity=" + ", ".join(activity)])

    def generate_activity_list(self, activity, index):
        """Return ``"<key>><LABEL>"`` for every entry of *activity*.

        For each key the integer code at position *index* is translated via
        ``self.value_map``.

        Raises:
            KeyError: If a code is not present in ``self.value_map``.
            IndexError: If a sequence in *activity* is shorter than *index*.
        """
        return [
            key + ">" + self.value_map[value[index]]
            for key, value in activity.items()
        ]

    def get_gtf(self, release, activity):
        """Return the GTF-style rows; thin wrapper around ``reformat_to_gff``."""
        return self.reformat_to_gff(activity, release)