Skip to content
Permalink
9b258c6d1f
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
Latest commit d22a0e1 Dec 4, 2018 History
1 contributor

Users who have contributed to this file

81 lines (68 sloc) 2.67 KB
import subprocess
import csv
import os
import json
import re
class UcscGtf:
    """Fetch the UCSC ``refSeqFuncElems`` track and expose it as GTF-like rows.

    On construction the class:

    1. maps the organism name to a UCSC assembly id,
    2. runs the ``bigBedToBed`` binary found under ``<wd>/Modules/ucsc`` to
       convert the remote ``refSeqFuncElems.bb`` file into
       ``<wd>/UCSCData/<assembly>.bed``,
    3. loads the category-to-keyword mapping from
       ``<wd>/config/celltypes_<org>.json``,
    4. converts every BED row into a nine-element list stored in
       ``gtf_lines``.
    """

    # Matches "ID:" followed by up to nine digits. The empty alternative at
    # end-of-string guarantees a match, so a line without an ID yields "".
    _ID_PATTERN = re.compile(r'ID:[0-9]{,9}|$')

    # Organism name -> UCSC assembly identifier.
    _ORGANISM_IDS = {
        "homo_sapiens": "hg38",
        "mus_musculus": "mm10",
    }

    def __init__(self, org, wd):
        """Download, convert and parse the UCSC data for *org*.

        Args:
            org: Organism name; ``"homo_sapiens"`` or ``"mus_musculus"``.
            wd: Working directory containing ``Modules/ucsc/bigBedToBed``,
                ``config/`` and ``UCSCData/``.

        Raises:
            ValueError: If *org* has no known UCSC assembly id.
        """
        self.organism_id = self.get_organism_id(org)
        if self.organism_id is None:
            # Fail fast with a readable message instead of a TypeError from
            # concatenating None into the download URL below.
            raise ValueError(
                "Unsupported organism for UCSC data: " + repr(org)
                + " (supported: " + ", ".join(sorted(self._ORGANISM_IDS)) + ")"
            )
        self.link = (
            "http://hgdownload.soe.ucsc.edu/gbdb/"
            + self.organism_id
            + "/ncbiRefSeq/refSeqFuncElems.bb"
        )
        self.output = os.path.join(wd, "UCSCData", self.organism_id + ".bed")
        self.path_to_bin = os.path.join(wd, "Modules", "ucsc", "bigBedToBed")
        print("Getting UCSC Data")
        print("Path to Bin: " + self.path_to_bin)
        self.generate_gff_file()
        self.ucsc_categories = self.get_activity_categories(org, wd)
        self.gtf_lines = self.read_gff_to_gtf()
        print("UCSC finished !")

    def generate_gff_file(self):
        """Run ``bigBedToBed <link> <output>`` to materialise the BED file.

        A non-zero exit status is reported on stdout but does not raise, so
        a BED file left over from an earlier run can still be parsed.
        """
        command = [self.path_to_bin, self.link, self.output]
        return_code = subprocess.call(command)
        if return_code != 0:
            print(
                "WARNING: bigBedToBed exited with status " + str(return_code)
                + "; " + self.output + " may be missing or stale"
            )

    def read_gff_to_gtf(self):
        """Parse the tab-separated BED file into nine-element GTF-like rows.

        Each output row is
        ``[row[0], "UCSC", row[3], row[1], row[2], ".", row[5], ".", attrs]``
        where ``attrs`` is ``"<ID>; <cat>>ACTIVE|NA, ..."`` built from
        columns 11 onward. Blank lines are skipped.

        NOTE(review): start/end are copied verbatim from columns 1 and 2.
        If the input uses 0-based BED starts, consumers expecting 1-based
        GTF coordinates will be off by one - confirm against downstream use.

        Returns:
            list[list[str]]: One list per non-empty input row.

        Raises:
            IndexError: If a non-empty row has fewer than six columns.
        """
        gtf_lines = []
        with open(self.output, 'r') as bed_file:
            for row in csv.reader(bed_file, delimiter='\t'):
                if not row:
                    # csv.reader yields [] for blank lines.
                    continue
                extra_columns = row[11:]
                # Join with a tab for the ID search so digits at the start of
                # the following column cannot be glued onto the ID.
                feature_id = self.find_ID("\t".join(extra_columns))
                activity = ", ".join(self.get_activity(".".join(extra_columns)))
                gtf_lines.append([
                    row[0],
                    "UCSC",
                    row[3],
                    row[1],
                    row[2],
                    ".",
                    row[5],
                    ".",
                    "; ".join([feature_id, activity]),
                ])
        return gtf_lines

    def find_ID(self, line):
        """Return the first ``ID:<digits>`` token in *line*, or ``""`` if none.

        At most nine digits are captured after ``ID:``.
        """
        return self._ID_PATTERN.search(line).group()

    def get_activity(self, line):
        """Tag every configured category as active or not for *line*.

        A category is ``ACTIVE`` when at least one of its keywords occurs as
        a substring of *line*; categories with no keywords are always ``NA``.

        Returns:
            list[str]: ``"<category>>ACTIVE"`` or ``"<category>>NA"`` for
            each entry of ``self.ucsc_categories``, in dict iteration order.
        """
        key_status = []
        for category, keywords in self.ucsc_categories.items():
            is_active = bool(keywords) and any(
                keyword in line for keyword in keywords
            )
            key_status.append(category + (">ACTIVE" if is_active else ">NA"))
        return key_status

    @staticmethod
    def get_organism_id(org):
        """Return the UCSC assembly id for *org*, or ``None`` if unknown."""
        return UcscGtf._ORGANISM_IDS.get(org)

    @staticmethod
    def get_activity_categories(organism, wd):
        """Load the category -> UCSC keyword mapping for *organism*.

        Reads ``<wd>/config/celltypes_<organism>.json``, which must hold a
        list of objects each carrying ``"type"`` and ``"alias_ucsc"`` keys.

        Returns:
            dict: ``{entry["type"]: entry["alias_ucsc"]}`` for every entry.
        """
        path_to_config = os.path.join(
            wd, "config", "celltypes_" + organism + ".json"
        )
        with open(path_to_config) as input_file:
            data = json.load(input_file)
        return {entry["type"]: entry["alias_ucsc"] for entry in data}

    def get_gtf(self):
        """Return the rows produced by :meth:`read_gff_to_gtf`."""
        return self.gtf_lines