Skip to content
Permalink
ab6f883dd1
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
101 lines (86 sloc) 3.47 KB
import subprocess
import csv
import os
import json
import re
class UcscGtf:
    """
    Class to gather ucsc refSeq-FuncElem data.

    Constructing an instance runs the whole pipeline: the bundled
    ``bigBedToBed`` binary is called with the UCSC ``refSeqFuncElems.bb``
    URL to produce a BED file, the cell-type categories are read from the
    JSON config, and the BED rows are converted to GTF-style rows that can
    be retrieved with :meth:`get_gtf`.

    Attributes set by ``__init__``:
        organism_id: UCSC assembly name (e.g. ``"hg38"``).
        link: URL of the bigBed file handed to ``bigBedToBed``.
        output: path of the BED file written by ``bigBedToBed``.
        path_to_bin: path of the ``bigBedToBed`` binary below ``wd``.
        ucsc_categories: dict mapping category name -> UCSC aliases.
        gtf_lines: list of nine-element GTF-style rows.
    """

    # Internal organism names mapped to the UCSC assembly name used in the URL.
    _ORGANISM_IDS = {"homo_sapiens": "hg38", "mus_musculus": "mm10"}

    # "ID:" followed by at most nine digits. The "|$" alternative matches the
    # empty string at the end of the text, so search() never returns None.
    _ID_PATTERN = re.compile(r'ID:[0-9]{,9}|$')

    def __init__(self, org, wd, data_dir):
        """
        Fetch and convert the UCSC data for one organism.

        Args:
            org: internal organism name, e.g. ``"homo_sapiens"``.
            wd: working directory containing ``Modules/ucsc/bigBedToBed`` and
                ``config/celltypes_<org>.json``.
            data_dir: optional directory for the BED file; when falsy the
                file is written below ``<wd>/data/UCSCData/``.

        Raises:
            ValueError: if ``org`` is not a supported organism.
        """
        self.organism_id = self.get_organism_id(org)
        if self.organism_id is None:
            # Fail early with a clear message instead of a TypeError from
            # concatenating None into the URL below.
            raise ValueError(
                "Unsupported organism '{}'; expected one of: {}".format(
                    org, ", ".join(sorted(self._ORGANISM_IDS))))
        self.link = ("http://hgdownload.soe.ucsc.edu/gbdb/" + self.organism_id
                     + "/ncbiRefSeq/refSeqFuncElems.bb")
        if data_dir:
            # NOTE(review): the file name is "UCSCData<organism_id>.bed" with
            # no separator, unlike the "UCSCData/" directory used in the
            # default branch - kept as is, confirm whether this is intended.
            self.output = os.path.join(
                data_dir, "UCSCData" + self.organism_id + ".bed")
        else:
            self.output = os.path.join(
                wd, "data", "UCSCData", self.organism_id + ".bed")
        self.path_to_bin = os.path.join(wd, "Modules", "ucsc", "bigBedToBed")
        print("Getting UCSC Data")
        print("Path to Bin: " + self.path_to_bin)
        self.generate_gff_file()
        self.ucsc_categories = self.get_activity_categories(org, wd)
        self.gtf_lines = self.read_gff_to_gtf()
        print("UCSC finished !")

    def generate_gff_file(self):
        """
        Call the bigBedToBed binary to write the BED file to ``self.output``.

        Raises:
            FileNotFoundError: if the binary exits with a non-zero status and
                no BED file exists at ``self.output``.
        """
        callstring = [self.path_to_bin, self.link, self.output]
        return_code = subprocess.call(callstring)
        if return_code != 0:
            if not os.path.isfile(self.output):
                raise FileNotFoundError(
                    "bigBedToBed exited with status {} and produced no file "
                    "at {}".format(return_code, self.output))
            # A file is present (possibly from an earlier run), so keep going
            # as before, but make the failure visible.
            print("Warning: bigBedToBed exited with status {}; using the "
                  "existing file {}".format(return_code, self.output))

    def read_gff_to_gtf(self):
        """
        Read the BED file and return a gtf-formatted list of elements.

        Each returned row is a list of nine strings built from the BED
        columns: ``[row[0], "UCSC", row[3].lower(), row[1], row[2], ".",
        row[5], ".", attributes]``. The attributes string combines the ID
        found by :meth:`find_ID` and the activity list from
        :meth:`get_activity`, both computed on the concatenation of the BED
        columns from index 11 onwards. Empty lines are skipped.

        Raises:
            IndexError: if a non-empty row has fewer than six columns.
        """
        gtf_lines = []
        # newline='' lets the csv module do its own line-ending handling.
        with open(self.output, 'r', newline='') as csvfile:
            tsvreader = csv.reader(csvfile, delimiter='\t')
            for row in tsvreader:
                if not row:
                    # csv.reader yields [] for blank lines.
                    continue
                extra_fields = ''.join(row[11:])
                activity = ", ".join(self.get_activity(extra_fields))
                attributes = '; '.join(
                    [self.find_ID(extra_fields), 'activity "' + activity + '"'])
                # NOTE(review): start/end are copied from the BED columns
                # unchanged (no coordinate conversion) - confirm this is what
                # downstream consumers expect.
                gtf_lines.append([
                    row[0],
                    "UCSC",
                    row[3].lower(),
                    row[1],
                    row[2],
                    ".",
                    row[5],
                    ".",
                    attributes,
                ])
        return gtf_lines

    def find_ID(self, line):
        """
        Find the first ``ID:<digits>`` token in ``line``.

        Returns:
            ``'ID "<digits>"'`` when a token is found (at most nine digits
            are captured), otherwise ``'ID "NA"'``.
        """
        ref_id = self._ID_PATTERN.search(line).group()
        key, separator, value = ref_id.partition(":")
        if separator:
            return key + ' "' + value + '"'
        return 'ID "NA"'

    def get_activity(self, line):
        """
        Find activity categories in a bed file line.

        For every category in ``self.ucsc_categories`` the result contains
        ``"<category>>ACTIVE"`` when any of its aliases occurs as a substring
        of ``line``, and ``"<category>>NA"`` otherwise (including categories
        without aliases).
        """
        key_status = []
        for key, value in self.ucsc_categories.items():
            if value and any(keyword in line for keyword in value):
                key_status.append(key + ">ACTIVE")
            else:
                key_status.append(key + ">NA")
        return key_status

    @staticmethod
    def get_organism_id(org):
        """
        Convert intern name e.g. "homo_sapiens" to ucsc name "hg38".

        Returns ``None`` for organisms that are not supported.
        """
        return UcscGtf._ORGANISM_IDS.get(org)

    @staticmethod
    def get_activity_categories(organism, wd):
        """
        Get ucsc-celltype categories from the JSON config.

        Reads ``<wd>/config/celltypes_<organism>.json``, which must hold a
        list of objects with the keys ``"type"`` and ``"alias_ucsc"``.

        Returns:
            dict mapping each ``"type"`` to its ``"alias_ucsc"`` value.
        """
        path_to_config = os.path.join(
            wd, "config", "celltypes_" + organism + ".json")
        with open(path_to_config) as input_file:
            data = json.load(input_file)
        return {entry["type"]: entry["alias_ucsc"] for entry in data}

    def get_gtf(self):
        """Return the GTF-style rows built during initialisation."""
        return self.gtf_lines