Permalink
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
master_project_JLU2018/bin/Modules/ucsc/ucsc.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
81 lines (68 sloc)
2.67 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import json
import os
import re
import subprocess
class UcscGtf:
    """Fetch the UCSC ``refSeqFuncElems`` track and expose it as GTF-like rows.

    Constructing an instance runs the whole pipeline: the remote bigBed file
    is converted to a local BED file with the ``bigBedToBed`` binary, the
    cell-type configuration is loaded, and every BED row is turned into a
    nine-element list that is available through :meth:`get_gtf`.
    """

    # Organism names understood by this class and the UCSC assembly each maps to.
    _ASSEMBLIES = {"homo_sapiens": "hg38", "mus_musculus": "mm10"}

    # "ID:" followed by zero to nine digits. The "|$" alternative matches the
    # empty string at the end of the input, so a search never returns None.
    _ID_PATTERN = re.compile(r'ID:[0-9]{,9}|$')

    def __init__(self, org, wd):
        """Download, convert and parse the UCSC data for one organism.

        Args:
            org: Organism name; "homo_sapiens" and "mus_musculus" are known.
            wd: Working directory that holds ``UCSCData/``, ``config/`` and
                ``Modules/ucsc/bigBedToBed``.

        Raises:
            ValueError: If ``org`` is not a supported organism.
        """
        self.organism_id = self.get_organism_id(org)
        if self.organism_id is None:
            # Fail with a clear message instead of a TypeError from the
            # string concatenation below.
            raise ValueError("Unsupported organism for UCSC data: " + repr(org))
        self.link = ("http://hgdownload.soe.ucsc.edu/gbdb/" + self.organism_id
                     + "/ncbiRefSeq/refSeqFuncElems.bb")
        self.output = os.path.join(wd, "UCSCData", self.organism_id + ".bed")
        self.path_to_bin = os.path.join(wd, "Modules", "ucsc", "bigBedToBed")
        print("Getting UCSC Data")
        print("Path to Bin: " + self.path_to_bin)
        self.generate_gff_file()
        self.ucsc_categories = self.get_activity_categories(org, wd)
        self.gtf_lines = self.read_gff_to_gtf()
        print("UCSC finished !")

    def generate_gff_file(self):
        """Run ``bigBedToBed`` to convert ``self.link`` into ``self.output``.

        The output directory is created when missing. A non-zero exit status
        of the converter is reported on stdout; processing then continues as
        before, so a missing output file still surfaces when it is opened.
        """
        output_dir = os.path.dirname(self.output)
        if output_dir and not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        return_code = subprocess.call([self.path_to_bin, self.link, self.output])
        if return_code != 0:
            print("Warning: bigBedToBed exited with status " + str(return_code)
                  + " while writing " + self.output)

    def read_gff_to_gtf(self):
        """Parse the tab-separated file at ``self.output`` into GTF-like rows.

        Each result row is ``[col0, "UCSC", col3, col1, col2, ".", col5, ".",
        attributes]`` where ``attributes`` is the ID found by :meth:`find_ID`
        and the comma-separated activity flags from :meth:`get_activity`,
        joined by "; ". Both are computed from columns 11 onwards.

        Returns:
            list[list[str]]: One list per non-empty input row.
        """
        gtf_lines = []
        with open(self.output, 'r') as bed_file:
            for row in csv.reader(bed_file, delimiter='\t'):
                if not row:
                    # csv.reader yields [] for blank lines; nothing to convert.
                    continue
                # Join with "." for both lookups so digits ending one column
                # cannot run into digits starting the next one and be
                # mistaken for part of the ID.
                annotation = ".".join(row[11:])
                attributes = "; ".join([
                    self.find_ID(annotation),
                    ", ".join(self.get_activity(annotation)),
                ])
                # NOTE(review): row[1] is copied unchanged as the start
                # coordinate. If the input is standard BED (0-based start)
                # and consumers expect 1-based GTF starts, this needs +1 -
                # confirm against downstream users before changing.
                gtf_lines.append([
                    row[0], "UCSC", row[3], row[1], row[2],
                    ".", row[5], ".", attributes,
                ])
        return gtf_lines

    def find_ID(self, line):
        """Return the first ``ID:<digits>`` token in ``line``, or "" if none.

        At most nine digits are captured; an "ID:" without digits is returned
        as-is because the pattern allows zero digits.
        """
        return self._ID_PATTERN.search(line).group()

    def get_activity(self, line):
        """Flag each configured category as active or not for ``line``.

        A category is ">ACTIVE" when at least one of its keywords occurs as a
        substring of ``line``; categories without keywords, or without a hit,
        are ">NA".

        Returns:
            list[str]: One "<category>>ACTIVE" or "<category>>NA" entry per
            key of ``self.ucsc_categories``, in the dict's iteration order.
        """
        key_status = []
        for key, keywords in self.ucsc_categories.items():
            is_active = bool(keywords) and any(
                keyword in line for keyword in keywords)
            key_status.append(key + (">ACTIVE" if is_active else ">NA"))
        return key_status

    @staticmethod
    def get_organism_id(org):
        """Return the UCSC assembly for ``org``, or None when it is unknown."""
        return UcscGtf._ASSEMBLIES.get(org)

    @staticmethod
    def get_activity_categories(organism, wd):
        """Load the category-to-keywords mapping for ``organism``.

        Reads ``<wd>/config/celltypes_<organism>.json``, which must be a JSON
        list of objects, and maps each object's "type" to its "alias_ucsc".

        Raises:
            KeyError: If an entry lacks "type" or "alias_ucsc".
        """
        path_to_config = os.path.join(
            wd, "config", "celltypes_" + organism + ".json")
        with open(path_to_config) as input_file:
            data = json.load(input_file)
        return {entry["type"]: entry["alias_ucsc"] for entry in data}

    def get_gtf(self):
        """Return the rows produced by :meth:`read_gff_to_gtf`."""
        return self.gtf_lines