Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
master_project_JLU2018/bin/3.1_create_gtf/Modules/ucsc/ucsc.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
101 lines (86 sloc)
3.47 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import subprocess | |
import csv | |
import os | |
import json | |
import re | |
class UcscGtf:
    """
    Class to gather ucsc refSeq-FuncElem data.

    Constructing an instance runs the whole workflow: the refSeqFuncElems
    bigBed file of the organism is converted to a BED file with the bundled
    bigBedToBed binary, the cell type categories are loaded from the JSON
    config, and every BED row is converted into a nine-element GTF-style
    list. The result is available through get_gtf().
    """

    # Internal organism name -> UCSC assembly name.
    _ORGANISM_IDS = {
        "homo_sapiens": "hg38",
        "mus_musculus": "mm10",
    }

    # "ID:" followed by at most nine digits. The empty alternative at the end
    # of the string guarantees that search() always returns a match object.
    _ID_PATTERN = re.compile(r'ID:[0-9]{,9}|$')

    def __init__(self, org, wd, data_dir):
        """
        Fetch and parse the UCSC data for one organism.

        :param org: internal organism name, e.g. "homo_sapiens"
        :param wd: directory containing "Modules/ucsc/bigBedToBed", the
                   "config" folder and, if data_dir is not given, the
                   "data/UCSCData" folder
        :param data_dir: optional directory for the BED file; falsy to use
                         the default location below wd
        :raises ValueError: if org has no known UCSC assembly name
        """
        self.organism_id = self.get_organism_id(org)
        if self.organism_id is None:
            # Fail early with a readable message instead of a TypeError from
            # concatenating None into the download link below.
            raise ValueError(
                "Unsupported organism {!r}; expected one of: {}".format(
                    org, ", ".join(sorted(self._ORGANISM_IDS))))
        self.link = ("http://hgdownload.soe.ucsc.edu/gbdb/" + self.organism_id
                     + "/ncbiRefSeq/refSeqFuncElems.bb")
        if data_dir:
            # NOTE(review): the file name is "UCSCData<id>.bed" directly in
            # data_dir (no "UCSCData/" sub folder as in the default branch).
            # Kept as is because other pipeline steps may rely on it - confirm.
            self.output = os.path.join(data_dir, "UCSCData" + self.organism_id + ".bed")
        else:
            self.output = os.path.join(wd, "data", "UCSCData", self.organism_id + ".bed")
        self.path_to_bin = os.path.join(wd, "Modules", "ucsc", "bigBedToBed")
        print("Getting UCSC Data")
        print("Path to Bin: " + self.path_to_bin)
        self.generate_gff_file()
        self.ucsc_categories = self.get_activity_categories(org, wd)
        self.gtf_lines = self.read_gff_to_gtf()
        print("UCSC finished !")

    def generate_gff_file(self):
        """
        Call the bigBedToBed binary to write the BED file to self.output.

        :raises subprocess.CalledProcessError: if the binary exits with a
                non-zero status, so that a failed conversion is not silently
                followed by parsing a missing or outdated BED file
        """
        callstring = [self.path_to_bin, self.link, self.output]
        subprocess.check_call(callstring)

    def read_gff_to_gtf(self):
        """
        Read the BED file and return a gtf-formatted list of elements.

        :return: list of rows, each a list of nine strings:
                 [column 0, "UCSC", lower-cased column 3, column 1, column 2,
                 ".", column 5, ".", attribute string]
        """
        gtf_lines = []
        # newline='' lets the csv module do its own newline handling.
        with open(self.output, 'r', newline='') as csvfile:
            for row in csv.reader(csvfile, delimiter='\t'):
                if not row:
                    # Blank lines carry no element.
                    continue
                # All columns from index 11 on are searched as one text for
                # the ID and the activity keywords.
                description = ''.join(row[11:])
                attributes = '; '.join([
                    self.find_ID(description),
                    'activity "' + ", ".join(self.get_activity(description)) + '"',
                ])
                gtf_lines.append([
                    row[0],
                    "UCSC",
                    row[3].lower(),
                    row[1],
                    row[2],
                    ".",
                    row[5],
                    ".",
                    attributes,
                ])
        return gtf_lines

    def find_ID(self, line):
        """
        Find the first "ID:<digits>" entry in line.

        Only the first nine digits after "ID:" are taken.

        :param line: text to search
        :return: 'ID "<digits>"' or 'ID "NA"' if line contains no "ID:"
        """
        ref_id = self._ID_PATTERN.search(line).group()
        if not ref_id:
            return 'ID "NA"'
        key, _, value = ref_id.partition(":")
        return key + ' "' + value + '"'

    def get_activity(self, line):
        """
        Find activity categories in a line of the bed file.

        :param line: text to search for the category keywords
        :return: one entry per category in self.ucsc_categories:
                 "<category>>ACTIVE" if any of its keywords occurs in line,
                 else "<category>>NA" (also for categories without keywords)
        """
        key_status = []
        for key, keywords in self.ucsc_categories.items():
            is_active = bool(keywords) and any(keyword in line for keyword in keywords)
            key_status.append(key + (">ACTIVE" if is_active else ">NA"))
        return key_status

    @staticmethod
    def get_organism_id(org):
        """
        Convert intern name e.g. "homo_sapiens" to ucsc name "hg38".

        :param org: internal organism name
        :return: UCSC assembly name, or None for an unknown organism
        """
        return UcscGtf._ORGANISM_IDS.get(org)

    @staticmethod
    def get_activity_categories(organism, wd):
        """
        Get ucsc-celltype categories from the JSON config.

        Reads "<wd>/config/celltypes_<organism>.json", a list of objects
        with the keys "type" and "alias_ucsc".

        :param organism: internal organism name used in the config file name
        :param wd: directory containing the "config" folder
        :return: dict mapping each "type" to its "alias_ucsc" value
        """
        path_to_config = os.path.join(wd, "config", "celltypes_" + organism + ".json")
        with open(path_to_config) as input_file:
            data = json.load(input_file)
        return {entry["type"]: entry["alias_ucsc"] for entry in data}

    def get_gtf(self):
        """Return the gtf-formatted rows built during construction."""
        return self.gtf_lines