import subprocess import csv import os import json import re class UcscGtf: """ Class to gather ucsc refSeq-FuncElem data. """ def __init__(self, org, wd, data_dir): self.organism_id = self.get_organism_id(org) self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb" if data_dir: self.output = os.path.join(data_dir + "/UCSCData" + self.organism_id+".bed") else: self.output = os.path.join(wd + "/data/UCSCData/" + self.organism_id+".bed") self.path_to_bin = os.path.join(wd + "/Modules/ucsc/bigBedToBed") print("Getting UCSC Data") print("Path to Bin: " + self.path_to_bin) self.generate_gff_file() self.ucsc_categories = self.get_activity_categories(org, wd) self.gtf_lines = self.read_gff_to_gtf() print("UCSC finished !") def generate_gff_file(self): # Call bigBedToBed binary to get a Bed file in the UCSCData folder callstring = [self.path_to_bin, self.link, self.output] subprocess.call(callstring) def read_gff_to_gtf(self): # Reads Bed File and return a gtf-formatted list of elements. gtf_lines = [] with open(self.output, 'r') as csvfile: tsvreader = csv.reader(csvfile, delimiter='\t') for row in tsvreader: sequence = [] sequence.append(row[0]) sequence.append("UCSC") sequence.append(row[3].lower()) sequence.append(row[1]) sequence.append(row[2]) sequence.append(".") sequence.append(row[5]) sequence.append(".") sequence.append('; '.join([self.find_ID(''.join(row[11:])), 'activity \"'+", ".join(self.get_activity(''.join(row[11:]))) + '"'])) gtf_lines.append(sequence) return gtf_lines def find_ID(self, line): # Find RefSeq ID in Line pattern = re.compile(r'ID:[0-9]{,9}|$') ref_id = re.search(pattern, line).group() splitted = ref_id.split(":") if len(splitted) == 2: returnstring = str(splitted[0])+' "'+str(splitted[1])+'"' else: returnstring = 'ID '+'"NA"' return returnstring def get_activity(self, line): # Find activity categories in bed file key_status = [] for key, value in self.ucsc_categories.items(): if value: if any([line.find(keyword) != -1 for keyword in value]): key_status.append(key+">ACTIVE") else: key_status.append(key + ">NA") else: key_status.append(key + ">NA") return key_status @staticmethod def get_organism_id(org): # convert intern name e.g. "homo_sapiens" to ucsc name "hg38". if org == "homo_sapiens": return "hg38" elif org == "mus_musculus": return "mm10" @staticmethod def get_activity_categories(organism, wd): # Method to get ucsc-celltype categories from JSON config path_to_config = os.path.join(wd+"/config/celltypes_" + organism + ".json") categories = {} with open(path_to_config) as input_file: data = json.loads(input_file.read()) for x in data: categories[x["type"]] = x["alias_ucsc"] return categories def get_gtf(self): return self.gtf_lines