Skip to content
Permalink
8d1ff19636
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
146 lines (110 sloc) 4.94 KB
import subprocess
import csv
import os
import json
import re
class UcscGtf:
    """
    Class to gather data from UCSC Table Browsers, RefFuncGen Tracks.

    Constructing an instance runs the whole pipeline: the bigBedToBed binary
    converts the remote refSeqFuncElems bigBed file into a local BED file,
    the cell type categories are loaded from the JSON config, and the BED
    rows are converted into GTF-style field lists (see get_gtf()).

    @author: Sebastian Beyvers
    @contact: sebastian.beyvers@med.uni-giessen.de
    """

    # Feature types (BED column 10) that are not exported.
    _EXCLUDED_FEATURES = frozenset([
        "region",
        "sequence_feature",
        "CAAT_signal",
        "stem_loop",
        "sequence_secondary_structure",
    ])

    # "ID:" followed by 1 to 9 digits. At least one digit is required so that
    # a bare "ID:" is reported as "NA" instead of an empty gene_id.
    _ID_PATTERN = re.compile(r'ID:([0-9]{1,9})')

    # Internal organism name -> UCSC assembly alias.
    _ORGANISM_IDS = {
        "homo_sapiens": "hg38",
        "mus_musculus": "mm10",
    }

    def __init__(self, org, wd, data_dir):
        """
        Constructor for UcscGtf; downloads and parses the UCSC data.

        input parameter: org = organism ("homo_sapiens" or "mus_musculus")
                         wd = working directory
                         data_dir = data directory (if defined this is used
                                    to save and get data)
        raises: ValueError if org has no known UCSC alias
        """
        self.organism_id = self.get_organism_id(org)
        if self.organism_id is None:
            # Fail early with a clear message instead of a TypeError from the
            # string concatenation below.
            raise ValueError("Unsupported organism for UCSC data: " + repr(org))
        # HTTP link to the UCSC bigBed file
        self.link = ("http://hgdownload.soe.ucsc.edu/gbdb/" + self.organism_id
                     + "/ncbiRefSeq/refSeqFuncElems.bb")
        # Where to save the output file
        if data_dir:
            # NOTE(review): unlike the default branch there is no path
            # separator between "UCSCData" and the organism id, so the file
            # is named e.g. "UCSCDatahg38.bed" directly inside data_dir.
            # Kept as-is to not move existing files; confirm the intent.
            self.output = os.path.join(
                data_dir, "UCSCData" + self.organism_id + ".bed")
        else:
            self.output = os.path.join(
                wd, "data", "UCSCData", self.organism_id + ".bed")
        # Determine path to bigBedToBed binary.
        self.path_to_bin = os.path.join(wd, "Modules", "ucsc", "bigBedToBed")
        print("Getting UCSC Data")
        print("Path to Bin: " + self.path_to_bin)
        self.generate_gff_file()
        self.ucsc_categories = self.get_activity_categories(org, wd)
        self.gtf_lines = self.read_gff_to_gtf()
        print("UCSC finished !")

    def generate_gff_file(self):
        """
        Call the bigBedToBed binary to write a BED file to self.output.

        A non-zero exit status is reported on stdout but does not abort, so a
        BED file that already exists at self.output can still be read.
        """
        callstring = [self.path_to_bin, self.link, self.output]
        return_code = subprocess.call(callstring)
        if return_code != 0:
            print("Warning: bigBedToBed exited with status " + str(return_code)
                  + "; " + self.output + " may be missing or outdated")

    def read_gff_to_gtf(self):
        """
        Read the BED file and return a GTF-formatted list of elements.

        Rows with fewer than 10 columns (e.g. blank lines) and rows whose
        feature type is in _EXCLUDED_FEATURES are skipped.

        return_value: list of 9-element lists (GTF columns) with the
                      regulation entries from UCSC
        """
        gtf_lines = []
        with open(self.output, 'r') as csvfile:
            for row in csv.reader(csvfile, delimiter='\t'):
                if len(row) < 10:
                    # Too short to carry a feature type; nothing to convert.
                    continue
                feature = row[9]
                if feature in self._EXCLUDED_FEATURES:
                    continue
                # All columns from index 11 on are searched for the ID and
                # the cell type keywords.
                description = ''.join(row[11:])
                attributes = '; '.join([
                    self.find_ID(description),
                    'activity "'
                    + ", ".join(self.get_activity(description)) + '"',
                ]) + ";"
                # NOTE(review): BED start coordinates are 0-based while GTF
                # is 1-based; row[1] is passed through unchanged. Confirm
                # that downstream consumers expect this.
                gtf_lines.append([
                    row[0],
                    "UCSC",
                    feature.lower().replace(' ', '_'),
                    row[1],
                    row[2],
                    ".",
                    row[5],
                    ".",
                    attributes,
                ])
        return gtf_lines

    def find_ID(self, line):
        """
        Find the first numeric "ID:<digits>" token in line.

        input_parameter: line = text taken from the current BED row
        return_value: string with gene_id in GTF-format; 'gene_id "NA"' if
                      no ID with at least one digit is present
        """
        match = self._ID_PATTERN.search(line)
        if match is None:
            return 'gene_id "NA"'
        return 'gene_id "' + match.group(1) + '"'

    def get_activity(self, line):
        """
        Determine for every configured category whether it occurs in line.

        input_parameter: line = text taken from the current BED row
        return_value: list with one "<category>>ACTIVE" or "<category>>NA"
                      entry per key of self.ucsc_categories; a category is
                      ACTIVE if any of its keywords is a substring of line
        """
        key_status = []
        for key, keywords in self.ucsc_categories.items():
            # Categories without keywords can never be active.
            is_active = bool(keywords) and any(
                keyword in line for keyword in keywords)
            key_status.append(key + (">ACTIVE" if is_active else ">NA"))
        return key_status

    @staticmethod
    def get_organism_id(org):
        """
        Convert internal name e.g. "homo_sapiens" to UCSC name "hg38".

        input_parameter: org = organism parameter
        return_value: UCSC alias for this organism [ mm10 | hg38 ], or None
                      for any other organism
        """
        return UcscGtf._ORGANISM_IDS.get(org)

    @staticmethod
    def get_activity_categories(organism, wd):
        """
        Get the ucsc-celltype categories from the JSON config.

        Reads <wd>/config/celltypes_<organism>.json, which must hold a list
        of objects with the keys "type" and "alias_ucsc".

        input_parameter: organism = organism parameter
                         wd = working directory, to find config file
        return_value: dict mapping each "type" to its "alias_ucsc" value
        """
        path_to_config = os.path.join(
            wd, "config", "celltypes_" + organism + ".json")
        with open(path_to_config) as input_file:
            data = json.load(input_file)
        return {entry["type"]: entry["alias_ucsc"] for entry in data}

    def get_gtf(self):
        """
        Getter method for the resulting gtf-lines.

        return_value: list of GTF entries as built by read_gff_to_gtf()
        """
        return self.gtf_lines