Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
master_project_JLU2018/bin/3.1_create_gtf/Modules/ucsc/ucsc.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
146 lines (110 sloc)
4.94 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import json
import os
import re
import subprocess
import sys
class UcscGtf:
    """
    Gather regulatory-element data from the UCSC refSeqFuncElems track.

    On construction the instance hands the URL of the organism's
    ``refSeqFuncElems.bb`` file to the bundled ``bigBedToBed`` binary, reads
    the BED file that binary is asked to write, and converts each kept row
    into a GTF-style list of columns. The result is available via
    ``get_gtf()``.

    @author: Sebastian Beyvers
    @contact: sebastian.beyvers@med.uni-giessen.de
    """

    # Internal organism name -> UCSC assembly alias.
    _UCSC_ASSEMBLIES = {
        "homo_sapiens": "hg38",
        "mus_musculus": "mm10",
    }

    # Values of BED column 10 (index 9) that are not exported to GTF.
    _EXCLUDED_FEATURES = frozenset([
        "region",
        "sequence_feature",
        "CAAT_signal",
        "stem_loop",
        "sequence_secondary_structure",
    ])

    # "ID:" followed by one to nine digits; only the digits are captured.
    # Requiring at least one digit keeps a bare "ID:" from producing an
    # empty gene_id.
    _ID_PATTERN = re.compile(r'ID:([0-9]{1,9})')

    def __init__(self, org, wd, data_dir):
        """
        Fetch, convert and parse the UCSC data for one organism.

        Parameters:
            org: organism name, "homo_sapiens" or "mus_musculus".
            wd: working directory; used to locate the bigBedToBed binary,
                the celltype config file and the default output folder.
            data_dir: optional data directory; when truthy the BED file is
                written below it instead of below ``wd``.

        Raises:
            ValueError: if ``org`` has no known UCSC assembly alias.
        """
        self.organism_id = self.get_organism_id(org)
        if self.organism_id is None:
            # Fail early with a clear message instead of a TypeError from
            # concatenating None into the URL below.
            raise ValueError(
                "Unsupported organism for UCSC data: {!r} "
                "(expected 'homo_sapiens' or 'mus_musculus')".format(org)
            )

        # Link to the UCSC bigBed file.
        self.link = ("http://hgdownload.soe.ucsc.edu/gbdb/" + self.organism_id
                     + "/ncbiRefSeq/refSeqFuncElems.bb")

        # Where to save the output file.
        if data_dir:
            # NOTE(review): this yields "<data_dir>/UCSCData<id>.bed" (no
            # separator before the id), unlike the default path below. Kept
            # as is because existing files may use this name - confirm.
            self.output = data_dir + "/UCSCData" + self.organism_id + ".bed"
        else:
            self.output = wd + "/data/UCSCData/" + self.organism_id + ".bed"

        # Path to the bigBedToBed binary.
        self.path_to_bin = wd + "/Modules/ucsc/bigBedToBed"

        print("Getting UCSC Data")
        print("Path to Bin: " + self.path_to_bin)
        self.generate_gff_file()
        self.ucsc_categories = self.get_activity_categories(org, wd)
        self.gtf_lines = self.read_gff_to_gtf()
        print("UCSC finished !")

    def generate_gff_file(self):
        """
        Call the bigBedToBed binary with the UCSC link and the output path.

        A non-zero exit status is reported on stderr but does not abort, so
        a BED file that already exists at ``self.output`` can still be read.
        """
        command = [self.path_to_bin, self.link, self.output]
        return_code = subprocess.call(command)
        if return_code != 0:
            sys.stderr.write(
                "Warning: bigBedToBed exited with status {}; {} may be "
                "missing or outdated.\n".format(return_code, self.output)
            )

    def read_gff_to_gtf(self):
        """
        Read the BED file at ``self.output`` and build GTF-style rows.

        Despite its name this method reads a tab-separated BED file. Rows
        whose feature type (column index 9) is in ``_EXCLUDED_FEATURES`` are
        dropped, as are rows with fewer than ten columns.

        Returns:
            list of 9-element lists: seqname, source ("UCSC"), feature,
            start, end, score ("."), strand, frame ("."), attributes.
        """
        gtf_lines = []
        # newline='' leaves line-ending handling to the csv module.
        with open(self.output, 'r', newline='') as csvfile:
            for row in csv.reader(csvfile, delimiter='\t'):
                # Blank or truncated lines have no feature-type column.
                if len(row) < 10:
                    continue
                feature = row[9]
                if feature in self._EXCLUDED_FEATURES:
                    continue

                # All trailing columns, concatenated without a separator,
                # are searched for the ID and the activity keywords.
                details = ''.join(row[11:])
                gene_id = self.find_ID(details)
                activity = ", ".join(self.get_activity(details))
                attributes = '; '.join(
                    [gene_id, 'activity "' + activity + '"']) + ";"

                # NOTE(review): the BED start (row[1]) is copied unchanged;
                # BED starts are conventionally 0-based while GTF starts are
                # 1-based - confirm what downstream code expects.
                gtf_lines.append([
                    row[0],
                    "UCSC",
                    feature.lower().replace(' ', '_'),
                    row[1],
                    row[2],
                    ".",
                    row[5],
                    ".",
                    attributes,
                ])
        return gtf_lines

    def find_ID(self, line):
        """
        Find the first "ID:<digits>" occurrence in a line.

        Parameters:
            line: concatenated trailing columns of one BED row.

        Returns:
            'gene_id "<digits>"' using at most the first nine digits, or
            'gene_id "NA"' when no ID followed by a digit is present.
        """
        match = self._ID_PATTERN.search(line)
        if match is None:
            return 'gene_id "NA"'
        return 'gene_id "' + match.group(1) + '"'

    def get_activity(self, line):
        """
        Report, per configured category, whether the line mentions it.

        Parameters:
            line: concatenated trailing columns of one BED row.

        Returns:
            list with one "<category>>ACTIVE" or "<category>>NA" entry per
            key of ``self.ucsc_categories``, in that dict's order. A
            category is ACTIVE when any of its keywords occurs in ``line``;
            categories without keywords are always NA.
        """
        key_status = []
        for key, keywords in self.ucsc_categories.items():
            if keywords and any(keyword in line for keyword in keywords):
                key_status.append(key + ">ACTIVE")
            else:
                key_status.append(key + ">NA")
        return key_status

    @staticmethod
    def get_organism_id(org):
        """
        Convert the internal organism name to its UCSC assembly alias.

        Parameters:
            org: organism name, e.g. "homo_sapiens".

        Returns:
            "hg38" for "homo_sapiens", "mm10" for "mus_musculus", otherwise
            None.
        """
        return UcscGtf._UCSC_ASSEMBLIES.get(org)

    @staticmethod
    def get_activity_categories(organism, wd):
        """
        Load the celltype categories from the JSON config file.

        Reads ``<wd>/config/celltypes_<organism>.json``, which is iterated
        as a sequence of objects each providing "type" and "alias_ucsc".

        Parameters:
            organism: organism name used in the config file name.
            wd: working directory containing the ``config`` folder.

        Returns:
            dict mapping each entry's "type" to its "alias_ucsc" value.
        """
        path_to_config = wd + "/config/celltypes_" + organism + ".json"
        with open(path_to_config) as input_file:
            data = json.load(input_file)
        return {entry["type"]: entry["alias_ucsc"] for entry in data}

    def get_gtf(self):
        """
        Return the parsed GTF rows.

        Returns:
            the list built by ``read_gff_to_gtf`` (one list of column
            strings per kept BED row).
        """
        return self.gtf_lines