# ucsc.py

import subprocess
import csv
import os
import json
import re


class UcscGtf:

    """
    Class to gather ucsc refSeq-FuncElem data.

    On construction the refSeqFuncElems bigBed track of the organism is
    converted to a BED file with the bundled ``bigBedToBed`` binary, the
    activity categories are loaded from the JSON config, and the BED rows are
    converted to GTF-like rows that ``get_gtf`` returns.
    """

    # Internal organism names mapped to the UCSC assembly names.
    _ORGANISM_IDS = {"homo_sapiens": "hg38", "mus_musculus": "mm10"}

    # "ID:" followed by up to nine digits. The empty "$" alternative makes the
    # search always return a match object, even for lines without an ID.
    _ID_PATTERN = re.compile(r'ID:[0-9]{,9}|$')

    def __init__(self, org, wd, data_dir):
        """
        Download, convert and parse the UCSC data for one organism.

        :param org: internal organism name ("homo_sapiens" or "mus_musculus").
        :param wd: base directory holding ``Modules/ucsc/bigBedToBed`` and
            ``config/celltypes_<org>.json``.
        :param data_dir: directory for downloaded data; when falsy the BED
            file is written below ``<wd>/data`` instead.
        :raises ValueError: if ``org`` has no known UCSC assembly.
        """
        self.organism_id = self.get_organism_id(org)
        if self.organism_id is None:
            # Fail early with a clear message instead of a TypeError from
            # concatenating None into the download link below.
            raise ValueError("Unsupported organism for UCSC data: " + repr(org))
        self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb"
        # Both branches place the file inside a "UCSCData" folder.
        if data_dir:
            output_dir = os.path.join(data_dir, "UCSCData")
        else:
            output_dir = os.path.join(wd, "data", "UCSCData")
        # The converter cannot write into a folder that does not exist yet.
        os.makedirs(output_dir, exist_ok=True)
        self.output = os.path.join(output_dir, self.organism_id + ".bed")
        self.path_to_bin = os.path.join(wd, "Modules", "ucsc", "bigBedToBed")
        print("Getting UCSC Data")
        print("Path to Bin: " + self.path_to_bin)
        self.generate_gff_file()
        self.ucsc_categories = self.get_activity_categories(org, wd)
        self.gtf_lines = self.read_gff_to_gtf()
        print("UCSC finished !")

    def generate_gff_file(self):
        """
        Call the bigBedToBed binary to write the BED file to ``self.output``.

        :raises RuntimeError: if the binary exits with a non-zero status and
            no BED file is available at ``self.output``.
        """
        return_code = subprocess.call([self.path_to_bin, self.link, self.output])
        if return_code != 0:
            if not os.path.isfile(self.output):
                raise RuntimeError(
                    "bigBedToBed failed with exit code " + str(return_code)
                    + " and no BED file exists at " + self.output)
            # A BED file is already present, so keep going with it rather
            # than aborting the whole run.
            print("Warning: bigBedToBed exited with code " + str(return_code)
                  + "; using existing file " + self.output)

    def read_gff_to_gtf(self):
        """
        Read the BED file and return a gtf-formatted list of elements.

        :return: list of rows, each a list of nine strings: sequence name,
            the source "UCSC", lower-cased BED column 4, start, end, ".",
            strand (BED column 6), "." and the attribute string with the ID
            and the activity categories.
        """
        gtf_lines = []
        # newline='' is the mode the csv module asks for on file objects.
        with open(self.output, 'r', newline='') as csvfile:
            for row in csv.reader(csvfile, delimiter='\t'):
                if not row:
                    # csv.reader yields [] for blank lines; nothing to convert.
                    continue
                # Everything from the 12th column on is searched as one string.
                details = ''.join(row[11:])
                attributes = '; '.join([
                    self.find_ID(details),
                    'activity "' + ", ".join(self.get_activity(details)) + '"',
                ])
                # NOTE(review): BED starts are 0-based while GTF starts are
                # 1-based; the start is copied unchanged here - confirm that
                # downstream consumers expect this.
                gtf_lines.append([
                    row[0],
                    "UCSC",
                    row[3].lower(),
                    row[1],
                    row[2],
                    ".",
                    row[5],
                    ".",
                    attributes,
                ])

        return gtf_lines

    def find_ID(self, line):
        """
        Find the RefSeq ID in a line.

        :param line: text to search for an "ID:<digits>" token.
        :return: ``ID "<digits>"`` for the first token found, otherwise
            ``ID "NA"``.
        """
        ref_id = self._ID_PATTERN.search(line).group()
        parts = ref_id.split(":")
        if len(parts) == 2:
            return parts[0] + ' "' + parts[1] + '"'
        # The empty end-of-line match splits into a single element.
        return 'ID "NA"'

    def get_activity(self, line):
        """
        Find the activity categories mentioned in a BED line.

        :param line: text searched for the aliases of every category.
        :return: one "<category>>ACTIVE" or "<category>>NA" entry per key of
            ``self.ucsc_categories``, in the mapping's iteration order.
        """
        key_status = []
        for key, aliases in self.ucsc_categories.items():
            # Categories without aliases can never be active.
            is_active = bool(aliases) and any(alias in line for alias in aliases)
            key_status.append(key + (">ACTIVE" if is_active else ">NA"))
        return key_status

    @staticmethod
    def get_organism_id(org):
        """
        Convert the intern name e.g. "homo_sapiens" to the ucsc name "hg38".

        :param org: internal organism name.
        :return: the UCSC assembly name, or None for an unknown organism.
        """
        return UcscGtf._ORGANISM_IDS.get(org)

    @staticmethod
    def get_activity_categories(organism, wd):
        """
        Get the ucsc-celltype categories from the JSON config.

        :param organism: internal organism name used in the config file name.
        :param wd: base directory that contains the ``config`` folder.
        :return: dict mapping each entry's "type" to its "alias_ucsc" value.
        """
        path_to_config = os.path.join(wd, "config", "celltypes_" + organism + ".json")
        with open(path_to_config, encoding="utf-8") as input_file:
            data = json.load(input_file)
        return {entry["type"]: entry["alias_ucsc"] for entry in data}

    def get_gtf(self):
        """Return the GTF-like rows built during construction."""
        return self.gtf_lines
	import subprocess
	import csv
	import os
	import json
	import re


	class UcscGtf:

	    """
	    Class to gather ucsc refSeq-FuncElem data.

	    On construction the refSeqFuncElems bigBed track of the organism is
	    converted to a BED file with the bundled ``bigBedToBed`` binary, the
	    activity categories are loaded from the JSON config, and the BED rows are
	    converted to GTF-like rows that ``get_gtf`` returns.
	    """

	    # Internal organism names mapped to the UCSC assembly names.
	    _ORGANISM_IDS = {"homo_sapiens": "hg38", "mus_musculus": "mm10"}

	    # "ID:" followed by up to nine digits. The unescaped "|" is an
	    # alternation: the empty "$" alternative makes the search always return
	    # a match object, even for lines without an ID.
	    _ID_PATTERN = re.compile(r'ID:[0-9]{,9}|$')

	    def __init__(self, org, wd, data_dir):
	        """
	        Download, convert and parse the UCSC data for one organism.

	        :param org: internal organism name ("homo_sapiens" or "mus_musculus").
	        :param wd: base directory holding ``Modules/ucsc/bigBedToBed`` and
	            ``config/celltypes_<org>.json``.
	        :param data_dir: directory for downloaded data; when falsy the BED
	            file is written below ``<wd>/data`` instead.
	        :raises ValueError: if ``org`` has no known UCSC assembly.
	        """
	        self.organism_id = self.get_organism_id(org)
	        if self.organism_id is None:
	            # Fail early with a clear message instead of a TypeError from
	            # concatenating None into the download link below.
	            raise ValueError("Unsupported organism for UCSC data: " + repr(org))
	        self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb"
	        # Both branches place the file inside a "UCSCData" folder.
	        if data_dir:
	            output_dir = os.path.join(data_dir, "UCSCData")
	        else:
	            output_dir = os.path.join(wd, "data", "UCSCData")
	        # The converter cannot write into a folder that does not exist yet.
	        os.makedirs(output_dir, exist_ok=True)
	        self.output = os.path.join(output_dir, self.organism_id + ".bed")
	        self.path_to_bin = os.path.join(wd, "Modules", "ucsc", "bigBedToBed")
	        print("Getting UCSC Data")
	        print("Path to Bin: " + self.path_to_bin)
	        self.generate_gff_file()
	        self.ucsc_categories = self.get_activity_categories(org, wd)
	        self.gtf_lines = self.read_gff_to_gtf()
	        print("UCSC finished !")

	    def generate_gff_file(self):
	        """
	        Call the bigBedToBed binary to write the BED file to ``self.output``.

	        :raises RuntimeError: if the binary exits with a non-zero status and
	            no BED file is available at ``self.output``.
	        """
	        return_code = subprocess.call([self.path_to_bin, self.link, self.output])
	        if return_code != 0:
	            if not os.path.isfile(self.output):
	                raise RuntimeError(
	                    "bigBedToBed failed with exit code " + str(return_code)
	                    + " and no BED file exists at " + self.output)
	            # A BED file is already present, so keep going with it rather
	            # than aborting the whole run.
	            print("Warning: bigBedToBed exited with code " + str(return_code)
	                  + "; using existing file " + self.output)

	    def read_gff_to_gtf(self):
	        """
	        Read the BED file and return a gtf-formatted list of elements.

	        :return: list of rows, each a list of nine strings: sequence name,
	            the source "UCSC", lower-cased BED column 4, start, end, ".",
	            strand (BED column 6), "." and the attribute string with the ID
	            and the activity categories.
	        """
	        gtf_lines = []
	        # newline='' is the mode the csv module asks for on file objects.
	        with open(self.output, 'r', newline='') as csvfile:
	            for row in csv.reader(csvfile, delimiter='\t'):
	                if not row:
	                    # csv.reader yields [] for blank lines; nothing to convert.
	                    continue
	                # Everything from the 12th column on is searched as one string.
	                details = ''.join(row[11:])
	                attributes = '; '.join([
	                    self.find_ID(details),
	                    'activity "' + ", ".join(self.get_activity(details)) + '"',
	                ])
	                # NOTE(review): BED starts are 0-based while GTF starts are
	                # 1-based; the start is copied unchanged here - confirm that
	                # downstream consumers expect this.
	                gtf_lines.append([
	                    row[0],
	                    "UCSC",
	                    row[3].lower(),
	                    row[1],
	                    row[2],
	                    ".",
	                    row[5],
	                    ".",
	                    attributes,
	                ])

	        return gtf_lines

	    def find_ID(self, line):
	        """
	        Find the RefSeq ID in a line.

	        :param line: text to search for an "ID:<digits>" token.
	        :return: ``ID "<digits>"`` for the first token found, otherwise
	            ``ID "NA"``.
	        """
	        ref_id = self._ID_PATTERN.search(line).group()
	        parts = ref_id.split(":")
	        if len(parts) == 2:
	            return parts[0] + ' "' + parts[1] + '"'
	        # The empty end-of-line match splits into a single element.
	        return 'ID "NA"'

	    def get_activity(self, line):
	        """
	        Find the activity categories mentioned in a BED line.

	        :param line: text searched for the aliases of every category.
	        :return: one "<category>>ACTIVE" or "<category>>NA" entry per key of
	            ``self.ucsc_categories``, in the mapping's iteration order.
	        """
	        key_status = []
	        for key, aliases in self.ucsc_categories.items():
	            # Categories without aliases can never be active.
	            is_active = bool(aliases) and any(alias in line for alias in aliases)
	            key_status.append(key + (">ACTIVE" if is_active else ">NA"))
	        return key_status

	    @staticmethod
	    def get_organism_id(org):
	        """
	        Convert the intern name e.g. "homo_sapiens" to the ucsc name "hg38".

	        :param org: internal organism name.
	        :return: the UCSC assembly name, or None for an unknown organism.
	        """
	        return UcscGtf._ORGANISM_IDS.get(org)

	    @staticmethod
	    def get_activity_categories(organism, wd):
	        """
	        Get the ucsc-celltype categories from the JSON config.

	        :param organism: internal organism name used in the config file name.
	        :param wd: base directory that contains the ``config`` folder.
	        :return: dict mapping each entry's "type" to its "alias_ucsc" value.
	        """
	        path_to_config = os.path.join(wd, "config", "celltypes_" + organism + ".json")
	        with open(path_to_config, encoding="utf-8") as input_file:
	            data = json.load(input_file)
	        return {entry["type"]: entry["alias_ucsc"] for entry in data}

	    def get_gtf(self):
	        """Return the GTF-like rows built during construction."""
	        return self.gtf_lines