# ucsc.py

import subprocess
import csv
import os
import json
import re


class UcscGtf:

    """

        Class to gather data from UCSC Table Browsers, RefFuncGen Tracks.

        On construction the bigBedToBed binary is run on the organism's
        refSeqFuncElems bigBed file, the resulting BED file is read and its
        rows are converted into GTF-style field lists (see get_gtf).

        @author: Sebastian Beyvers
        @contact: sebastian.beyvers@med.uni-giessen.de

    """

    # Feature types (BED column 10) that are never exported to GTF.
    _SKIPPED_FEATURES = frozenset([
        "region", "sequence_feature",
        "CAAT_signal", "stem_loop",
        "sequence_secondary_structure",
    ])

    # "ID:" followed by at most nine digits; the digits are captured.
    # Compiled once here instead of once per BED row.
    _ID_PATTERN = re.compile(r'ID:([0-9]{,9})')

    def __init__(self, org, wd, data_dir):

        # Constructor for UcscGtf: downloads/converts the BED file and parses it.
        # input parameter: org      = organism ("homo_sapiens" or "mus_musculus")
        #                  wd       = working directory
        #                  data_dir = data_directory (if defined this is used to save and get data)
        # raises: ValueError if org has no known UCSC assembly

        self.organism_id = self.get_organism_id(org)
        if self.organism_id is None:
            # Fail early with a clear message instead of a TypeError from
            # concatenating None into the download link below.
            raise ValueError("Unsupported organism " + repr(org)
                             + "; expected 'homo_sapiens' or 'mus_musculus'")

        # Link to the UCSC bigBed file of this assembly

        self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb"

        # Where to save the output file
        if data_dir:
            # NOTE(review): there is no separator between "UCSCData" and the
            # assembly name, so the file is "<data_dir>/UCSCData<id>.bed".
            # Kept unchanged because other code may rely on this location.
            self.output = os.path.join(data_dir + "/UCSCData" + self.organism_id+".bed")
        else:
            self.output = os.path.join(wd + "/data/UCSCData/" + self.organism_id+".bed")
        # Determine path to bigBedToBed binary.
        self.path_to_bin = os.path.join(wd + "/Modules/ucsc/bigBedToBed")
        print("Getting UCSC Data")
        print("Path to Bin: " + self.path_to_bin)
        self.generate_gff_file()
        self.ucsc_categories = self.get_activity_categories(org, wd)
        self.gtf_lines = self.read_gff_to_gtf()
        print("UCSC finished !")

    def generate_gff_file(self):

        # Call bigBedToBed binary to get a BED-file at self.output.
        # A non-zero exit code is reported but does not abort, so a BED file
        # that is already on disk can still be used.

        callstring = [self.path_to_bin, self.link, self.output]
        return_code = subprocess.call(callstring)
        if return_code != 0:
            print("Warning: bigBedToBed exited with code " + str(return_code)
                  + "; " + self.output + " may be missing or outdated")

    def read_gff_to_gtf(self):

        # Reads BED-file and return a GTF-formatted list of elements.
        # Rows with fewer than 10 columns (e.g. blank lines) are skipped.
        # return_value: GTF-formatted List of regulation entries from UCSC,
        #               each entry is a list of the nine GTF fields.

        gtf_lines = []
        with open(self.output, 'r') as csvfile:
            for row in csv.reader(csvfile, delimiter='\t'):
                # Without column 10 there is no feature type to export.
                if len(row) < 10:
                    continue

                feature = row[9]
                if feature in self._SKIPPED_FEATURES:
                    continue

                # All columns from the 12th onwards are searched for the
                # ID and for the cell-type keywords.
                description = ''.join(row[11:])
                activity = ", ".join(self.get_activity(description))
                attributes = self.find_ID(description) + '; activity "' + activity + '";'

                # NOTE(review): BED start positions are usually 0-based while
                # GTF is 1-based; the start is copied unchanged - confirm.
                gtf_lines.append([
                    row[0],
                    "UCSC",
                    feature.lower().replace(' ', '_'),
                    row[1],
                    row[2],
                    ".",
                    row[5],
                    ".",
                    attributes,
                ])

        return gtf_lines

    def find_ID(self, line):

        # Find the first "ID:<digits>" occurrence in line
        # input_parameter: line = current line from BED-file
        # return_value: string with gene_id in GTF-format,
        #               'gene_id "NA"' if the line contains no "ID:"

        match = self._ID_PATTERN.search(line)
        if match is None:
            return 'gene_id "NA"'

        return 'gene_id "' + match.group(1) + '"'

    def get_activity(self, line):

        # Find activity categories in BED-file
        # input_parameter: line = current line from BED-file
        # return_value: list with one "<category>>ACTIVE" or "<category>>NA"
        #               entry per configured category ("keystatus")

        key_status = []
        for key, value in self.ucsc_categories.items():
            # A category without aliases can never be active.
            if value and any(keyword in line for keyword in value):
                key_status.append(key + ">ACTIVE")
            else:
                key_status.append(key + ">NA")
        return key_status

    @staticmethod
    def get_organism_id(org):

        # convert intern name e.g. "homo_sapiens" to ucsc name "hg38".
        # input_parameter: org = organism parameter
        # return_value: UCSC alias for this organism [ mm10 | hg38 ],
        #               None for any other organism

        if org == "homo_sapiens":
            return "hg38"
        elif org == "mus_musculus":
            return "mm10"
        return None

    @staticmethod
    def get_activity_categories(organism, wd):

        # Method to get ucsc-celltype categories from JSON config
        # input_parameter: organism = organism parameter
        #                  wd       = working directory, to find config file
        # return_value: dict mapping each entry's "type" to its "alias_ucsc" value.

        path_to_config = os.path.join(wd+"/config/celltypes_" + organism + ".json")
        with open(path_to_config) as input_file:
            data = json.load(input_file)

        return {entry["type"]: entry["alias_ucsc"] for entry in data}

    def get_gtf(self):

        # Getter method for resulting gtf-lines
        # return_value: List of GTF entries (each a list of field strings)

        return self.gtf_lines
	import subprocess
	import csv
	import os
	import json
	import re


	class UcscGtf:

	    """

	        Class to gather data from UCSC Table Browsers, RefFuncGen Tracks.

	        On construction the bigBedToBed binary is run on the organism's
	        refSeqFuncElems bigBed file, the resulting BED file is read and its
	        rows are converted into GTF-style field lists (see get_gtf).

	        @author: Sebastian Beyvers
	        @contact: sebastian.beyvers@med.uni-giessen.de

	    """

	    # Feature types (BED column 10) that are never exported to GTF.
	    _SKIPPED_FEATURES = frozenset([
	        "region", "sequence_feature",
	        "CAAT_signal", "stem_loop",
	        "sequence_secondary_structure",
	    ])

	    # "ID:" followed by at most nine digits; the digits are captured.
	    # The flattened copy of this pattern ended in an escaped "\|$", which
	    # only matched a literal "|" at the end of the text; a capture group
	    # avoids the alternation altogether.
	    _ID_PATTERN = re.compile(r'ID:([0-9]{,9})')

	    def __init__(self, org, wd, data_dir):

	        # Constructor for UcscGtf: downloads/converts the BED file and parses it.
	        # input parameter: org      = organism ("homo_sapiens" or "mus_musculus")
	        #                  wd       = working directory
	        #                  data_dir = data_directory (if defined this is used to save and get data)
	        # raises: ValueError if org has no known UCSC assembly

	        self.organism_id = self.get_organism_id(org)
	        if self.organism_id is None:
	            # Fail early with a clear message instead of a TypeError from
	            # concatenating None into the download link below.
	            raise ValueError("Unsupported organism " + repr(org)
	                             + "; expected 'homo_sapiens' or 'mus_musculus'")

	        # Link to the UCSC bigBed file of this assembly

	        self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb"

	        # Where to save the output file
	        if data_dir:
	            # NOTE(review): there is no separator between "UCSCData" and the
	            # assembly name, so the file is "<data_dir>/UCSCData<id>.bed".
	            # Kept unchanged because other code may rely on this location.
	            self.output = os.path.join(data_dir + "/UCSCData" + self.organism_id+".bed")
	        else:
	            self.output = os.path.join(wd + "/data/UCSCData/" + self.organism_id+".bed")
	        # Determine path to bigBedToBed binary.
	        self.path_to_bin = os.path.join(wd + "/Modules/ucsc/bigBedToBed")
	        print("Getting UCSC Data")
	        print("Path to Bin: " + self.path_to_bin)
	        self.generate_gff_file()
	        self.ucsc_categories = self.get_activity_categories(org, wd)
	        self.gtf_lines = self.read_gff_to_gtf()
	        print("UCSC finished !")

	    def generate_gff_file(self):

	        # Call bigBedToBed binary to get a BED-file at self.output.
	        # A non-zero exit code is reported but does not abort, so a BED file
	        # that is already on disk can still be used.

	        callstring = [self.path_to_bin, self.link, self.output]
	        return_code = subprocess.call(callstring)
	        if return_code != 0:
	            print("Warning: bigBedToBed exited with code " + str(return_code)
	                  + "; " + self.output + " may be missing or outdated")

	    def read_gff_to_gtf(self):

	        # Reads BED-file and return a GTF-formatted list of elements.
	        # Rows with fewer than 10 columns (e.g. blank lines) are skipped.
	        # return_value: GTF-formatted List of regulation entries from UCSC,
	        #               each entry is a list of the nine GTF fields.

	        gtf_lines = []
	        with open(self.output, 'r') as csvfile:
	            for row in csv.reader(csvfile, delimiter='\t'):
	                # Without column 10 there is no feature type to export.
	                if len(row) < 10:
	                    continue

	                feature = row[9]
	                if feature in self._SKIPPED_FEATURES:
	                    continue

	                # All columns from the 12th onwards are searched for the
	                # ID and for the cell-type keywords.
	                description = ''.join(row[11:])
	                activity = ", ".join(self.get_activity(description))
	                attributes = self.find_ID(description) + '; activity "' + activity + '";'

	                # NOTE(review): BED start positions are usually 0-based while
	                # GTF is 1-based; the start is copied unchanged - confirm.
	                gtf_lines.append([
	                    row[0],
	                    "UCSC",
	                    feature.lower().replace(' ', '_'),
	                    row[1],
	                    row[2],
	                    ".",
	                    row[5],
	                    ".",
	                    attributes,
	                ])

	        return gtf_lines

	    def find_ID(self, line):

	        # Find the first "ID:<digits>" occurrence in line
	        # input_parameter: line = current line from BED-file
	        # return_value: string with gene_id in GTF-format,
	        #               'gene_id "NA"' if the line contains no "ID:"

	        match = self._ID_PATTERN.search(line)
	        if match is None:
	            return 'gene_id "NA"'

	        return 'gene_id "' + match.group(1) + '"'

	    def get_activity(self, line):

	        # Find activity categories in BED-file
	        # input_parameter: line = current line from BED-file
	        # return_value: list with one "<category>>ACTIVE" or "<category>>NA"
	        #               entry per configured category ("keystatus")

	        key_status = []
	        for key, value in self.ucsc_categories.items():
	            # A category without aliases can never be active.
	            if value and any(keyword in line for keyword in value):
	                key_status.append(key + ">ACTIVE")
	            else:
	                key_status.append(key + ">NA")
	        return key_status

	    @staticmethod
	    def get_organism_id(org):

	        # convert intern name e.g. "homo_sapiens" to ucsc name "hg38".
	        # input_parameter: org = organism parameter
	        # return_value: UCSC alias for this organism [ mm10 | hg38 ],
	        #               None for any other organism

	        if org == "homo_sapiens":
	            return "hg38"
	        elif org == "mus_musculus":
	            return "mm10"
	        return None

	    @staticmethod
	    def get_activity_categories(organism, wd):

	        # Method to get ucsc-celltype categories from JSON config
	        # input_parameter: organism = organism parameter
	        #                  wd       = working directory, to find config file
	        # return_value: dict mapping each entry's "type" to its "alias_ucsc" value.

	        path_to_config = os.path.join(wd+"/config/celltypes_" + organism + ".json")
	        with open(path_to_config) as input_file:
	            data = json.load(input_file)

	        return {entry["type"]: entry["alias_ucsc"] for entry in data}

	    def get_gtf(self):

	        # Getter method for resulting gtf-lines
	        # return_value: List of GTF entries (each a list of field strings)

	        return self.gtf_lines