import subprocess
import csv
import os
import json
import re


class UcscGtf:

    """
    Class to gather ucsc refSeq-FuncElem data.

    """

    def __init__(self, org, wd, data_dir):
        self.organism_id = self.get_organism_id(org)
        self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb"
        if data_dir:
            self.output = os.path.join(data_dir + "/UCSCData" + self.organism_id+".bed")
        else:
            self.output = os.path.join(wd + "/data/UCSCData/" + self.organism_id+".bed")
        self.path_to_bin = os.path.join(wd + "/Modules/ucsc/bigBedToBed")
        print("Getting UCSC Data")
        print("Path to Bin: " + self.path_to_bin)
        self.generate_gff_file()
        self.ucsc_categories = self.get_activity_categories(org, wd)
        self.gtf_lines = self.read_gff_to_gtf()
        print("UCSC finished !")

    def generate_gff_file(self):
        # Call bigBedToBed binary to get a Bed file in the UCSCData folder
        callstring = [self.path_to_bin, self.link, self.output]
        subprocess.call(callstring)

    def read_gff_to_gtf(self):
        # Reads Bed File and return a gtf-formatted list of elements.
        gtf_lines = []
        with open(self.output, 'r') as csvfile:
            tsvreader = csv.reader(csvfile, delimiter='\t')
            for row in tsvreader:
                sequence = []
                sequence.append(row[0])
                sequence.append("UCSC")
                sequence.append(row[3].lower())
                sequence.append(row[1])
                sequence.append(row[2])
                sequence.append(".")
                sequence.append(row[5])
                sequence.append(".")
                sequence.append('; '.join([self.find_ID(''.join(row[11:])), 'activity \"'+", ".join(self.get_activity(''.join(row[11:]))) + '"']))
                gtf_lines.append(sequence)

            return gtf_lines

    def find_ID(self, line):
        # Find RefSeq ID in Line
        pattern = re.compile(r'ID:[0-9]{,9}|$')
        ref_id = re.search(pattern, line).group()
        splitted = ref_id.split(":")
        if len(splitted) == 2:
            returnstring = str(splitted[0])+' "'+str(splitted[1])+'"'
        else:
            returnstring = 'ID '+'"NA"'

        return returnstring

    def get_activity(self, line):
        # Find activity categories in bed file
        key_status = []
        for key, value in self.ucsc_categories.items():
            if value:
                if any([line.find(keyword) != -1 for keyword in value]):
                    key_status.append(key+">ACTIVE")
                else:
                    key_status.append(key + ">NA")
            else:
                key_status.append(key + ">NA")
        return key_status

    @staticmethod
    def get_organism_id(org):
        # convert intern name e.g. "homo_sapiens" to ucsc name "hg38".
        if org == "homo_sapiens":
            return "hg38"
        elif org == "mus_musculus":
            return "mm10"

    @staticmethod
    def get_activity_categories(organism, wd):
        # Method to get ucsc-celltype categories from JSON config
        path_to_config = os.path.join(wd+"/config/celltypes_" + organism + ".json")
        categories = {}
        with open(path_to_config) as input_file:
            data = json.loads(input_file.read())
            for x in data:
                categories[x["type"]] = x["alias_ucsc"]

        return categories

    def get_gtf(self):
        return self.gtf_lines