# NOTE(review): removed GitHub web-page scrape residue ("Skip to content",
# branch/tag switcher text, "Cannot retrieve contributors at this time") that
# was captured along with the file and is not part of the source.
import math
import numpy as np
import pandas as pd
import data_generation as dg
from correlation_measures.binning import Binning
from data_generation import correlated_data
import experiments_logging as log
# bins count
UDS_BETA = 20
def compute_cond_CE(data, dim, I, point_ids):
    """Conditional cumulative entropy of dimension `dim` given the partition `I`.

    Each cell of `I` contributes the CE of `dim` restricted to the points of
    that cell (further intersected with `point_ids`), weighted by the cell's
    share of the full data set.
    """
    total = data.shape[0]
    weighted_terms = [
        len(cell) / total * compute_CE(data.loc[point_ids.intersection(cell), dim])
        for cell in I
    ]
    return sum(weighted_terms)
# discretization of the next dimension
def dim_optimal_disc(curr, prev, I, data):
    """Optimal discretization of dimension `prev` w.r.t. dimension `curr`.

    Dynamic program over merged equal-frequency bins: for every target bin
    count l it finds the merging of the UDS_BETA base bins of `prev` that
    minimizes the conditional CE of `curr` given the current partition `I`.

    Returns (val[-1], b[-1]): the optimal costs and the corresponding
    discretizations (lists of point-id Index objects) for every l.
    """
    binning = Binning(data, prev, UDS_BETA)
    binned_points = binning.equal_frequency_binning_by_rank_int_categories()
    # Series with cumulative bin support (number of points up to each bin).
    support = binned_points.value_counts().sort_index().cumsum()
    b = []
    val = []
    merged_bins = []
    # Cost of every single merged bin (transposed w.r.t. the paper):
    # i - row, upper bound; j - column, lower bound.
    # todo worth considering implementation of the ufunc in C (slow)
    f = []
    # upper bound
    for i in range(UDS_BETA):
        f_row = []
        merged_bins_row = []
        # lower bound
        for j in range(i + 1):
            merged_bin = binned_points[np.logical_and(binned_points <= i, binned_points >= j)].index
            merged_bins_row.append(merged_bin)
            f_row.append(compute_cond_CE(data, curr, I, merged_bin))
        f.append(f_row)
        merged_bins.append(merged_bins_row)
        b.append([[merged_bins_row[0]]])
        val.append([f_row[0]])
    # DP over the number of bins l and the upper bound i of the last bin.
    for l in range(1, UDS_BETA):
        for i in range(l, UDS_BETA):
            min_cost = None
            arg_min = None
            for j in range(l - 1, i):
                temp_cost = (support[i] - support[j]) / support[i] * f[i][j + 1] \
                    + support[j] / support[i] * val[j][l - 1]
                # BUGFIX: compare against None explicitly — `not min_cost` also
                # fired on a legitimate cost of 0.0 and discarded the optimum.
                if min_cost is None or temp_cost < min_cost:
                    min_cost = temp_cost
                    arg_min = j
            val[i].append(min_cost)
            disc = b[arg_min][l - 1].copy()
            disc.append(merged_bins[i][arg_min + 1])
            b[i].append(disc)
    return val[-1], b[-1]
def compute_CEs(data):
    """Return the cumulative entropy of each column of `data`, in column order."""
    return [compute_CE(data[col]) for col in range(data.shape[1])]
def compute_CE(data):
    """Cumulative entropy of a numeric Series.

    Sorts the values, takes consecutive gaps, and folds them into
    -log2(prod_i ((i+1)/m) ** ((i+1) * gap_i / m)).
    Returns 0 for fewer than two points; a CE of exactly 0 is mapped to +inf.
    """
    m = data.shape[0]
    if m <= 1:
        return 0
    ordered = data.sort_values()
    # Gaps between consecutive order statistics (positional, index-free).
    gaps = (ordered[1:] - ordered.shift(1)[1:]).reset_index(drop=True)
    factors = pd.Series(
        [((k + 1) / m) ** ((k + 1) * gaps[k] / m) for k in range(len(gaps))]
    )
    ce = -math.log(factors.prod(), 2)
    return float('inf') if ce == 0 else ce
def entropy(I, N):
    """Shannon entropy (base 2) of the partition `I` over N points."""
    total = 0.0
    for cell in I:
        p = len(cell) / N
        total += p * math.log(p, 2)
    return -total
# compute permutation
def compute_permutation(CEs):
    """Return dimension indices ordered by decreasing cumulative entropy."""
    return np.argsort(CEs)[::-1].tolist()
def extend_I(I, disc):
    """Refine partition `I` by intersecting every cell with every bin of `disc`.

    Empty intersections are dropped.
    """
    refined = []
    for cell in I:
        for bin_ids in disc:
            common = cell.intersection(bin_ids)
            # todo python361
            # if not common.empty:
            # todo python342
            if common.size > 0:
                refined.append(common)
    return refined
def compute_uds(data):
    """Compute the UDS score of a DataFrame.

    Processes dimensions in order of decreasing cumulative entropy; for each
    dimension it picks the discretization of the previously processed
    dimension that minimizes a regularized conditional-CE cost, accumulates
    the CE reduction, and normalizes by the total CE of the non-first
    dimensions.
    """
    # Relabel columns with positional integers so integer lookups below work.
    data = data.rename(columns={data.columns[i]: i for i in range(len(data.columns))})
    # compute CE for all the dimensions
    CEs = compute_CEs(data)
    perm = compute_permutation(CEs)
    # Current partition: list of point-id Index cells (starts with all points).
    I = [data.index]
    es = []  # chosen discretization levels l, one per processed dimension
    uds = 0
    prev = perm[0]
    for dim in perm[1:]:
        scores, discs = dim_optimal_disc(dim, prev, I, data)
        # Regularization step: pick the level l minimizing the penalized cost.
        opt_cost = None
        opt_l = None
        opt_I = None
        opt_score = None
        for l, score in enumerate(scores):
            temp_I = extend_I(I, discs[l])
            temp_cost = score / CEs[dim] + (entropy(temp_I, len(data)) / (
                math.log(UDS_BETA, 2) + sum([math.log(e + 1, 2) for e in es])))
            # BUGFIX: compare against None explicitly — `not opt_cost` also
            # fired on a legitimate cost of 0.0 and discarded the optimum.
            if opt_cost is None or temp_cost < opt_cost:
                opt_cost = temp_cost
                opt_score = score
                opt_l = l
                opt_I = temp_I
        I = opt_I
        es.append(opt_l)
        uds += CEs[dim] - opt_score
        prev = dim
    uds /= sum(CEs[1:])
    return uds
if __name__ == "__main__":
    # Demo entry point: build a synthetic correlated data set, compute its UDS
    # score and plot the data.  Alternative inputs are kept commented out below.
    # data = pd.read_csv('synthetic_cases/uds_new.csv', delimiter=',', header=None) # uds_new.csv 0.361766479055
    data = pd.DataFrame(dg.correlated_data(4000, 2, 0.1, dg.func2))
    # data = pd.read_csv('new_cubes/cubes_02_03_c.csv', delimiter=';', header=None)
    # Keep at most the first four columns.
    data = data.loc[:, :3]
    # data = pd.DataFrame(correlated_data(4000, 20, 10, dg.func2))
    # data = pd.DataFrame(generate_correlated_data(1000, 10, 2, func1))
    # data = pd.DataFrame(generate_uncorrelated_data(4000, 20))
    # classLabels = data.pop(len(data.columns) - 1)
    uds = compute_uds(data)
    # print(uds)
    log.plot_data_2d(data)
    # print(es)
    # Reference output from a previous run, kept for comparison:
    # compute
    # for prev dim 2 with dim 1
    # optimal
    # disc[[-1.6501946015802218, 5.6783209693758776], [5.6809485788581906, 16.952503563781992], [17.021263419339629,
    # 91.038762481183142]]
    # compute
    # for prev dim 1 with dim 0
    # optimal
    # disc[[-1.5583644544233035, 3.9951064626841521], [4.001463149284902, 7.5090488371807727], [7.5256579015119875,
    # 81.314708533537967]]
    # 0.361758255598