# NOTE: this file was recovered from a GitHub page export.
# Original commit: 92e72c3bf7 — 142 lines (108 sloc), 3.92 KB.
import math
import pandas as pd
import numpy as np
from correlation_measures.binning import Binning
# bins count
BETA = 20
def compute_cond_CE(data, dim, I, point_ids):
    """Conditional cumulative entropy of dimension `dim` given partition `I`.

    Each cell of `I` contributes its population share times the CE of
    `dim` restricted to the points of the cell that are also in `point_ids`.
    """
    n = data.shape[0]
    total = 0
    for cell in I:
        weight = len(cell) / n
        total += weight * compute_CE(data.loc[point_ids.intersection(cell), dim])
    return total
# discretization of the next dimension
def dim_optimal_disc(prev, curr, binning, I, data):
    """Optimal discretization of dimension `curr` conditioned on `prev`.

    Dynamic program over merged equal-frequency bins (BETA initial bins):
    f[i][j] is the conditional CE cost of the single merged bin [j, i];
    val[i][l] / b[i][l] are the optimal cost / bin list for discretizing
    bins 0..i into l+1 parts.  Returns (val of last row, b of last row),
    i.e. the optimal costs and discretizations for every target bin count.
    """
    binned_points = binning.equal_frequency_binning2(prev, BETA)
    # Series with bins support (cumulative point counts per bin)
    support = binned_points.value_counts().sort_index().cumsum()
    b = []
    val = []
    merged_bins = []
    # cost of every single merged bin (transposed unlike in the paper):
    # i - row, upper bound; j - column, lower bound
    # todo worth considering implementation of the ufunc in C (slow)
    f = []
    # upper bound
    for i in range(BETA):
        f_row = []
        merged_bins_row = []
        # lower bound
        for j in range(i + 1):
            merged_bin = binned_points[np.logical_and(binned_points <= i, binned_points >= j)].index
            merged_bins_row.append(merged_bin)
            f_row.append(compute_cond_CE(data, curr, I, merged_bin))
        f.append(f_row)
        merged_bins.append(merged_bins_row)
        b.append([[merged_bins_row[0]]])
        val.append([f_row[0]])
    for l in range(1, BETA):
        for i in range(l, BETA):
            min_cost = None
            arg_min = None
            for j in range(l - 1, i):
                temp_cost = (support[i] - support[j]) / support[i] * f[i][j + 1] + support[j] / support[i] * val[j][
                    l - 1]
                # BUG FIX: the original used `if not min_cost`, which treats a
                # legitimate cost of 0.0 as "no candidate yet" and lets a later
                # (possibly worse) j overwrite arg_min.  Use an explicit None check.
                if min_cost is None or temp_cost < min_cost:
                    min_cost = temp_cost
                    arg_min = j
            # val[i][l]
            val[i].append(min_cost)
            disc = b[arg_min][l - 1].copy()
            disc.append(merged_bins[i][arg_min + 1])
            b[i].append(disc)
    return val[-1], b[-1]
def compute_CEs(data):
    """Return the cumulative entropy of every column of `data`, by column index."""
    return [compute_CE(data[col]) for col in range(data.shape[1])]
def compute_CE(data):
m = data.shape[0]
if m <= 1:
return 0
curr_data = data.sort_values()
data_diff = (curr_data[1:] - curr_data.shift(1)[1:]).reset_index(drop=True)
CE = -math.log(pd.Series([((i + 1) / m) ** ((i + 1) * data_diff[i] / m) for i in range(len(data_diff))]).prod(), 2)
return CE
def entropy(I, N):
    """Shannon entropy (base 2) of partition `I` over `N` total points."""
    acc = 0.0
    for cell in I:
        p = len(cell) / N
        acc -= p * math.log(p, 2)
    return acc
# compute permutation
def compute_permutation(CEs):
    """Return dimension indices ordered by decreasing cumulative entropy."""
    descending = np.argsort(CEs)[::-1]
    return descending.tolist()
def extend_I(I, disc):
    """Refine partition `I` by discretization `disc`, dropping empty cells."""
    refined = []
    for cell in I:
        for bin_ids in disc:
            overlap = cell.intersection(bin_ids)
            if not overlap.empty:
                refined.append(overlap)
    return refined
if __name__ == "__main__":
data = pd.read_csv('data/testdata.csv', delimiter=',', header=None)
classLabels = data.pop(len(data.columns) - 1)
dim_count = data.shape[1]
binning = Binning(data)
# compute CE for all the dimensions
CEs = compute_CEs(data)
perm = compute_permutation(CEs)
# discretized dimensions array of arrays of point ids
I = [data.index]
es = []
uds = 0
prev = perm[0]
for dim in perm[1:]:
# todo should I pass binning?
costs, discs = dim_optimal_disc(prev, dim, binning, I, data)
# regularization step
opt_cost = None
opt_l = None
opt_I = None
for l, cost in enumerate(costs):
temp_I = extend_I(I, discs[l])
temp_cost = cost / CEs[dim] + entropy(temp_I, len(data)) / (
math.log(BETA, 2) + sum([math.log(e + 1, 2) for e in es]))
if not opt_cost or temp_cost < opt_cost:
opt_cost = cost
opt_l = l
opt_I = temp_I
I = opt_I
es.append(opt_l)
uds += CEs[dim] - opt_cost
prev = dim
uds /= sum(CEs[1:])
print(uds)