Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ipd_extended/uds.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
142 lines (108 sloc)
3.92 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import pandas as pd | |
import numpy as np | |
from correlation_measures.binning import Binning | |
# bins count | |
BETA = 20 | |
def compute_cond_CE(data, dim, I, point_ids):
    """Conditional cumulative entropy of dimension `dim` given the partition I.

    Each cell of I is weighted by its share of the whole dataset, and the CE
    is computed over the points of `point_ids` that fall into that cell.
    """
    n_total = data.shape[0]
    acc = 0.0
    for cell in I:
        subset = point_ids.intersection(cell)
        acc += len(cell) / n_total * compute_CE(data.loc[subset, dim])
    return acc
# discretization of the next dimension
def dim_optimal_disc(prev, curr, binning, I, data):
    """Optimal discretization of dimension `curr` based on dimension `prev`.

    Dynamic program over BETA equal-frequency bins of `prev`: merges adjacent
    bins so that, for every target bin count l = 1..BETA, the conditional CE of
    `curr` is minimal.

    Returns (val[-1], b[-1]): for each l, the optimal cost and the list of
    merged bins (each a pandas Index of point ids) for the full range of bins.
    """
    binned_points = binning.equal_frequency_binning2(prev, BETA)

    # Series with cumulative bin support (number of points up to each bin)
    support = binned_points.value_counts().sort_index().cumsum()

    b = []            # b[i][l-1]: optimal discretization of bins 0..i into l bins
    val = []          # val[i][l-1]: cost of that discretization
    merged_bins = []  # merged_bins[i][j]: point ids of the single merged bin [j..i]

    # compute cost of single merged bins (transposed unlike in the paper)
    # i - row, upper bound; j - column, lower bound
    # todo worth considering implementation of the ufunc in C (slow)
    f = []
    # upper bound
    for i in range(BETA):
        f_row = []
        merged_bins_row = []
        # lower bound
        for j in range(i + 1):
            merged_bin = binned_points[np.logical_and(binned_points <= i, binned_points >= j)].index
            merged_bins_row.append(merged_bin)
            f_row.append(compute_cond_CE(data, curr, I, merged_bin))
        f.append(f_row)
        merged_bins.append(merged_bins_row)
        # base case l = 1: a single merged bin spanning [0..i]
        b.append([[merged_bins_row[0]]])
        val.append([f_row[0]])

    for l in range(1, BETA):
        for i in range(l, BETA):
            min_cost = None
            arg_min = None
            for j in range(l - 1, i):
                temp_cost = (support[i] - support[j]) / support[i] * f[i][j + 1] \
                            + support[j] / support[i] * val[j][l - 1]
                # BUG FIX: `if not min_cost` also fired on a legitimate cost of
                # 0.0 and discarded the running minimum; compare against None.
                if min_cost is None or temp_cost < min_cost:
                    min_cost = temp_cost
                    arg_min = j
            # val[i][l]
            val[i].append(min_cost)
            disc = b[arg_min][l - 1].copy()
            disc.append(merged_bins[i][arg_min + 1])
            b[i].append(disc)
    return val[-1], b[-1]
def compute_CEs(data):
    """Cumulative entropy of every dimension (column) of `data`."""
    return [compute_CE(data[dim]) for dim in range(data.shape[1])]
def compute_CE(data):
    """Cumulative entropy of a one-dimensional sample (pandas Series).

    CE = -sum_{i=1}^{m-1} (i * d_i / m) * log2(i / m), where d_i is the gap
    between consecutive order statistics of the sample and m its size.

    Returns 0 for samples of size <= 1 (no gaps to measure).

    BUG FIX: the original computed -log2 of the *product* of the power terms;
    for large m the product underflows to 0.0 and math.log raises ValueError.
    Summing the logs is mathematically identical and numerically safe. Sorting
    a numpy array also sidesteps pandas index alignment in the old
    `curr_data[1:] - curr_data.shift(1)[1:]` expression, which misbehaves on
    duplicate index labels.
    """
    m = data.shape[0]
    if m <= 1:
        return 0
    vals = np.sort(np.asarray(data, dtype=float))
    diffs = np.diff(vals)              # gaps between consecutive order statistics
    ranks = np.arange(1, m)            # i = 1 .. m-1; ranks/m is always in (0, 1)
    return float(-np.sum(ranks * diffs / m * np.log2(ranks / m)))
def entropy(I, N):
    """Shannon entropy (in bits) of the partition I of N points."""
    acc = 0.0
    for cell in I:
        p = len(cell) / N
        acc += p * math.log(p, 2)
    return -acc
# compute permutation
def compute_permutation(CEs):
    """Dimension indices ordered by descending cumulative entropy."""
    return np.argsort(CEs)[::-1].tolist()
def extend_I(I, disc):
    """Refine partition I by intersecting every cell with every bin of `disc`,
    keeping only the non-empty intersections (as pandas Index objects)."""
    refined = []
    for cell in I:
        for new_bin in disc:
            common = cell.intersection(new_bin)
            if not common.empty:
                refined.append(common)
    return refined
if __name__ == "__main__":
    data = pd.read_csv('data/testdata.csv', delimiter=',', header=None)
    # last column is presumably the class label — TODO confirm data layout
    classLabels = data.pop(len(data.columns) - 1)
    dim_count = data.shape[1]
    binning = Binning(data)

    # compute CE for all the dimensions
    CEs = compute_CEs(data)
    perm = compute_permutation(CEs)

    # discretized dimensions: array of arrays of point ids
    I = [data.index]
    es = []  # chosen l (bin index in the DP solution) per processed dimension
    uds = 0
    prev = perm[0]
    for dim in perm[1:]:
        # todo should I pass binning?
        costs, discs = dim_optimal_disc(prev, dim, binning, I, data)

        # regularization step: pick the bin count l minimizing the
        # regularized cost (raw cost normalized by CE plus an entropy penalty)
        opt_cost = None  # raw cost of the best l (used in the uds sum)
        opt_reg = None   # regularized cost of the best l (used for comparison)
        opt_l = None
        opt_I = None
        for l, cost in enumerate(costs):
            temp_I = extend_I(I, discs[l])
            temp_cost = cost / CEs[dim] + entropy(temp_I, len(data)) / (
                math.log(BETA, 2) + sum([math.log(e + 1, 2) for e in es]))
            # BUG FIX: the original compared the regularized temp_cost against
            # the stored RAW cost (`opt_cost = cost`), mixing two scales, and
            # used truthiness so a first cost of 0 was treated as "unset".
            # Track the regularized minimum separately and compare to None.
            if opt_reg is None or temp_cost < opt_reg:
                opt_reg = temp_cost
                opt_cost = cost
                opt_l = l
                opt_I = temp_I
        I = opt_I
        es.append(opt_l)
        uds += CEs[dim] - opt_cost
        prev = dim
    # NOTE(review): normalizes by sum(CEs[1:]) — all dims except index 0;
    # possibly intended to exclude perm[0] instead. Confirm against the paper.
    uds /= sum(CEs[1:])
    print(uds)