Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ipd_extended/uds.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
180 lines (145 sloc)
5.88 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import numpy as np | |
import pandas as pd | |
import data_generation as dg | |
from correlation_measures.binning import Binning | |
from data_generation import correlated_data | |
import experiments_logging as log | |
# bins count | |
UDS_BETA = 20 | |
def compute_cond_CE(data, dim, I, point_ids):
    """Weighted conditional cumulative entropy of column `dim` given partition I.

    Each cell of I contributes its population share (|cell| / total points)
    times the CE of the points of `dim` lying in both the cell and `point_ids`.
    """
    n = data.shape[0]
    acc = 0
    for cell in I:
        members = point_ids.intersection(cell)
        acc += len(cell) / n * compute_CE(data.loc[members, dim])
    return acc
# discretization of the next dimension
def dim_optimal_disc(curr, prev, I, data):
    """Dynamic-programming search for the optimal merging of UDS_BETA
    equal-frequency bins of dimension `prev`, minimising the conditional
    CE of dimension `curr` given the current partition I.

    Returns a pair (costs, discretizations): for each cut count l,
    costs[l] is the optimal cost over all UDS_BETA bins and
    discretizations[l] is the corresponding list of merged bins
    (each a pandas Index of point ids).
    """
    binning = Binning(data, prev, UDS_BETA)
    binned_points = binning.equal_frequency_binning_by_rank_int_categories()
    # Series with cumulative bin support: support[i] = #points in bins 0..i
    support = binned_points.value_counts().sort_index().cumsum()

    b = []            # b[i][l]: best discretization of bins 0..i with l cuts
    val = []          # val[i][l]: its cost
    merged_bins = []  # merged_bins[i][j]: point ids of bins j..i merged

    # compute cost of every single merged bin (transposed unlike in the
    # paper): i - row, upper bound; j - column, lower bound
    # todo worth considering implementation of the ufunc in C (slow)
    f = []
    for i in range(UDS_BETA):          # upper bound
        f_row = []
        merged_bins_row = []
        for j in range(i + 1):         # lower bound
            merged_bin = binned_points[np.logical_and(binned_points <= i,
                                                      binned_points >= j)].index
            merged_bins_row.append(merged_bin)
            f_row.append(compute_cond_CE(data, curr, I, merged_bin))
        f.append(f_row)
        merged_bins.append(merged_bins_row)
        # base case l = 0: a single merged bin covering bins 0..i
        b.append([[merged_bins_row[0]]])
        val.append([f_row[0]])

    # DP over the number of cuts l; i is the upper bin considered
    for l in range(1, UDS_BETA):
        for i in range(l, UDS_BETA):
            min_cost = None
            arg_min = None
            for j in range(l - 1, i):
                temp_cost = (support[i] - support[j]) / support[i] * f[i][j + 1] \
                            + support[j] / support[i] * val[j][l - 1]
                # BUG FIX: the original `if not min_cost` treated a legitimate
                # cost of 0.0 as "not set yet" and kept overwriting it; compare
                # explicitly against None so a zero cost wins as it should.
                if min_cost is None or temp_cost < min_cost:
                    min_cost = temp_cost
                    arg_min = j
            val[i].append(min_cost)
            disc = b[arg_min][l - 1].copy()
            disc.append(merged_bins[i][arg_min + 1])
            b[i].append(disc)
    return val[-1], b[-1]
def compute_CEs(data):
    """Cumulative entropy of every column of `data`, as a list
    indexed by column position."""
    return [compute_CE(data[col]) for col in range(data.shape[1])]
def compute_CE(data):
    """Cumulative entropy (CE) of a one-dimensional pandas Series.

    Returns 0 for a series of at most one point. When the computed CE is
    exactly 0 (all values identical), returns +inf — preserving the
    original sentinel convention.
    """
    n = data.shape[0]
    if n <= 1:
        return 0
    ordered = data.sort_values()
    # gaps between consecutive order statistics
    gaps = (ordered[1:] - ordered.shift(1)[1:]).reset_index(drop=True)
    factors = pd.Series([((k + 1) / n) ** ((k + 1) * gaps[k] / n)
                         for k in range(len(gaps))])
    ce = -math.log(factors.prod(), 2)
    return ce if ce != 0 else float('inf')
def entropy(I, N):
    """Shannon entropy (base 2) of partition I over N total points."""
    h = 0
    for cell in I:
        p = len(cell) / N
        h -= p * math.log(p, 2)
    return h
# compute permutation
def compute_permutation(CEs):
    """Dimension indices ordered by decreasing CE."""
    return np.argsort(CEs)[::-1].tolist()
def extend_I(I, disc):
    """Refine partition I by intersecting every cell with every bin in
    `disc`, dropping empty intersections."""
    refined = []
    for cell in I:
        for bin_ids in disc:
            common = cell.intersection(bin_ids)
            # todo python361: `common.empty`; todo python342: size check
            if not common.size == 0:
                refined.append(common)
    return refined
def compute_uds(data):
    """Compute the UDS correlation score of a DataFrame.

    Columns are processed in order of decreasing cumulative entropy (CE);
    for each subsequent dimension the optimal discretization w.r.t. the
    previous one is found, a regularized bin count is chosen, and the CE
    reduction is accumulated. Returns the normalized score.
    """
    # relabel columns to positional ints so dimensions are addressed by index
    data = data.rename(columns={data.columns[i]: i for i in range(len(data.columns))})
    # compute CE for all the dimensions
    CEs = compute_CEs(data)
    perm = compute_permutation(CEs)
    # current partition: list of pandas Index objects (starts as one cell)
    I = [data.index]
    es = []  # chosen cut count l for each processed dimension
    uds = 0
    prev = perm[0]
    for dim in perm[1:]:
        scores, discs = dim_optimal_disc(dim, prev, I, data)
        # regularization step: pick the cut count l minimizing the
        # normalized score plus an entropy penalty
        opt_cost = None
        opt_l = None
        opt_I = None
        opt_score = None
        for l, score in enumerate(scores):
            temp_I = extend_I(I, discs[l])
            penalty = entropy(temp_I, len(data)) / (
                math.log(UDS_BETA, 2) + sum(math.log(e + 1, 2) for e in es))
            temp_cost = score / CEs[dim] + penalty
            # BUG FIX: `if not opt_cost` treated a legitimate cost of 0.0 as
            # "not set" and kept replacing it; compare against None instead.
            if opt_cost is None or temp_cost < opt_cost:
                opt_cost = temp_cost
                opt_score = score
                opt_l = l
                opt_I = temp_I
        I = opt_I
        es.append(opt_l)
        uds += CEs[dim] - opt_score
        prev = dim
    # NOTE(review): normalization uses CEs[1:] in ORIGINAL column order, not
    # the CEs of perm[1:]; looks inconsistent with the permuted loop above —
    # preserved as-is, confirm against the UDS paper.
    uds /= sum(CEs[1:])
    return uds
# Smoke-test driver: generate synthetic correlated data, score it with UDS,
# and plot it. Depends on project modules `data_generation` (dg) and
# `experiments_logging` (log); commented lines are alternative datasets.
if __name__ == "__main__":
    # data = pd.read_csv('synthetic_cases/uds_new.csv', delimiter=',', header=None) # uds_new.csv 0.361766479055
    data = pd.DataFrame(dg.correlated_data(4000, 2, 0.1, dg.func2))
    # data = pd.read_csv('new_cubes/cubes_02_03_c.csv', delimiter=';', header=None)
    # keep only the first four columns (positional slice, label-based .loc)
    data = data.loc[:, :3]
    # data = pd.DataFrame(correlated_data(4000, 20, 10, dg.func2))
    # data = pd.DataFrame(generate_correlated_data(1000, 10, 2, func1))
    # data = pd.DataFrame(generate_uncorrelated_data(4000, 20))
    # classLabels = data.pop(len(data.columns) - 1)
    # NOTE(review): result is computed but only plotted, not printed —
    # presumably the print was disabled during experiments.
    uds = compute_uds(data)
    # print(uds)
    log.plot_data_2d(data)
    # print(es)
# print(es) | |
# compute | |
# for prev dim 2 with dim 1 | |
# optimal | |
# disc[[-1.6501946015802218, 5.6783209693758776], [5.6809485788581906, 16.952503563781992], [17.021263419339629, | |
# 91.038762481183142]] | |
# compute | |
# for prev dim 1 with dim 0 | |
# optimal | |
# disc[[-1.5583644544233035, 3.9951064626841521], [4.001463149284902, 7.5090488371807727], [7.5256579015119875, | |
# 81.314708533537967]] | |
# 0.361758255598 |