# NOTE(review): removed GitHub web-page scrape residue ("Skip to content",
# branch/tag switcher text, "Cannot retrieve contributors at this time") that
# was captured along with the file and is not part of the source.
import math
import numpy as np
import pandas as pd
import data_generation as dg
from correlation_measures.binning import Binning
from data_generation import correlated_data
import experiments_logging as log
# bins count
UDS_BETA = 20
def compute_cond_CE(data, dim, I, point_ids):
    """Conditional cumulative entropy of dimension `dim` given the partition `I`.

    Each cell of `I` contributes the CE of `dim` restricted to the points of
    that cell (further intersected with `point_ids`), weighted by the cell's
    share of the full data set.
    """
    total = data.shape[0]
    weighted_terms = [
        len(cell) / total * compute_CE(data.loc[point_ids.intersection(cell), dim])
        for cell in I
    ]
    return sum(weighted_terms)
# discretization of the next dimension
def dim_optimal_disc(curr, prev, I, data):
    """Optimal discretization of dimension `prev` w.r.t. dimension `curr`.

    Dynamic program over merged equal-frequency bins: for every target bin
    count l it finds the merging of the UDS_BETA base bins of `prev` that
    minimizes the conditional CE of `curr` given the current partition `I`.

    Returns (val[-1], b[-1]): the optimal costs and the corresponding
    discretizations (lists of point-id Index objects) for every l.
    """
    binning = Binning(data, prev, UDS_BETA)
    binned_points = binning.equal_frequency_binning_by_rank_int_categories()
    # Series with cumulative bin support (number of points up to each bin).
    support = binned_points.value_counts().sort_index().cumsum()
    b = []
    val = []
    merged_bins = []
    # Cost of every single merged bin (transposed w.r.t. the paper):
    # i - row, upper bound; j - column, lower bound.
    # todo worth considering implementation of the ufunc in C (slow)
    f = []
    # upper bound
    for i in range(UDS_BETA):
        f_row = []
        merged_bins_row = []
        # lower bound
        for j in range(i + 1):
            merged_bin = binned_points[np.logical_and(binned_points <= i, binned_points >= j)].index
            merged_bins_row.append(merged_bin)
            f_row.append(compute_cond_CE(data, curr, I, merged_bin))
        f.append(f_row)
        merged_bins.append(merged_bins_row)
        b.append([[merged_bins_row[0]]])
        val.append([f_row[0]])
    # DP over the number of bins l and the upper bound i of the last bin.
    for l in range(1, UDS_BETA):
        for i in range(l, UDS_BETA):
            min_cost = None
            arg_min = None
            for j in range(l - 1, i):
                temp_cost = (support[i] - support[j]) / support[i] * f[i][j + 1] \
                    + support[j] / support[i] * val[j][l - 1]
                # BUGFIX: compare against None explicitly — `not min_cost` also
                # fired on a legitimate cost of 0.0 and discarded the optimum.
                if min_cost is None or temp_cost < min_cost:
                    min_cost = temp_cost
                    arg_min = j
            val[i].append(min_cost)
            disc = b[arg_min][l - 1].copy()
            disc.append(merged_bins[i][arg_min + 1])
            b[i].append(disc)
    return val[-1], b[-1]
def compute_CEs(data):
    """Return the cumulative entropy of each column of `data`, in column order."""
    return [compute_CE(data[col]) for col in range(data.shape[1])]
def compute_CE(data):
    """Cumulative entropy of a numeric Series.

    Sorts the values, takes consecutive gaps, and folds them into
    -log2(prod_i ((i+1)/m) ** ((i+1) * gap_i / m)).
    Returns 0 for fewer than two points; a CE of exactly 0 is mapped to +inf.
    """
    m = data.shape[0]
    if m <= 1:
        return 0
    ordered = data.sort_values()
    # Gaps between consecutive order statistics (positional, index-free).
    gaps = (ordered[1:] - ordered.shift(1)[1:]).reset_index(drop=True)
    factors = pd.Series(
        [((k + 1) / m) ** ((k + 1) * gaps[k] / m) for k in range(len(gaps))]
    )
    ce = -math.log(factors.prod(), 2)
    return float('inf') if ce == 0 else ce
def entropy(I, N):
    """Shannon entropy (base 2) of the partition `I` over N points."""
    total = 0.0
    for cell in I:
        p = len(cell) / N
        total += p * math.log(p, 2)
    return -total
# compute permutation
def compute_permutation(CEs):
    """Return dimension indices ordered by decreasing cumulative entropy."""
    return np.argsort(CEs)[::-1].tolist()
def extend_I(I, disc):
    """Refine partition `I` by intersecting every cell with every bin of `disc`.

    Empty intersections are dropped.
    """
    refined = []
    for cell in I:
        for bin_ids in disc:
            common = cell.intersection(bin_ids)
            # todo python361
            # if not common.empty:
            # todo python342
            if common.size > 0:
                refined.append(common)
    return refined
def compute_uds(data):
    """Compute the UDS score of a DataFrame.

    Processes dimensions in order of decreasing cumulative entropy; for each
    dimension it picks the discretization of the previously processed
    dimension that minimizes a regularized conditional-CE cost, accumulates
    the CE reduction, and normalizes by the total CE of the non-first
    dimensions.
    """
    # Relabel columns with positional integers so integer lookups below work.
    data = data.rename(columns={data.columns[i]: i for i in range(len(data.columns))})
    # compute CE for all the dimensions
    CEs = compute_CEs(data)
    perm = compute_permutation(CEs)
    # Current partition: list of point-id Index cells (starts with all points).
    I = [data.index]
    es = []  # chosen discretization levels l, one per processed dimension
    uds = 0
    prev = perm[0]
    for dim in perm[1:]:
        scores, discs = dim_optimal_disc(dim, prev, I, data)
        # Regularization step: pick the level l minimizing the penalized cost.
        opt_cost = None
        opt_l = None
        opt_I = None
        opt_score = None
        for l, score in enumerate(scores):
            temp_I = extend_I(I, discs[l])
            temp_cost = score / CEs[dim] + (entropy(temp_I, len(data)) / (
                math.log(UDS_BETA, 2) + sum([math.log(e + 1, 2) for e in es])))
            # BUGFIX: compare against None explicitly — `not opt_cost` also
            # fired on a legitimate cost of 0.0 and discarded the optimum.
            if opt_cost is None or temp_cost < opt_cost:
                opt_cost = temp_cost
                opt_score = score
                opt_l = l
                opt_I = temp_I
        I = opt_I
        es.append(opt_l)
        uds += CEs[dim] - opt_score
        prev = dim
    uds /= sum(CEs[1:])
    return uds
if __name__ == "__main__":
    # Demo entry point: build a synthetic correlated data set, compute its UDS
    # score and plot the data.  Alternative inputs are kept commented out below.
    # data = pd.read_csv('synthetic_cases/uds_new.csv', delimiter=',', header=None) # uds_new.csv 0.361766479055
    data = pd.DataFrame(dg.correlated_data(4000, 2, 0.1, dg.func2))
    # data = pd.read_csv('new_cubes/cubes_02_03_c.csv', delimiter=';', header=None)
    # Keep at most the first four columns.
    data = data.loc[:, :3]
    # data = pd.DataFrame(correlated_data(4000, 20, 10, dg.func2))
    # data = pd.DataFrame(generate_correlated_data(1000, 10, 2, func1))
    # data = pd.DataFrame(generate_uncorrelated_data(4000, 20))
    # classLabels = data.pop(len(data.columns) - 1)
    uds = compute_uds(data)
    # print(uds)
    log.plot_data_2d(data)
    # print(es)
    # Reference output from a previous run, kept for comparison:
    # compute
    # for prev dim 2 with dim 1
    # optimal
    # disc[[-1.6501946015802218, 5.6783209693758776], [5.6809485788581906, 16.952503563781992], [17.021263419339629,
    # 91.038762481183142]]
    # compute
    # for prev dim 1 with dim 0
    # optimal
    # disc[[-1.5583644544233035, 3.9951064626841521], [4.001463149284902, 7.5090488371807727], [7.5256579015119875,
    # 81.314708533537967]]
    # 0.361758255598