import math
import numpy as np
import pandas as pd
import data_generation as dg
from correlation_measures.binning import Binning
from data_generation import correlated_data

# number of initial equal-frequency bins (the beta parameter of UDS)
UDS_BETA = 20


def compute_cond_CE(data, dim, I, point_ids):
    """Conditional cumulative entropy of dimension `dim`, restricted to
    `point_ids`, given the partition I (a list of cells of point ids)."""
    total_points_count = data.shape[0]
    return sum(len(c) / total_points_count * compute_CE(data.loc[point_ids.intersection(c), dim])
               for c in I)
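
# compute_cond_CE is a weighted average: with cells c_1..c_k and n points in
# total, it returns sum_k |c_k|/n * CE of `dim` over point_ids ∩ c_k.
# A minimal sanity check (hypothetical data, not part of the repo): with the
# trivial partition, the conditional CE collapses to the plain CE, e.g.
#
#   df = pd.DataFrame({0: [0.1, 0.4, 0.2, 0.9]})
#   compute_cond_CE(df, 0, [df.index], df.index)  # == compute_CE(df[0])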

# optimal discretization of dimension `curr`, guided by the previously
# discretized dimension `prev` and the current partition I
def dim_optimal_disc(curr, prev, I, data):
    binning = Binning(data, prev, UDS_BETA)
    binned_points = binning.equal_frequency_binning_by_rank_int_categories()

    # Series with cumulative bin support (points up to and including each bin)
    support = binned_points.value_counts().sort_index().cumsum()

    b = []    # b[i][l]: optimal discretization of bins 0..i into l + 1 macro-bins
    val = []  # val[i][l]: cost of b[i][l]
    merged_bins = []

    # cost of every single merged bin (transposed unlike in the paper):
    # i - row, upper bound; j - column, lower bound
    # todo worth considering implementation of the ufunc in C (slow)
    f = []
    # upper bound
    for i in range(UDS_BETA):
        f_row = []
        merged_bins_row = []
        # lower bound
        for j in range(i + 1):
            merged_bin = binned_points[np.logical_and(binned_points <= i, binned_points >= j)].index
            merged_bins_row.append(merged_bin)
            f_row.append(compute_cond_CE(data, curr, I, merged_bin))
        f.append(f_row)
        merged_bins.append(merged_bins_row)

        # base case l = 0: a single macro-bin covering bins 0..i
        b.append([[merged_bins_row[0]]])
        val.append([f_row[0]])

    # dynamic program over the number of macro-bins
    for l in range(1, UDS_BETA):
        for i in range(l, UDS_BETA):
            min_cost = None
            arg_min = None
            for j in range(l - 1, i):
                temp_cost = (support[i] - support[j]) / support[i] * f[i][j + 1] \
                            + support[j] / support[i] * val[j][l - 1]
                if min_cost is None or temp_cost < min_cost:
                    min_cost = temp_cost
                    arg_min = j
            val[i].append(min_cost)

            disc = b[arg_min][l - 1].copy()
            disc.append(merged_bins[i][arg_min + 1])
            b[i].append(disc)
    return val[-1], b[-1]
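
# The DP above implements the recurrence (with cumulative supports S_i = support[i])
#
#   val[i][l] = min_{l-1 <= j < i}  (S_i - S_j) / S_i * f[i][j + 1]  +  S_j / S_i * val[j][l - 1]
#
# so val[-1][l] is the optimal conditional CE when all UDS_BETA bins are split
# into l + 1 macro-bins, and b[-1][l] is the discretization achieving it.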


def compute_CEs(data):
    dim_count = data.shape[1]
    CEs = []
    for curr in range(dim_count):
        CE = compute_CE(data[curr])
        CEs.append(CE)
    return CEs


def compute_CE(data):
    """Empirical cumulative entropy of a single dimension."""
    m = data.shape[0]
    if m <= 1:
        return 0
    curr_data = data.sort_values()
    data_diff = (curr_data[1:] - curr_data.shift(1)[1:]).reset_index(drop=True)
    CE = -math.log(pd.Series([((i + 1) / m) ** ((i + 1) * data_diff[i] / m)
                              for i in range(len(data_diff))]).prod(), 2)
    # a zero CE (constant dimension) is mapped to infinity
    return CE if CE != 0 else float('inf')
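
# Written as a sum, the product above is the plug-in estimate
#
#   CE(X) = - sum_{i=1}^{m-1} (x_(i+1) - x_(i)) * (i / m) * log2(i / m)
#
# over the order statistics x_(1) <= ... <= x_(m), i.e. a discretization of
# -integral F(x) log2 F(x) dx for the empirical CDF F.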


def entropy(I, N):
    """Shannon entropy of a partition I of N points."""
    return -sum(len(i) / N * math.log(len(i) / N, 2) for i in I)


# process dimensions in order of decreasing cumulative entropy
def compute_permutation(CEs):
    argsort = np.argsort(CEs).tolist()
    argsort.reverse()
    return argsort
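
# Example: CEs = [0.5, 2.0, 1.0] yields perm = [1, 2, 0]; the dimension with
# the highest CE comes first and acts as the initial reference dimension.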


def extend_I(I, disc):
    """Refine the partition I by the discretization `disc` (cell-wise intersection)."""
    disc_ = [i.intersection(j) for i in I for j in disc]
    # todo python361
    # return [d for d in disc_ if not d.empty]
    # todo python342
    return [d for d in disc_ if d.size > 0]
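
# Example (hypothetical indices): refining I = [Index([0, 1]), Index([2, 3])]
# with disc = [Index([0, 2]), Index([1, 3])] yields the singleton cells
# [Index([0]), Index([1]), Index([2]), Index([3])]; empty cells are dropped.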


def compute_uds(data):
    data = data.rename(columns={data.columns[i]: i for i in range(len(data.columns))})

    # compute CE for all the dimensions
    CEs = compute_CEs(data)
    perm = compute_permutation(CEs)

    # current partition: list of cells, each an array of point ids
    I = [data.index]
    es = []  # per dimension, the chosen number of macro-bins minus one
    uds = 0
    prev = perm[0]
    for dim in perm[1:]:
        scores, discs = dim_optimal_disc(dim, prev, I, data)

        # regularization step: trade conditional CE against partition complexity
        opt_cost = None
        opt_l = None
        opt_I = None
        opt_score = None
        for l, score in enumerate(scores):
            temp_I = extend_I(I, discs[l])
            temp_cost = score / CEs[dim] + entropy(temp_I, len(data)) / (
                # todo old
                math.log(UDS_BETA, 2) + sum([math.log(e + 1, 2) for e in es]))
            # math.log(l + 1, 2) + sum([math.log(e + 1, 2) for e in es]))
            # if math.log(l + 1, 2) + sum([math.log(e + 1, 2) for e in es]) != 0 else 0)
            if opt_cost is None or temp_cost < opt_cost:
                opt_cost = temp_cost
                opt_score = score
                opt_l = l
                opt_I = temp_I

        I = opt_I
        # print('dimension', dim)
        # print('score', opt_score)
        # print('CE', CEs[dim])
        es.append(opt_l)
        uds += CEs[dim] - opt_score
        prev = dim
    # normalize by the total CE of the discretized (non-reference) dimensions
    uds /= sum(CEs[dim] for dim in perm[1:])
    return uds
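
# Each summand CEs[dim] - opt_score is the reduction in cumulative entropy that
# the chosen discretization of the preceding dimensions yields for `dim`; UDS
# aggregates these gains over all non-reference dimensions and normalizes by
# their total CE, with larger values indicating stronger dependence.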
if __name__ == "__main__":
data = pd.read_csv('synthetic_cases/synthetic_3d_parity_problem.csv', delimiter=';', usecols=[0,1,2], header=None) #0.0716387590375
# data = pd.DataFrame(correlated_data(4000, 20, 10, dg.func2))
# data = pd.DataFrame(generate_correlated_data(1000, 10, 2, func1))
# data = pd.DataFrame(generate_uncorrelated_data(4000, 20))
# classLabels = data.pop(len(data.columns) - 1)
uds = compute_uds(data)
print(uds)
# print(es)