# NOTE: this file was recovered from a GitHub page export.
# Original commit: 92e72c3bf7 — 142 lines (108 sloc), 3.92 KB.
import math
import pandas as pd
import numpy as np
from correlation_measures.binning import Binning
# bins count
BETA = 20
def compute_cond_CE(data, dim, I, point_ids):
    """Conditional cumulative entropy of dimension `dim` given partition `I`.

    Each cell of `I` contributes its population share times the CE of
    `dim` restricted to the points of the cell that are also in `point_ids`.
    """
    n = data.shape[0]
    total = 0
    for cell in I:
        weight = len(cell) / n
        total += weight * compute_CE(data.loc[point_ids.intersection(cell), dim])
    return total
# discretization of the next dimension
def dim_optimal_disc(prev, curr, binning, I, data):
    """Optimal discretization of dimension `curr` conditioned on `prev`.

    Dynamic program over merged equal-frequency bins (BETA initial bins):
    f[i][j] is the conditional CE cost of the single merged bin [j, i];
    val[i][l] / b[i][l] are the optimal cost / bin list for discretizing
    bins 0..i into l+1 parts.  Returns (val of last row, b of last row),
    i.e. the optimal costs and discretizations for every target bin count.
    """
    binned_points = binning.equal_frequency_binning2(prev, BETA)
    # Series with bins support (cumulative point counts per bin)
    support = binned_points.value_counts().sort_index().cumsum()
    b = []
    val = []
    merged_bins = []
    # cost of every single merged bin (transposed unlike in the paper):
    # i - row, upper bound; j - column, lower bound
    # todo worth considering implementation of the ufunc in C (slow)
    f = []
    # upper bound
    for i in range(BETA):
        f_row = []
        merged_bins_row = []
        # lower bound
        for j in range(i + 1):
            merged_bin = binned_points[np.logical_and(binned_points <= i, binned_points >= j)].index
            merged_bins_row.append(merged_bin)
            f_row.append(compute_cond_CE(data, curr, I, merged_bin))
        f.append(f_row)
        merged_bins.append(merged_bins_row)
        b.append([[merged_bins_row[0]]])
        val.append([f_row[0]])
    for l in range(1, BETA):
        for i in range(l, BETA):
            min_cost = None
            arg_min = None
            for j in range(l - 1, i):
                temp_cost = (support[i] - support[j]) / support[i] * f[i][j + 1] + support[j] / support[i] * val[j][
                    l - 1]
                # BUG FIX: the original used `if not min_cost`, which treats a
                # legitimate cost of 0.0 as "no candidate yet" and lets a later
                # (possibly worse) j overwrite arg_min.  Use an explicit None check.
                if min_cost is None or temp_cost < min_cost:
                    min_cost = temp_cost
                    arg_min = j
            # val[i][l]
            val[i].append(min_cost)
            disc = b[arg_min][l - 1].copy()
            disc.append(merged_bins[i][arg_min + 1])
            b[i].append(disc)
    return val[-1], b[-1]
def compute_CEs(data):
    """Return the cumulative entropy of every column of `data`, by column index."""
    return [compute_CE(data[col]) for col in range(data.shape[1])]
def compute_CE(data):
m = data.shape[0]
if m <= 1:
return 0
curr_data = data.sort_values()
data_diff = (curr_data[1:] - curr_data.shift(1)[1:]).reset_index(drop=True)
CE = -math.log(pd.Series([((i + 1) / m) ** ((i + 1) * data_diff[i] / m) for i in range(len(data_diff))]).prod(), 2)
return CE
def entropy(I, N):
    """Shannon entropy (base 2) of partition `I` over `N` total points."""
    acc = 0.0
    for cell in I:
        p = len(cell) / N
        acc -= p * math.log(p, 2)
    return acc
# compute permutation
def compute_permutation(CEs):
    """Return dimension indices ordered by decreasing cumulative entropy."""
    descending = np.argsort(CEs)[::-1]
    return descending.tolist()
def extend_I(I, disc):
    """Refine partition `I` by discretization `disc`, dropping empty cells."""
    refined = []
    for cell in I:
        for bin_ids in disc:
            overlap = cell.intersection(bin_ids)
            if not overlap.empty:
                refined.append(overlap)
    return refined
if __name__ == "__main__":
data = pd.read_csv('data/testdata.csv', delimiter=',', header=None)
classLabels = data.pop(len(data.columns) - 1)
dim_count = data.shape[1]
binning = Binning(data)
# compute CE for all the dimensions
CEs = compute_CEs(data)
perm = compute_permutation(CEs)
# discretized dimensions array of arrays of point ids
I = [data.index]
es = []
uds = 0
prev = perm[0]
for dim in perm[1:]:
# todo should I pass binning?
costs, discs = dim_optimal_disc(prev, dim, binning, I, data)
# regularization step
opt_cost = None
opt_l = None
opt_I = None
for l, cost in enumerate(costs):
temp_I = extend_I(I, discs[l])
temp_cost = cost / CEs[dim] + entropy(temp_I, len(data)) / (
math.log(BETA, 2) + sum([math.log(e + 1, 2) for e in es]))
if not opt_cost or temp_cost < opt_cost:
opt_cost = cost
opt_l = l
opt_I = temp_I
I = opt_I
es.append(opt_l)
uds += CEs[dim] - opt_cost
prev = dim
uds /= sum(CEs[1:])
print(uds)