Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
uds - init commit (always returns zero for now... :( )
- Loading branch information
Tatiana Dembelova
committed
May 25, 2017
1 parent
eebe8d8
commit 92e72c3
Showing
7 changed files
with
2,247 additions
and
10 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
import pandas as pd | ||
|
||
|
||
class Binning:
    """Equal-frequency binning over the per-column ranks of a DataFrame."""

    def __init__(self, data):
        # method='first' breaks ties by order of appearance, so the ranks
        # within a column are all distinct.
        self.rank_data = data.rank(method='first')

    # todo (small reminder) in the original ipd it is NOT equal binning
    def equal_frequency_binning(self, dim, bins_count):
        """Cut the ranks of column `dim` into `bins_count` quantile bins.

        :param dim: label of the column to bin
        :param bins_count: number of bins requested from `pd.qcut`
        :return: categorical Series of interval bins, ordered by ascending
            rank of `dim` and indexed by the original row labels
        """
        # Binning the ranks instead of the raw values keeps the bins equally
        # frequent even when the raw column contains duplicates.
        ranks = self.rank_data.sort_values(by=dim)[dim]
        return pd.qcut(ranks, bins_count)

    def equal_frequency_binning2(self, dim, bins_count):
        """Same as `equal_frequency_binning`, with bins labelled 0..bins_count-1.

        :param dim: label of the column to bin
        :param bins_count: number of bins requested from `pd.qcut`
        :return: categorical Series whose categories are the integers
            `0 .. bins_count - 1` instead of intervals
        """
        binned = self.equal_frequency_binning(dim, bins_count)
        return binned.cat.rename_categories(list(range(bins_count)))

    def get_rank_data(self):
        """Return the DataFrame of ranks computed in the constructor."""
        return self.rank_data
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
0.1,0.4,0.9,0.5,0 | ||
0.2,0.4,0.8,0.5,0 | ||
0.3,0.4,0.7,0.5,0 | ||
0.4,0.8,0.1,0.5,0 | ||
0.5,0.8,0.1,0.5,0 | ||
0.6,0.8,0.1,0.6,0 | ||
0.1,0.4,0.9,0.5,0 | ||
0.2,0.4,0.8,0.5,0 | ||
0.3,0.4,0.7,0.5,0 | ||
0.4,0.8,0.1,0.5,0 | ||
0.5,0.8,0.1,0.5,0 | ||
0.6,0.8,0.1,0.6,0 | ||
0.1,0.4,0.9,0.5,0 | ||
0.2,0.4,0.8,0.5,0 | ||
0.3,0.4,0.7,0.5,0 | ||
0.4,0.8,0.1,0.5,0 | ||
0.5,0.8,0.1,0.5,0 | ||
0.6,0.8,0.1,0.6,0 | ||
0.1,0.4,0.9,0.5,0 | ||
0.2,0.4,0.8,0.5,0 | ||
0.3,0.4,0.7,0.5,0 | ||
0.4,0.8,0.1,0.5,0 | ||
0.5,0.8,0.1,0.5,0 | ||
0.6,0.8,0.1,0.6,0 | ||
0.1,0.4,0.9,0.5,0 | ||
0.2,0.4,0.8,0.5,0 | ||
0.3,0.4,0.7,0.5,0 | ||
0.4,0.8,0.1,0.5,0 | ||
0.5,0.8,0.1,0.5,0 | ||
0.6,0.8,0.1,0.6,0 | ||
0.1,0.4,0.9,0.5,0 | ||
0.2,0.4,0.8,0.5,0 | ||
0.3,0.4,0.7,0.5,0 | ||
0.4,0.8,0.1,0.5,0 | ||
0.5,0.8,0.1,0.5,0 | ||
0.6,0.8,0.1,0.6,0 | ||
0.1,0.4,0.9,0.5,0 | ||
0.2,0.4,0.8,0.5,0 | ||
0.3,0.4,0.7,0.5,0 | ||
0.4,0.8,0.1,0.5,0 | ||
0.5,0.8,0.1,0.5,0 | ||
0.6,0.8,0.1,0.6,0 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 | ||
0.2,0.2,0.2,0.2,1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
from unittest import TestCase | ||
import uds as u | ||
import pandas as pd | ||
|
||
|
||
class TestUds(TestCase):
    """Checks for the `uds` module, run against data/testdata.csv."""

    def setUp(self):
        # The fixture has no header row, so the columns are labelled 0..n-1.
        self.data = pd.read_csv('data/testdata.csv', delimiter=',', header=None)

    def test_compute_CEs(self):
        c_es = u.compute_CEs(self.data)
        # One cumulative entropy per column of the input.
        self.assertEqual(len(c_es), self.data.shape[1])
        # Every term of the CE sum is non-negative, so the total must be too.
        for ce in c_es:
            self.assertGreaterEqual(ce, 0)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
import math | ||
import pandas as pd | ||
import numpy as np | ||
from correlation_measures.binning import Binning | ||
|
||
# Number of equal-frequency bins a dimension is cut into before merging.
BETA = 20
|
||
|
||
def compute_cond_CE(data, dim, I, point_ids):
    """Weighted sum of the CE of column `dim` over the cells of `I`.

    :param data: DataFrame holding the points
    :param dim: label of the column whose CE is computed
    :param I: iterable of point-id indexes (one per cell)
    :param point_ids: index of the points to restrict every cell to
    :return: sum over cells `c` of `len(c) / len(data)` times the CE of
        `dim` on the points that are in both `point_ids` and `c`
    """
    total_points_count = data.shape[0]
    return sum(
        len(c) / total_points_count
        * compute_CE(data.loc[point_ids.intersection(c), dim])
        for c in I
    )
|
||
|
||
# discretization of the next dimension | ||
def dim_optimal_disc(prev, curr, binning, I, data):
    """Dynamic-programming search for discretizations of dimension `prev`.

    `prev` is first cut into BETA equal-frequency micro bins. Contiguous
    micro bins are then merged so that the cost reported by
    `compute_cond_CE` for dimension `curr` is minimal for every possible
    number of merged bins.

    :param prev: label of the column being discretized
    :param curr: label of the column whose conditional CE is the cost
    :param binning: object providing `equal_frequency_binning2(dim, bins_count)`
    :param I: list of point-id indexes, passed through to `compute_cond_CE`
    :param data: DataFrame, passed through to `compute_cond_CE`
    :return: `(costs, discs)`; `costs[l]` is the minimal cost of covering
        all micro bins with `l + 1` merged bins and `discs[l]` is the
        matching list of point-id indexes, one per merged bin
    """
    binned_points = binning.equal_frequency_binning2(prev, BETA)

    # Series with bins support (cumulative point count up to each micro bin)
    support = binned_points.value_counts().sort_index().cumsum()

    # b[i][l]   - best discretization of micro bins 0..i into l + 1 merged bins
    # val[i][l] - cost of that discretization
    b = []
    val = []

    merged_bins = []
    # compute cost of single merged bins (transposed unlike in the paper)
    # i - row, upper bound; j - column, lower bound
    # todo worth considering implementation of the ufunc in C (slow)
    f = []
    # upper bound
    for i in range(BETA):
        f_row = []
        merged_bins_row = []
        # lower bound
        for j in range(i + 1):
            in_range = np.logical_and(binned_points <= i, binned_points >= j)
            merged_bin = binned_points[in_range].index
            merged_bins_row.append(merged_bin)

            f_row.append(compute_cond_CE(data, curr, I, merged_bin))
        f.append(f_row)
        merged_bins.append(merged_bins_row)

        # Base case: micro bins 0..i merged into a single bin.
        b.append([[merged_bins_row[0]]])
        val.append([f_row[0]])

    for l in range(1, BETA):
        for i in range(l, BETA):
            min_cost = None
            arg_min = None
            for j in range(l - 1, i):
                # Last merged bin covers micro bins j+1..i; the rest reuses
                # the best split of 0..j into l bins, weighted by support.
                temp_cost = ((support[i] - support[j]) / support[i] * f[i][j + 1]
                             + support[j] / support[i] * val[j][l - 1])
                # `is None`, not truthiness: a genuine cost of 0 must not be
                # mistaken for "no candidate yet" and then overwritten.
                if min_cost is None or temp_cost < min_cost:
                    min_cost = temp_cost
                    arg_min = j

            # val[i][l]
            val[i].append(min_cost)
            disc = b[arg_min][l - 1].copy()
            disc.append(merged_bins[i][arg_min + 1])
            b[i].append(disc)

    return val[-1], b[-1]
|
||
|
||
def compute_CEs(data):
    """Compute the cumulative entropy of every column of `data`.

    Columns are looked up by the integer labels `0 .. data.shape[1] - 1`.

    :param data: DataFrame with integer column labels
    :return: list with one `compute_CE` result per column, in column order
    """
    return [compute_CE(data[curr]) for curr in range(data.shape[1])]
|
||
|
||
def compute_CE(data):
    """Cumulative entropy (base 2) of a one-dimensional sample.

    With the values sorted as x_1 <= ... <= x_m this evaluates
    `-sum_{i=1}^{m-1} (x_{i+1} - x_i) * (i / m) * log2(i / m)`.

    :param data: pandas Series of numeric values
    :return: the cumulative entropy as a float, or 0 when the sample has
        at most one point
    """
    m = data.shape[0]
    if m <= 1:
        return 0
    ordered = np.asarray(data.sort_values(), dtype=float)
    gaps = np.diff(ordered)            # x_{i+1} - x_i
    fractions = np.arange(1, m) / m    # i / m for i = 1 .. m-1
    # Sum in log space: the equivalent product of powers underflows to 0.0
    # for widely spread data, and log(0) would then raise.
    # nansum skips NaN terms, as a NaN-skipping product would.
    return float(-np.nansum(gaps * fractions * np.log2(fractions)))
|
||
|
||
def entropy(I, N):
    """Shannon entropy (base 2) of a partition.

    :param I: iterable of cells; a cell's probability is `len(cell) / N`
    :param N: total number of points
    :return: `-sum(p * log2(p))` over the cells; empty cells contribute 0
    """
    probabilities = (len(cell) / N for cell in I)
    # Skip p == 0 (0 * log 0 is taken as 0) instead of failing in math.log.
    return -sum(p * math.log(p, 2) for p in probabilities if p > 0)
|
||
|
||
# compute permutation | ||
def compute_permutation(CEs):
    """Order the dimensions by `np.argsort` of their CE, reversed.

    :param CEs: sequence of per-dimension cumulative entropies
    :return: list of positions into `CEs`, the `np.argsort` order reversed
    """
    return np.argsort(CEs)[::-1].tolist()
|
||
|
||
def extend_I(I, disc):
    """Refine the partition `I` by the partition `disc`.

    :param I: list of point-id indexes (current cells)
    :param disc: list of point-id indexes (bins of the new discretization)
    :return: the non-empty pairwise intersections, ordered by cell of `I`
        first and by bin of `disc` second
    """
    refined = []
    for cell in I:
        for bin_ids in disc:
            common = cell.intersection(bin_ids)
            if not common.empty:
                refined.append(common)
    return refined
|
||
|
||
if __name__ == "__main__":
    data = pd.read_csv('data/testdata.csv', delimiter=',', header=None)
    # pop() removes the last column (class labels) from `data` in place.
    classLabels = data.pop(len(data.columns) - 1)

    binning = Binning(data)

    # compute CE for all the dimensions
    CEs = compute_CEs(data)

    perm = compute_permutation(CEs)

    # discretized dimensions array of arrays of point ids
    I = [data.index]
    es = []

    uds = 0
    prev = perm[0]
    for dim in perm[1:]:
        # todo should I pass binning?
        costs, discs = dim_optimal_disc(prev, dim, binning, I, data)

        # regularization step
        # `es` does not change while the candidates are scanned, so the
        # entropy normalizer is computed once per dimension.
        entropy_norm = math.log(BETA, 2) + sum(math.log(e + 1, 2) for e in es)
        opt_score = None
        opt_cost = None
        opt_l = None
        opt_I = None
        for l, cost in enumerate(costs):
            temp_I = extend_I(I, discs[l])
            score = cost / CEs[dim] + entropy(temp_I, len(data)) / entropy_norm
            # Candidates are ranked by the regularized score; the raw cost of
            # the winner is kept separately because it feeds the UDS sum.
            if opt_score is None or score < opt_score:
                opt_score = score
                opt_cost = cost
                opt_l = l
                opt_I = temp_I

        I = opt_I
        es.append(opt_l)
        uds += CEs[dim] - opt_cost
        prev = dim
    # Normalize by the CE of exactly the dimensions summed above; CEs is
    # indexed by dimension, so CEs[1:] would only match when perm[0] == 0.
    uds /= sum(CEs[dim] for dim in perm[1:])

    print(uds)