Commit
uds - init commit (always returns zero for now... :( )
Tatiana Dembelova committed May 25, 2017
1 parent eebe8d8 commit 92e72c3
Showing 7 changed files with 2,247 additions and 10 deletions.
Empty file.
18 changes: 18 additions & 0 deletions correlation_measures/binning.py
@@ -0,0 +1,18 @@
import pandas as pd


class Binning:
    def __init__(self, data):
        self.rank_data = data.rank(method='first')

    # todo (small reminder) in the original ipd it is NOT equal binning
    # returns a Series of binned points (drop_duplicates on it yields bins that are not equally frequent)
    def equal_frequency_binning(self, dim, bins_count):
        return pd.qcut(self.rank_data.sort_values(by=dim)[dim], bins_count)

    def equal_frequency_binning2(self, dim, bins_count):
        qcut = pd.qcut(self.rank_data.sort_values(by=dim)[dim], bins_count)
        return qcut.cat.rename_categories([i for i in range(bins_count)]).reindex(qcut.index)

    def get_rank_data(self):
        return self.rank_data
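
For orientation, a minimal usage sketch of the class above, assuming a toy one-column DataFrame (the data and names here are illustrative, not part of the commit's files):

import pandas as pd
from correlation_measures.binning import Binning

toy = pd.DataFrame({0: [0.1, 0.4, 0.2, 0.9, 0.5, 0.7]})  # illustrative data; integer column label as in main.py
binning = Binning(toy)
# interval-labelled equal-frequency bins over the ranks of column 0
print(binning.equal_frequency_binning(0, 3))
# the same bins relabelled with integer codes 0..bins_count-1
print(binning.equal_frequency_binning2(0, 3))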
1,994 changes: 1,994 additions & 0 deletions data/crime_small.csv

Large diffs are not rendered by default.

72 changes: 72 additions & 0 deletions data/testdata.csv
@@ -0,0 +1,72 @@
0.1,0.4,0.9,0.5,0
0.2,0.4,0.8,0.5,0
0.3,0.4,0.7,0.5,0
0.4,0.8,0.1,0.5,0
0.5,0.8,0.1,0.5,0
0.6,0.8,0.1,0.6,0
0.1,0.4,0.9,0.5,0
0.2,0.4,0.8,0.5,0
0.3,0.4,0.7,0.5,0
0.4,0.8,0.1,0.5,0
0.5,0.8,0.1,0.5,0
0.6,0.8,0.1,0.6,0
0.1,0.4,0.9,0.5,0
0.2,0.4,0.8,0.5,0
0.3,0.4,0.7,0.5,0
0.4,0.8,0.1,0.5,0
0.5,0.8,0.1,0.5,0
0.6,0.8,0.1,0.6,0
0.1,0.4,0.9,0.5,0
0.2,0.4,0.8,0.5,0
0.3,0.4,0.7,0.5,0
0.4,0.8,0.1,0.5,0
0.5,0.8,0.1,0.5,0
0.6,0.8,0.1,0.6,0
0.1,0.4,0.9,0.5,0
0.2,0.4,0.8,0.5,0
0.3,0.4,0.7,0.5,0
0.4,0.8,0.1,0.5,0
0.5,0.8,0.1,0.5,0
0.6,0.8,0.1,0.6,0
0.1,0.4,0.9,0.5,0
0.2,0.4,0.8,0.5,0
0.3,0.4,0.7,0.5,0
0.4,0.8,0.1,0.5,0
0.5,0.8,0.1,0.5,0
0.6,0.8,0.1,0.6,0
0.1,0.4,0.9,0.5,0
0.2,0.4,0.8,0.5,0
0.3,0.4,0.7,0.5,0
0.4,0.8,0.1,0.5,0
0.5,0.8,0.1,0.5,0
0.6,0.8,0.1,0.6,0
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
18 changes: 8 additions & 10 deletions main.py
@@ -2,6 +2,7 @@
import pandas as pd
import numpy as np

from correlation_measures.binning import Binning
from interaction_distance import computeIDs, compute_ID_threshold
from merging import dynamic_merging

@@ -59,8 +60,8 @@ def compute_optimal_discretization(data):
    # number of initial dist_bins
    # initBinsCount = int(math.ceil(math.sqrt(row_count))) # ceil in original ipd...
    # todo remove later
    initBinsCount = 20 # ceil in original ipd...
    print('initBinsCount: ', initBinsCount)
    init_bins_count = 20 # ceil in original ipd...
    print('initBinsCount: ', init_bins_count)

    # normalization step todo(optional)

@@ -69,15 +70,12 @@ def compute_optimal_discretization(data):

    disc_macro_intervals = []
    disc_points = []
    rank_data = data.rank(method='first')
    orig_binning = Binning(data)
    rank_data = orig_binning.get_rank_data()
    # iterate over all the dimensions
    for curr in range(dim_count):
        # original ids -> ranked_ids in the current dimension
        rank_data = rank_data.sort_values(by=curr)

        # todo (small reminder) in the original ipd it is NOT equal binning
        # Series of binned points
        bin_map = pd.qcut(rank_data[curr], initBinsCount)
        bin_map = orig_binning.equal_frequency_binning(curr, init_bins_count)

        # distinct bins
        dist_bins = bin_map.drop_duplicates().values
@@ -93,7 +91,7 @@ def compute_optimal_discretization(data):

        # todo replace by empty method later
        # table with costs, the value in i-th row and j-th column means cost of (i+1) micro bins merged into (j+1) macro bins
        F, discretizations = dynamic_merging(ID_threshold, IDs, initBinsCount)
        F, discretizations = dynamic_merging(ID_threshold, IDs, init_bins_count)

        print('dimension ' + str(curr))
        min_id = np.argmin(F[-1])
@@ -131,7 +129,7 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_
    for point in data.iterrows():
        macro_points.append(find_disc_macro_id(disc_macro_intervals, point[1][curr]))

    return (disc_macro_intervals, macro_points)
    return disc_macro_intervals, macro_points


data = pd.read_csv('example/simple.csv', delimiter=';', header=None)
13 changes: 13 additions & 0 deletions test_uds.py
@@ -0,0 +1,13 @@
from unittest import TestCase
import uds as u
import pandas as pd


class TestUds(TestCase):

    def setUp(self):
        self.data = pd.read_csv('data/testdata.csv', delimiter=',', header=None)

    def test_compute_CEs(self):
        c_es = u.compute_CEs(self.data)
        # sanity check: compute_CEs returns one cumulative entropy value per column
        self.assertEqual(len(c_es), self.data.shape[1])
        print(c_es)
142 changes: 142 additions & 0 deletions uds.py
@@ -0,0 +1,142 @@
import math
import pandas as pd
import numpy as np
from correlation_measures.binning import Binning

# bins count
BETA = 20


def compute_cond_CE(data, dim, I, point_ids):
    # conditional CE of dimension `dim`: every cell c of I contributes the CE of the points
    # it shares with point_ids, weighted by the cell's share len(c) / n of all points
    total_points_count = data.shape[0]
    return sum([len(c) / total_points_count * compute_CE(data.loc[point_ids.intersection(c), dim]) for c in I])


# optimal merging of dimension `prev`'s micro bins into macro bins, scored by the conditional CE of dimension `curr`
def dim_optimal_disc(prev, curr, binning, I, data):
    binned_points = binning.equal_frequency_binning2(prev, BETA)

    # Series with the cumulative support (running point count) of the micro bins
    support = binned_points.value_counts().sort_index().cumsum()

    b = []
    val = []

    merged_bins = []
    # cost of every single merged bin (transposed relative to the paper): i - row, upper bound; j - column, lower bound
    # todo worth considering a ufunc implementation in C (slow)
    f = []
    # upper bound
    for i in range(BETA):
        f_row = []
        merged_bins_row = []
        # lower bound
        for j in range(i + 1):
            merged_bin = binned_points[np.logical_and(binned_points <= i, binned_points >= j)].index
            merged_bins_row.append(merged_bin)

            f_row.append(compute_cond_CE(data, curr, I, merged_bin))
        f.append(f_row)
        merged_bins.append(merged_bins_row)

        b.append([[merged_bins_row[0]]])
        val.append([f_row[0]])

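    # Dynamic program over the number of macro bins:
    # val[i][l] holds the cheapest conditional CE of merging micro bins 0..i into l + 1 macro bins,
    # b[i][l] the corresponding discretization. The recurrence below splits off the last macro bin
    # (j + 1 .. i) and weights both parts by their share of the cumulative support:
    #   val[i][l] = min_j ((support[i] - support[j]) / support[i] * f[i][j + 1]
    #                      + support[j] / support[i] * val[j][l - 1])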
    for l in range(1, BETA):
        for i in range(l, BETA):
            min_cost = None
            arg_min = None
            for j in range(l - 1, i):
                temp_cost = ((support[i] - support[j]) / support[i] * f[i][j + 1]
                             + support[j] / support[i] * val[j][l - 1])
                if min_cost is None or temp_cost < min_cost:
                    min_cost = temp_cost
                    arg_min = j

            # val[i][l]
            val[i].append(min_cost)
            disc = b[arg_min][l - 1].copy()
            disc.append(merged_bins[i][arg_min + 1])
            b[i].append(disc)

    return val[-1], b[-1]


def compute_CEs(data):
    dim_count = data.shape[1]
    CEs = []
    for curr in range(dim_count):
        CE = compute_CE(data[curr])
        CEs.append(CE)
    return CEs


def compute_CE(data):
    m = data.shape[0]
    if m <= 1:
        return 0
    curr_data = data.sort_values()
    data_diff = (curr_data[1:] - curr_data.shift(1)[1:]).reset_index(drop=True)
    CE = -math.log(pd.Series([((i + 1) / m) ** ((i + 1) * data_diff[i] / m)
                              for i in range(len(data_diff))]).prod(), 2)
    return CE
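# Note: compute_CE above evaluates the cumulative entropy of a sorted sample x_1 <= ... <= x_m,
#   CE(X) = -sum_{k=1}^{m-1} (x_{k+1} - x_k) * (k/m) * log2(k/m),
# written as -log2 of a product of powers. For example, for the sample [0.0, 0.5, 1.0]
# this gives -(0.5 * (1/3) * log2(1/3) + 0.5 * (2/3) * log2(2/3)) ≈ 0.459.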


def entropy(I, N):
    return -sum([len(i) / N * math.log(len(i) / N, 2) for i in I])


# compute permutation: process dimensions in order of decreasing cumulative entropy
def compute_permutation(CEs):
    argsort = np.argsort(CEs).tolist()
    argsort.reverse()
    return argsort


def extend_I(I, disc):
    # refine the current cell structure: intersect every cell with every new macro bin
    disc_ = [i.intersection(j) for i in I for j in disc]

    return [d for d in disc_ if not d.empty]


if __name__ == "__main__":
    data = pd.read_csv('data/testdata.csv', delimiter=',', header=None)
    class_labels = data.pop(len(data.columns) - 1)
    dim_count = data.shape[1]

    binning = Binning(data)

    # compute CE for all the dimensions
    CEs = compute_CEs(data)

    perm = compute_permutation(CEs)

    # current cell structure: list of point-id Index objects, one per discretized cell
    I = [data.index]
    es = []

    uds = 0
    prev = perm[0]
    for dim in perm[1:]:
        # todo should I pass binning?
        costs, discs = dim_optimal_disc(prev, dim, binning, I, data)

        # regularization step: pick the number of macro bins by the regularized score,
        # but keep the raw conditional CE for the UDS update
        opt_cost = None
        opt_reg_cost = None
        opt_l = None
        opt_I = None
        for l, cost in enumerate(costs):
            temp_I = extend_I(I, discs[l])
            temp_cost = (cost / CEs[dim]
                         + entropy(temp_I, len(data)) / (math.log(BETA, 2) + sum([math.log(e + 1, 2) for e in es])))
            if opt_reg_cost is None or temp_cost < opt_reg_cost:
                opt_reg_cost = temp_cost
                opt_cost = cost
                opt_l = l
                opt_I = temp_I

        I = opt_I
        es.append(opt_l)
        uds += CEs[dim] - opt_cost
        prev = dim
    uds /= sum(CEs[1:])

    print(uds)
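
For reference, a minimal sketch of how the module is exercised, mirroring test_uds.py (names here are illustrative):

import pandas as pd
import uds as u

data = pd.read_csv('data/testdata.csv', delimiter=',', header=None)
data.pop(len(data.columns) - 1)  # drop the class-label column, as uds.py's __main__ does
print(u.compute_CEs(data))       # one cumulative entropy value per remaining dimension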
