Commit
optimal IPD
Tatiana Dembelova committed May 22, 2017
1 parent c483532 commit eebe8d8
Showing 5 changed files with 298 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,4 +1,6 @@
*.class
.idea/*
*.iml

# Mobile Tools for Java (J2ME)
.mtj.tmp/
15 changes: 15 additions & 0 deletions example/simple15.csv
@@ -0,0 +1,15 @@
0.660217;0.750373;0
0.460440;0.793855;0
0.460440;0.765323;0
0.460440;0.730397;0
0.460440;0.692839;0
0.445508;0.817600;0
0.428743;0.837934;0
0.428743;0.878658;0
0.428743;0.915953;0
0.428743;0.999258;0
0.819264;0.516152;0
0.819264;0.523430;0
0.559855;0.641614;0
0.534924;0.699147;0
0.801006;0.326723;0
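
For orientation: each row of this example holds two feature values and a trailing class label (all zeros here), separated by ';' with no header, which matches how main.py below loads its input. A minimal sketch of reading it, mirroring the data.pop split done in compute_optimal_discretization:

import pandas as pd

# load the toy example: two numeric columns plus a trailing class-label column
df = pd.read_csv('example/simple15.csv', delimiter=';', header=None)
labels = df.pop(df.shape[1] - 1)   # last column: class labels
features = df                      # remaining columns: the two dimensions to discretize
print(features.shape, labels.unique())   # (15, 2) [0]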
60 changes: 60 additions & 0 deletions interaction_distance.py
@@ -0,0 +1,60 @@
import math
import numpy as np


def computeIDs(bin_map, curr, data, dist_bins, dim_maxes):
    # interaction distance (ID) between every pair of consecutive micro bins of dimension curr,
    # measured on all remaining dimensions
    intra_bin_measures = []
    inter_bin_measures = []

    data_wo_curr = data.copy()
    data_wo_curr.pop(curr)  # todo slow?
    for bin_id, binn in enumerate(dist_bins):
        bin_data = data_wo_curr.loc[bin_map == binn]
        # print(bin_data)
        points_count = bin_data.shape[0]
        prev_bin_data = None
        inter_prod_matrix = None
        prev_points_count = None
        if bin_id > 0:
            prev_bin_data = data_wo_curr.loc[bin_map == dist_bins[bin_id - 1]]
            # print(prev_bin_data)
            prev_points_count = prev_bin_data.shape[0]
            inter_prod_matrix = np.ones([points_count, prev_points_count])

        intra_prod_matrix = np.ones([points_count, points_count])
        # product elements for each dimension
        for dim in bin_data:
            intra_elem = compute_ID_elem(bin_data[dim], bin_data[dim], dim_maxes[dim])
            intra_prod_matrix = np.multiply(intra_prod_matrix, intra_elem)

            if bin_id > 0:
                inter_elem = compute_ID_elem(bin_data[dim], prev_bin_data[dim], dim_maxes[dim])
                inter_prod_matrix = np.multiply(inter_prod_matrix, inter_elem)

        intra_bin_measures.append(np.sum(intra_prod_matrix) / points_count ** 2)

        if bin_id > 0:
            inter_bin_measures.append(2 * np.sum(inter_prod_matrix) / (points_count * prev_points_count))

    IDs = []
    for c, inter_measure in enumerate(inter_bin_measures):
        IDs.append(intra_bin_measures[c] - inter_measure + intra_bin_measures[c + 1])
    IDs = np.array(IDs)
    return IDs


def compute_ID_elem(bin1, bin2, dim_max):
    points_count1 = bin1.shape[0]
    points_count2 = bin2.shape[0]
    # max_i array
    max_array = np.ones([points_count1, points_count2])
    max_array.fill(dim_max)
    # max_i - max(R^i_{j_1}, R^i_{j_2})
    # work on the underlying ndarrays: ufunc.outer is not supported on pandas Series in newer pandas versions
    outer_max = np.maximum.outer(np.asarray(bin1), np.asarray(bin2))
    return max_array - outer_max


def compute_ID_threshold(IDs):
    # threshold at (roughly) the lower third of the sorted IDs
    IDs = IDs.copy()
    IDs.sort()
    # similar to original ipd (but possibly wrong) todo
    return IDs[math.ceil(int(len(IDs) / 3)) - 1]
    # return IDs[int(len(IDs) * ID_THRESHOLD_QUANTILE)]
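
A minimal usage sketch of the two functions above on synthetic data; the binning mirrors what main.py does with ranked values and pd.qcut, while the column names, point count, and bin count here are illustrative only:

import numpy as np
import pandas as pd

from interaction_distance import computeIDs, compute_ID_threshold

np.random.seed(0)
data = pd.DataFrame({0: np.random.rand(40), 1: np.random.rand(40)})  # two dimensions, integer column names
dim_maxes = data.max(0)

curr = 0                                       # dimension currently being discretized
init_bins = 5                                  # initial number of equal-frequency micro bins
rank_data = data.rank(method='first').sort_values(by=curr)
bin_map = pd.qcut(rank_data[curr], init_bins)  # micro bin of every point along dimension 0
dist_bins = bin_map.drop_duplicates().values   # the distinct micro bins, in ascending order

IDs = computeIDs(bin_map, curr, data, dist_bins, dim_maxes)  # one ID per pair of adjacent micro bins
print(IDs, compute_ID_threshold(IDs))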
143 changes: 143 additions & 0 deletions main.py
@@ -0,0 +1,143 @@
import math
import pandas as pd
import numpy as np

from interaction_distance import computeIDs, compute_ID_threshold
from merging import dynamic_merging

# ----------------------CONSTANTS-----------------------

ID_THRESHOLD_QUANTILE = 1.0 / 3
NORMALIZATION_RADIUS = 1
FILE_DATA_OUTPUT = "out.txt"
FILE_DATA_CUTS = 'cut.txt'


# ------------------------------------------------------

def find_disc_macro_id(disc_macro_intervals, point):
    for macro in disc_macro_intervals.items():
        if macro[1][0] <= point <= macro[1][1]:
            return macro[0]
    raise ValueError("Micro bin is not covered by any of the macro intervals!", disc_macro_intervals, point)


def writeOutFile(name, disc_intervals, disc_points, class_labels):
    with open(name, 'w') as out:
        out.write('@relation DB\n\n')
        counter = [1]
        for i in range(len(disc_intervals)):
            out.write(
                '@attribute dim' + str(i) + ' {' + ','.join([str(j + counter[-1]) for j in disc_intervals[i]]) + '}\n')
            counter.append(counter[-1] + len(disc_intervals[i]))
        out.write('@attribute class {' + ','.join(['"' + str(i) + '"' for i in class_labels.unique()]) + '}\n\n')
        out.write('@data\n')

        for i in range(len(disc_points[0])):
            for j in range(len(disc_points)):
                out.write(str(disc_points[j][i] + counter[j]))
                out.write(',')
            out.write('"' + str(class_labels[i]) + '"\n')


def writeCutFile(name, disc_intervals):
    with open(name, 'w') as out:
        for i in range(len(disc_intervals)):
            out.write('dimension ' + str(i) + ' (' + str(len(disc_intervals[i])) + ' bins)\n')
            for bin in disc_intervals[i]:
                out.write(str(disc_intervals[i][bin][1]) + '\n')
            out.write('-------------------------------------\n')


def compute_optimal_discretization(data):
    # class labels are not of much use in original ipd..
    class_labels = data.pop(data.shape[1] - 1)
    dim_count = data.shape[1]
    # dimension maximums
    dim_maxes = data.max(0)

    # number of initial dist_bins
    # initBinsCount = int(math.ceil(math.sqrt(row_count)))  # ceil in original ipd...
    # todo remove later
    initBinsCount = 20  # ceil in original ipd...
    print('initBinsCount: ', initBinsCount)

    # normalization step todo(optional)

    # data = data.apply(lambda x: 2 * NORMALIZATION_RADIUS * (x - x.min()) / (
    #     x.max() - x.min()) - NORMALIZATION_RADIUS if x.max() != x.min() else pd.Series(-np.ones(x.shape)))

    disc_macro_intervals = []
    disc_points = []
    rank_data = data.rank(method='first')
    # iterate over all the dimensions
    for curr in range(dim_count):
        # original ids -> ranked_ids in the current dimension
        rank_data = rank_data.sort_values(by=curr)

        # todo (small reminder) in the original ipd it is NOT equal binning
        # Series of binned points
        bin_map = pd.qcut(rank_data[curr], initBinsCount)

        # distinct bins
        dist_bins = bin_map.drop_duplicates().values

        # -----------------------------INTERACTION DISTANCES----------------------------------

        # for each bin along the current dimension compute inner measure B and inter measure
        IDs = computeIDs(bin_map, curr, data, dist_bins, dim_maxes)
        ID_threshold = compute_ID_threshold(IDs)
        print('ID_threshold', ID_threshold)

        # -----------------------------OPTIMAL MERGE STRATEGY----------------------------------

        # todo replace by empty method later
        # table with costs, the value in i-th row and j-th column means cost of (i+1) micro bins merged into (j+1) macro bins
        F, discretizations = dynamic_merging(ID_threshold, IDs, initBinsCount)

        print('dimension ' + str(curr))
        min_id = np.argmin(F[-1])
        print('cost ' + str(F[-1, min_id]))

        (curr_macro_intervals, curr_macro_points) = get_discretized_points(curr, data, discretizations, dist_bins,
                                                                           min_id, rank_data)

        print(curr_macro_intervals)
        print(curr_macro_points)

        disc_macro_intervals.append(curr_macro_intervals)
        disc_points.append(curr_macro_points)

    return disc_macro_intervals, disc_points, class_labels


def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_data):
    disc_macro_intervals = dict()
    for i, macro_bin in enumerate(discretizations[-1][min_id]):
        macro_interval = []
        for micro_bin_id in macro_bin:
            right = \
                data.loc[rank_data[rank_data[curr] == math.floor(dist_bins[micro_bin_id].right)][curr].index[0]][curr]
            if not len(macro_interval):
                macro_interval.append(
                    data.loc[rank_data[rank_data[curr] == math.ceil(dist_bins[micro_bin_id].left)][curr].index[0]][
                        curr])
                macro_interval.append(right)
            else:
                macro_interval[1] = right
        disc_macro_intervals[i] = macro_interval

    macro_points = []
    for point in data.iterrows():
        macro_points.append(find_disc_macro_id(disc_macro_intervals, point[1][curr]))

    return (disc_macro_intervals, macro_points)


data = pd.read_csv('example/simple.csv', delimiter=';', header=None)

disc_intervals, disc_points, class_labels = compute_optimal_discretization(data)

writeOutFile(FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)

writeCutFile(FILE_DATA_CUTS, disc_intervals)
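
One non-obvious detail in writeOutFile is the counter list: it offsets each dimension's macro-bin ids so that the attribute values written to the output file are globally unique across dimensions. A hypothetical illustration of just that offset logic (the interval values below are made up):

# hypothetical discretization result: dim 0 has 3 macro bins, dim 1 has 2
disc_intervals = [
    {0: [0.0, 0.4], 1: [0.4, 0.7], 2: [0.7, 1.0]},
    {0: [0.0, 0.5], 1: [0.5, 1.0]},
]
counter = [1]
for i in range(len(disc_intervals)):
    print('dim', i, 'attribute values:', [j + counter[-1] for j in disc_intervals[i]])
    counter.append(counter[-1] + len(disc_intervals[i]))
# dim 0 attribute values: [1, 2, 3]
# dim 1 attribute values: [4, 5]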
78 changes: 78 additions & 0 deletions merging.py
@@ -0,0 +1,78 @@
import math
import pandas as pd
import numpy as np
from scipy.special import comb

QUASI_UNIFORM_CODE_INITIAL_NUMBER = 2.865064


def quasi_uniform_code(n):
    # universal (iterated-log) code length of a positive integer n
    l = 0
    while n > 1:
        n = math.log(n, 2)
        l += n
    return l + math.log(QUASI_UNIFORM_CODE_INITIAL_NUMBER, 2)


def break_points_number(macro_bin, IDs, ID_threshold):
    # number of inner micro-bin borders within the macro bin whose ID exceeds the threshold
    ID_boolean = [1 if ID > ID_threshold else 0 for ID in IDs[macro_bin[:-1]]]
    return sum(ID_boolean)


def compute_bin_cost(c, l, k, macro_bin, IDs, ID_threshold):
    # cost of making micro bins l..c-1 the k-th macro bin, given that the first l micro bins form k-1 macro bins
    macro_bin_size = len(macro_bin)
    if macro_bin_size != c - l:
        raise ValueError(str(c) + " != " + str(l))

    macro_bin_size_code = quasi_uniform_code(macro_bin_size)
    break_points_size = break_points_number(macro_bin, IDs, ID_threshold)

    # todo in the original ipd L_disc L_N is computed for (k-1)
    # L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
    L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)
    # todo in the original ipd L_disc L_N is computed for (k-1)
    # L_disc_prev = - (quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
    L_disc_prev = - (quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)

    L_disc_M_ind = macro_bin_size_code - math.log(macro_bin_size / c, 2) * (macro_bin_size + 1)
    L_disc_M_ind_prev = - (math.log(l / c, 2) * (k - 1 + l) if l > 0 else 0)

    L_disc_M_mh = quasi_uniform_code(break_points_size) + math.log(macro_bin_size - 1, 2) * break_points_size \
        if break_points_size > 0 else 0

    L_errors = math.log(macro_bin_size, 2) * macro_bin_size

    return L_disc + L_disc_M_ind + L_disc_M_mh + L_errors + L_disc_prev + L_disc_M_ind_prev


def dynamic_merging(ID_threshold, IDs, initBinsCount):
    F = np.zeros([initBinsCount, initBinsCount])
    discretizations = []
    # compute when we merge first c initial dist_bins into 1 and #macro dist_bins k = 1
    k_ = 0
    k = k_ + 1
    for c_ in range(initBinsCount):
        c = c_ + 1
        micro_bins = [i for i in range(c)]
        F[c_, k_] = compute_bin_cost(c, 0, k, micro_bins, IDs, ID_threshold)
        c_disc = [[micro_bins]]
        discretizations.append(c_disc)
    for k_ in range(1, initBinsCount):
        k = k_ + 1
        for c_ in range(k_, initBinsCount):
            c = c_ + 1
            min_F = None
            first_l_micro_bins = None
            last_micro_bins = None
            # search for the best # of microbins in the first (k - 1) macrobins: l
            for l_ in range(k_ - 1, c_):
                l = l_ + 1
                micro_bins = [i for i in range(l, c)]
                temp_F = F[l_, k_ - 1] + compute_bin_cost(c, l, k, micro_bins, IDs, ID_threshold)
                if min_F is None or temp_F < min_F:
                    min_F = temp_F
                    first_l_micro_bins = discretizations[l_][k_ - 1]
                    last_micro_bins = micro_bins
            F[c_, k_] = min_F
            disc = first_l_micro_bins.copy()
            disc.append(last_micro_bins)
            discretizations[c_].append(disc)
    return F, discretizations
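
A minimal sketch of calling dynamic_merging directly on synthetic interaction distances; the ID values and threshold below are made up for illustration (with 8 micro bins there are 7 adjacent-bin IDs):

import numpy as np

from merging import dynamic_merging

IDs = np.array([0.02, 0.03, 0.40, 0.02, 0.01, 0.35, 0.02])  # synthetic adjacent-bin interaction distances
ID_threshold = 0.1
init_bins = 8

F, discretizations = dynamic_merging(ID_threshold, IDs, init_bins)
best = np.argmin(F[-1])                    # index (k - 1) of the cheapest number of macro bins
print('best number of macro bins:', best + 1)
print('macro bins (lists of micro bin ids):', discretizations[-1][best])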
