Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
optimal IPD
- Loading branch information
Tatiana Dembelova
committed
May 22, 2017
1 parent
c483532
commit eebe8d8
Showing
5 changed files
with
298 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,6 @@ | ||
*.class | ||
.idea/* | ||
*.iml | ||
|
||
# Mobile Tools for Java (J2ME) | ||
.mtj.tmp/ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
0.660217;0.750373;0 | ||
0.460440;0.793855;0 | ||
0.460440;0.765323;0 | ||
0.460440;0.730397;0 | ||
0.460440;0.692839;0 | ||
0.445508;0.817600;0 | ||
0.428743;0.837934;0 | ||
0.428743;0.878658;0 | ||
0.428743;0.915953;0 | ||
0.428743;0.999258;0 | ||
0.819264;0.516152;0 | ||
0.819264;0.523430;0 | ||
0.559855;0.641614;0 | ||
0.534924;0.699147;0 | ||
0.801006;0.326723;0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import math | ||
import numpy as np | ||
|
||
def computeIDs(bin_map, curr, data, dist_bins, dim_maxes):
    """Compute the interaction distance (ID) between each pair of consecutive
    bins along dimension `curr`.

    Parameters
    ----------
    bin_map : pd.Series
        For every data point, the bin (interval) it falls into along `curr`.
    curr : column label of the dimension currently being discretized; this
        column is excluded from the distance computation below.
    data : pd.DataFrame with the full data set.
    dist_bins : sequence of the distinct bins, in order along `curr`.
    dim_maxes : per-column maxima of `data`, indexed like its columns.

    Returns
    -------
    np.ndarray of length len(dist_bins) - 1; element c is the ID between
    bin c and bin c + 1.
    """
    intra_bin_measures = []  # self-similarity measure of each bin
    inter_bin_measures = []  # cross measure between consecutive bins

    # drop the dimension being discretized; distances use the other dims only
    data_wo_curr = data.copy()
    data_wo_curr.pop(curr)  # todo slow?
    for bin_id, binn in enumerate(dist_bins):
        # rows of the data set that fall into this bin
        bin_data = data_wo_curr.loc[bin_map == binn]
        # print(bin_data)
        points_count = bin_data.shape[0]
        prev_bin_data = None
        inter_prod_matrix = None
        prev_points_count = None
        if bin_id > 0:
            prev_bin_data = data_wo_curr.loc[bin_map == dist_bins[bin_id - 1]]
            # print(prev_bin_data)
            prev_points_count = prev_bin_data.shape[0]
            inter_prod_matrix = np.ones([points_count, prev_points_count])

        intra_prod_matrix = np.ones([points_count, points_count])
        # product elements for each dimension
        for dim in bin_data:
            intra_elem = compute_ID_elem(bin_data[dim], bin_data[dim], dim_maxes[dim])
            intra_prod_matrix = np.multiply(intra_prod_matrix, intra_elem)

            if bin_id > 0:
                inter_elem = compute_ID_elem(bin_data[dim], prev_bin_data[dim], dim_maxes[dim])
                inter_prod_matrix = np.multiply(inter_prod_matrix, inter_elem)

        # mean of the pairwise products within the bin
        intra_bin_measures.append(np.sum(intra_prod_matrix) / points_count ** 2)

        if bin_id > 0:
            # NOTE(review): divides by points_count * prev_points_count; an
            # empty bin would raise ZeroDivisionError — presumably qcut
            # guarantees non-empty bins. Confirm.
            inter_bin_measures.append(2 * np.sum(inter_prod_matrix) / (points_count * prev_points_count))
    IDs = []
    # ID between bins c and c+1: intra(c) - inter(c, c+1) + intra(c+1)
    for c, inter_measure in enumerate(inter_bin_measures):
        IDs.append(intra_bin_measures[c] - inter_measure + intra_bin_measures[c + 1])
    IDs = np.array(IDs)
    return IDs
|
||
|
||
def compute_ID_elem(bin1, bin2, dim_max):
    """Per-dimension factor of the interaction distance.

    For every pair (v1, v2) with v1 from `bin1` and v2 from `bin2`, compute
    dim_max - max(v1, v2), i.e. max_i - max(R^i_{j_1}, R^i_{j_2}).

    Parameters
    ----------
    bin1, bin2 : 1-D array-likes (e.g. pd.Series) of values in one dimension.
    dim_max : scalar maximum of that dimension over the whole data set.

    Returns
    -------
    np.ndarray of shape (len(bin1), len(bin2)).
    """
    # np.maximum.outer already pairs every element of bin1 with every element
    # of bin2, so neither the transpose (a no-op on 1-D input) nor an
    # explicit dim_max-filled matrix of the original code is needed.
    # np.asarray keeps ufunc.outer working when Series are passed in.
    return dim_max - np.maximum.outer(np.asarray(bin1), np.asarray(bin2))
|
||
|
||
def compute_ID_threshold(IDs):
    """Pick the ID threshold as (roughly) the lower-third order statistic
    of the interaction distances. The input array is not modified.
    """
    ordered = IDs.copy()
    ordered.sort()
    # index choice matches the original ipd behaviour (but possibly wrong) todo
    threshold_index = math.ceil(int(len(ordered) / 3)) - 1
    return ordered[threshold_index]
    # return IDs[int(len(IDs) * ID_THRESHOLD_QUANTILE)]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
import math | ||
import pandas as pd | ||
import numpy as np | ||
|
||
from interaction_distance import computeIDs, compute_ID_threshold | ||
from merging import dynamic_merging | ||
|
||
# ----------------------CONSTANTS-----------------------

# quantile apparently intended for the alternative, quantile-based threshold
# rule (only referenced by a commented-out line in compute_ID_threshold)
ID_THRESHOLD_QUANTILE = 1.0 / 3
# half-width of the target range of the (currently commented-out)
# normalization step in compute_optimal_discretization
NORMALIZATION_RADIUS = 1
# output file with the discretized data (ARFF-like format)
FILE_DATA_OUTPUT = "out.txt"
# output file with the cut points of every dimension
FILE_DATA_CUTS = 'cut.txt'


# ------------------------------------------------------
|
||
def find_disc_macro_id(disc_macro_intervals, point):
    """Return the id of the first macro interval that contains `point`.

    `disc_macro_intervals` maps macro-bin id -> [left, right] bounds.
    Raises ValueError when no interval covers the point.
    """
    for macro_id, bounds in disc_macro_intervals.items():
        if bounds[0] <= point <= bounds[1]:
            return macro_id
    raise ValueError("Micro bin is not covered by any of the macro intervals!", disc_macro_intervals, point)
|
||
|
||
def writeOutFile(name, disc_intervals, disc_points, class_labels):
    """Write the discretized data set to `name` in an ARFF-like format.

    Macro-bin ids of dimension d are shifted by a per-dimension offset so
    that every attribute uses a globally unique range of integer values.

    disc_intervals : per-dimension dict of macro-bin id -> interval
    disc_points : per-dimension list of macro-bin ids, one per data point
    class_labels : pd.Series of class labels (must support .unique())
    """
    # cumulative offset of each dimension's value range (1-based)
    offsets = [1]
    for bins in disc_intervals:
        offsets.append(offsets[-1] + len(bins))

    with open(name, 'w') as out:
        out.write('@relation DB\n\n')
        for dim, bins in enumerate(disc_intervals):
            values = ','.join(str(b + offsets[dim]) for b in bins)
            out.write(f'@attribute dim{dim} {{{values}}}\n')
        classes = ','.join(f'"{label}"' for label in class_labels.unique())
        out.write(f'@attribute class {{{classes}}}\n\n')
        out.write('@data\n')

        for row in range(len(disc_points[0])):
            for dim in range(len(disc_points)):
                out.write(f'{disc_points[dim][row] + offsets[dim]},')
            out.write(f'"{class_labels[row]}"\n')
|
||
|
||
def writeCutFile(name, disc_intervals):
    """Write an IPD-style cut file: for every dimension, the right endpoint
    of each macro bin, separated by a dashed line per dimension."""
    with open(name, 'w') as out:
        for dim, intervals in enumerate(disc_intervals):
            out.write(f'dimension {dim} ({len(intervals)} bins)\n')
            for bin_id in intervals:
                out.write(f'{intervals[bin_id][1]}\n')
            out.write('-------------------------------------\n')
|
||
|
||
def compute_optimal_discretization(data):
    """Discretize every dimension of `data` with the MDL-optimal IPD merge
    strategy.

    Parameters
    ----------
    data : pd.DataFrame
        Numeric data whose LAST column holds class labels. That column is
        popped, i.e. `data` is mutated by this call.

    Returns
    -------
    (disc_macro_intervals, disc_points, class_labels)
        disc_macro_intervals[d] : dict macro-bin id -> [left, right] value
            interval for dimension d
        disc_points[d] : macro-bin id of every data point along dimension d
        class_labels : pd.Series popped from the last column of `data`
    """
    # class labels are not of much use in original ipd..
    class_labels = data.pop(data.shape[1] - 1)
    dim_count = data.shape[1]
    # dimension maximums
    dim_maxes = data.max(0)

    # number of initial dist_bins
    # initBinsCount = int(math.ceil(math.sqrt(row_count)))  # ceil in original ipd...
    # todo remove later
    initBinsCount = 20  # ceil in original ipd...
    print('initBinsCount: ', initBinsCount)

    # normalization step todo(optional)

    # data = data.apply(lambda x: 2 * NORMALIZATION_RADIUS * (x - x.min()) / (
    #     x.max() - x.min()) - NORMALIZATION_RADIUS if x.max() != x.min() else pd.Series(-np.ones(x.shape)))

    disc_macro_intervals = []
    disc_points = []
    # ranks make the initial binning equal-frequency regardless of value scale
    rank_data = data.rank(method='first')
    # iterate over all the dimensions
    for curr in range(dim_count):
        # original ids -> ranked_ids in the current dimension
        rank_data = rank_data.sort_values(by=curr)

        # todo (small reminder) in the original ipd it is NOT equal binning
        # Series of binned points
        bin_map = pd.qcut(rank_data[curr], initBinsCount)

        # distinct bins
        dist_bins = bin_map.drop_duplicates().values

        # -----------------------------INTERACTION DISTANCES----------------------------------

        # for each bin along the current dimension compute inner measure B and inter measure
        IDs = computeIDs(bin_map, curr, data, dist_bins, dim_maxes)
        ID_threshold = compute_ID_threshold(IDs)
        print('ID_threshold', ID_threshold)

        # -----------------------------OPTIMAL MERGE STRATEGY----------------------------------

        # todo replace by empty method later
        # table with costs, the value in i-th row and j-th column means cost of (i+1) micro bins merged into (j+1) macro bins
        F, discretizations = dynamic_merging(ID_threshold, IDs, initBinsCount)

        # cheapest entry of the last row (all micro bins consumed) picks the
        # optimal number of macro bins
        print('dimension ' + str(curr))
        min_id = np.argmin(F[-1])
        print('cost ' + str(F[-1, min_id]))

        (curr_macro_intervals, curr_macro_points) = get_discretized_points(curr, data, discretizations, dist_bins,
                                                                           min_id, rank_data)

        print(curr_macro_intervals)
        print(curr_macro_points)

        disc_macro_intervals.append(curr_macro_intervals)
        disc_points.append(curr_macro_points)

    return disc_macro_intervals, disc_points, class_labels
|
||
|
||
def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_data):
    """Translate the optimal merge for dimension `curr` back into value
    intervals and assign every point its macro-bin id.

    discretizations[-1][min_id] is the chosen partition: a list of macro
    bins, each a list of micro-bin indices into `dist_bins`.

    Returns (disc_macro_intervals, macro_points) where disc_macro_intervals
    maps macro-bin id -> [left, right] in original data values and
    macro_points lists the macro-bin id of every point of `data`.
    """
    # NOTE(review): assumes dist_bins entries expose .left/.right rank
    # borders (pandas Interval objects, as produced by pd.qcut) — confirm
    # against the pandas version in use.
    disc_macro_intervals = dict()
    for i, macro_bin in enumerate(discretizations[-1][min_id]):
        macro_interval = []
        for micro_bin_id in macro_bin:
            # value of the point whose rank equals the micro bin's right border
            right = \
                data.loc[rank_data[rank_data[curr] == math.floor(dist_bins[micro_bin_id].right)][curr].index[0]][curr]
            if not len(macro_interval):
                # first micro bin of this macro bin: initialize both borders
                macro_interval.append(
                    data.loc[rank_data[rank_data[curr] == math.ceil(dist_bins[micro_bin_id].left)][curr].index[0]][
                        curr])
                macro_interval.append(right)
            else:
                # subsequent micro bins only push the right border outward
                macro_interval[1] = right
        disc_macro_intervals[i] = macro_interval

    macro_points = []
    for point in data.iterrows():
        macro_points.append(find_disc_macro_id(disc_macro_intervals, point[1][curr]))

    return (disc_macro_intervals, macro_points)
|
||
|
||
def main():
    """Run the IPD pipeline: load the example data set, compute the optimal
    discretization of every dimension, and write the output and cut files."""
    # last column of the csv holds the class labels
    # (popped inside compute_optimal_discretization)
    data = pd.read_csv('example/simple.csv', delimiter=';', header=None)

    disc_intervals, disc_points, class_labels = compute_optimal_discretization(data)

    writeOutFile(FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)

    writeCutFile(FILE_DATA_CUTS, disc_intervals)


if __name__ == '__main__':
    # guard keeps the pipeline from running on mere import of this module
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
import math | ||
import pandas as pd | ||
import numpy as np | ||
from scipy.special import comb | ||
|
||
# normalizing constant of the quasi-uniform (log-star style) integer code
QUASI_UNIFORM_CODE_INITIAL_NUMBER = 2.865064


def quasi_uniform_code(n):
    """Code length (in bits) of the quasi-uniform code for a positive
    integer n: log2(n) + log2(log2(n)) + ... down to 1, plus the constant
    log2(2.865064) normalizing the code lengths.
    """
    total = 0
    while n > 1:
        n = math.log(n, 2)
        total += n
    return total + math.log(QUASI_UNIFORM_CODE_INITIAL_NUMBER, 2)
|
||
|
||
def break_points_number(macro_bin, IDs, ID_threshold):
    """Count the micro-bin borders inside `macro_bin` whose interaction
    distance exceeds `ID_threshold`.

    The last micro bin's ID describes the border to the *next* macro bin,
    so it is excluded (macro_bin[:-1]).
    """
    return sum(1 for ID in IDs[macro_bin[:-1]] if ID > ID_threshold)
|
||
def compute_bin_cost(c, l, k, macro_bin, IDs, ID_threshold):
    """MDL cost (in bits) of making micro bins (l, c] the k-th macro bin.

    Parameters
    ----------
    c : int
        Total number of micro bins consumed after this macro bin.
    l : int
        Number of micro bins consumed by the previous k - 1 macro bins.
    k : int
        1-based index of the macro bin being formed.
    macro_bin : list of micro-bin indices merged into this macro bin;
        its length must equal c - l.
    IDs : np.ndarray of interaction distances between consecutive micro bins.
    ID_threshold : float break-point threshold.

    Returns
    -------
    float: code-length contribution of this macro bin, including the
    `*_prev` correction terms that replace the previous step's estimate.

    Raises
    ------
    ValueError if len(macro_bin) != c - l.
    """
    macro_bin_size = len(macro_bin)
    if macro_bin_size != c - l:
        # bug fix: the original wrote `ValueError(c + "!=" + l)`, which
        # concatenates ints with a str and raises TypeError instead of the
        # intended ValueError.
        raise ValueError(f"{c}!={l}")

    macro_bin_size_code = quasi_uniform_code(macro_bin_size)
    break_points_size = break_points_number(macro_bin, IDs, ID_threshold)

    # todo in the original ipd L_disc L_N is computed for (k-1)
    # L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
    L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)
    # todo in the original ipd L_disc L_N is computed for (k-1)
    # L_disc_prev = - (quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
    L_disc_prev = - (quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)

    # cost of encoding the bin size and the points' membership
    L_disc_M_ind = macro_bin_size_code - math.log(macro_bin_size / c, 2) * (macro_bin_size + 1)
    L_disc_M_ind_prev = - (math.log(l / c, 2) * (k - 1 + l) if l > 0 else 0)

    # cost of encoding the positions of the high-ID break points inside the bin
    L_disc_M_mh = quasi_uniform_code(break_points_size) + math.log(macro_bin_size - 1, 2) * break_points_size \
        if break_points_size > 0 else 0

    L_errors = math.log(macro_bin_size, 2) * macro_bin_size

    return L_disc + L_disc_M_ind + L_disc_M_mh + L_errors + L_disc_prev + L_disc_M_ind_prev
|
||
|
||
def dynamic_merging(ID_threshold, IDs, initBinsCount):
    """Find the cost-optimal merging of micro bins into macro bins via
    dynamic programming.

    Parameters
    ----------
    ID_threshold : float
        Threshold deciding which interaction distances count as break points.
    IDs : np.ndarray
        Interaction distances between consecutive micro bins.
    initBinsCount : int
        Number of initial micro bins.

    Returns
    -------
    F : np.ndarray of shape (initBinsCount, initBinsCount)
        F[c_, k_] is the minimal cost of merging the first (c_ + 1) micro
        bins into (k_ + 1) macro bins.
    discretizations : list
        discretizations[c_][k_] is the corresponding partition: a list of
        macro bins, each a list of micro-bin indices.
    """
    F = np.zeros([initBinsCount, initBinsCount])
    discretizations = []
    # base case: merge the first c micro bins into a single macro bin (k = 1)
    k_ = 0
    k = k_ + 1
    for c_ in range(initBinsCount):
        c = c_ + 1
        micro_bins = [i for i in range(c)]
        F[c_, k_] = compute_bin_cost(c, 0, k, micro_bins, IDs, ID_threshold)
        c_disc = [[micro_bins]]
        discretizations.append(c_disc)
    # general case: k macro bins from c micro bins, extending the best
    # (k - 1)-macro-bin solution over the first l micro bins
    for k_ in range(1, initBinsCount):
        k = k_ + 1
        for c_ in range(k_, initBinsCount):
            c = c_ + 1
            min_F = None
            first_l_micro_bins = None
            last_micro_bins = None
            # search for the best # of microbins in the first (k - 1) macrobins: l
            for l_ in range(k_ - 1, c_):
                l = l_ + 1
                micro_bins = [i for i in range(l, c)]
                temp_F = F[l_, k_ - 1] + compute_bin_cost(c, l, k, micro_bins, IDs, ID_threshold)
                # bug fix: the original tested `not min_F`, which also treats
                # a legitimate cost of exactly 0.0 as "no candidate yet" and
                # would discard the stored optimum; compare against None.
                if min_F is None or temp_F < min_F:
                    min_F = temp_F
                    first_l_micro_bins = discretizations[l_][k_ - 1]
                    last_micro_bins = micro_bins
            F[c_, k_] = min_F
            disc = first_l_micro_bins.copy()
            disc.append(last_micro_bins)
            discretizations[c_].append(disc)
    return F, discretizations