Skip to content
Permalink
eebe8d803f
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
78 lines (65 sloc) 3.04 KB
import math
import pandas as pd
import numpy as np
from scipy.special import comb
# Additive constant of the code length below; its base-2 logarithm is added to
# every result.
# NOTE(review): 2.865064 looks like the normalising constant of Rissanen's
# universal code for integers -- confirm against the reference paper.
QUASI_UNIFORM_CODE_INITIAL_NUMBER = 2.865064


def quasi_uniform_code(n):
    """Return the iterated-logarithm code length of ``n`` in bits.

    Starting from ``n``, the base-2 logarithm is applied repeatedly for as
    long as the current value is greater than 1, and every logarithm obtained
    this way is accumulated.  The base-2 logarithm of
    ``QUASI_UNIFORM_CODE_INITIAL_NUMBER`` is added to the accumulated sum.

    For ``n <= 1`` the loop body never runs, so only the constant term is
    returned.
    """
    iterated_log_sum = 0
    current = n
    while current > 1:
        current = math.log(current, 2)
        iterated_log_sum += current
    constant_term = math.log(QUASI_UNIFORM_CODE_INITIAL_NUMBER, 2)
    return iterated_log_sum + constant_term
def break_points_number(macro_bin, IDs, ID_threshold):
    """Count the entries of ``IDs`` inside a macro bin that exceed a threshold.

    ``macro_bin`` is a list of indices into ``IDs``.  The last index is left
    out of the count; every other selected value that is strictly greater than
    ``ID_threshold`` counts as one break point.

    Assumes ``IDs`` supports indexing by a list of positions (e.g. a NumPy
    array) -- TODO confirm against the callers.

    Returns the count as an ``int`` (0 for a macro bin of size 0 or 1).
    """
    inner_indices = macro_bin[:-1]
    return sum(1 for value in IDs[inner_indices] if value > ID_threshold)
def compute_bin_cost(c, l, k, macro_bin, IDs, ID_threshold):
    """Return the cost change from appending ``macro_bin`` as the k-th macro bin.

    The result is the sum of four terms computed for the discretization of the
    first ``c`` micro bins into ``k`` macro bins, plus two negated "previous"
    terms computed for the first ``l`` micro bins in ``k - 1`` macro bins.
    NOTE(review): the terms appear to be the MDL code lengths of the IPD
    discretization model (see the todo notes below) -- confirm against the
    reference implementation.

    Parameters:
        c: number of micro bins covered once ``macro_bin`` is appended.
        l: number of micro bins covered by the preceding ``k - 1`` macro bins.
        k: number of macro bins once ``macro_bin`` is appended.
        macro_bin: list of micro-bin indices forming the new macro bin; its
            length must equal ``c - l``.
        IDs: values indexed by micro-bin position, passed through to
            ``break_points_number``.
        ID_threshold: threshold passed through to ``break_points_number``.

    Raises:
        ValueError: if ``len(macro_bin) != c - l``.
    """
    macro_bin_size = len(macro_bin)
    if macro_bin_size != c - l:
        # c and l are integers, so the message is built with format();
        # concatenating them to a str would raise TypeError and hide the
        # intended ValueError.
        raise ValueError(
            "macro bin size {} != c - l = {} - {}".format(macro_bin_size, c, l)
        )

    macro_bin_size_code = quasi_uniform_code(macro_bin_size)
    break_points_size = break_points_number(macro_bin, IDs, ID_threshold)

    # todo in the original ipd L_disc L_N is computed for (k-1)
    # L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
    L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)

    # todo in the original ipd L_disc L_N is computed for (k-1)
    # L_disc_prev = - (quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
    # Negated counterpart of L_disc for the preceding (l, k - 1) state; there
    # is no preceding state when k == 1.
    L_disc_prev = -(
        quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2)
        if k > 1
        else 0
    )

    L_disc_M_ind = macro_bin_size_code - math.log(macro_bin_size / c, 2) * (macro_bin_size + 1)
    # Negated counterpart for the first l micro bins; skipped for l == 0,
    # where log(l / c) is undefined.
    L_disc_M_ind_prev = -(math.log(l / c, 2) * (k - 1 + l) if l > 0 else 0)

    # Only contributes when the bin contains at least one break point.
    L_disc_M_mh = (
        quasi_uniform_code(break_points_size)
        + math.log(macro_bin_size - 1, 2) * break_points_size
        if break_points_size > 0
        else 0
    )

    L_errors = math.log(macro_bin_size, 2) * macro_bin_size

    return L_disc + L_disc_M_ind + L_disc_M_mh + L_errors + L_disc_prev + L_disc_M_ind_prev
def dynamic_merging(ID_threshold, IDs, initBinsCount):
    """Fill the dynamic-programming table for merging micro bins into macro bins.

    ``F[c_, k_]`` holds the minimal accumulated ``compute_bin_cost`` of
    splitting the first ``c_ + 1`` micro bins into ``k_ + 1`` contiguous macro
    bins.  Entries with ``k_ > c_`` are never written and keep their initial
    value of 0.

    Parameters:
        ID_threshold: threshold forwarded to ``compute_bin_cost``.
        IDs: values indexed by micro-bin position, forwarded to
            ``compute_bin_cost``.
        initBinsCount: number of initial micro bins.

    Returns:
        A tuple ``(F, discretizations)``.  ``F`` is an
        ``initBinsCount x initBinsCount`` array of costs.
        ``discretizations[c_][k_]`` is the discretization that achieves
        ``F[c_, k_]``: a list of macro bins, each a list of micro-bin indices.
    """
    F = np.zeros([initBinsCount, initBinsCount])
    discretizations = []

    # compute when we merge first c initial dist_bins into 1 and #macro dist_bins k = 1
    k_ = 0
    k = k_ + 1
    for c_ in range(initBinsCount):
        c = c_ + 1
        micro_bins = list(range(c))
        F[c_, k_] = compute_bin_cost(c, 0, k, micro_bins, IDs, ID_threshold)
        # One entry per k_; for k_ == 0 the discretization is a single macro bin.
        discretizations.append([[micro_bins]])

    for k_ in range(1, initBinsCount):
        k = k_ + 1
        for c_ in range(k_, initBinsCount):
            c = c_ + 1
            min_F = None
            first_l_micro_bins = None
            last_micro_bins = None
            # search for the best # of microbins in the first (k - 1) macrobins: l
            for l_ in range(k_ - 1, c_):
                l = l_ + 1
                micro_bins = list(range(l, c))
                temp_F = F[l_, k_ - 1] + compute_bin_cost(c, l, k, micro_bins, IDs, ID_threshold)
                # Compare against None explicitly: a truthiness test would
                # treat a running minimum of exactly 0.0 as "not set yet" and
                # let a worse candidate overwrite it.
                if min_F is None or temp_F < min_F:
                    min_F = temp_F
                    first_l_micro_bins = discretizations[l_][k_ - 1]
                    last_micro_bins = micro_bins
            F[c_, k_] = min_F
            # Shallow copy so the stored (l_, k_ - 1) discretization is not
            # extended in place.
            disc = first_l_micro_bins.copy()
            disc.append(last_micro_bins)
            discretizations[c_].append(disc)

    return F, discretizations