Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ipd_extended/merging.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
78 lines (65 sloc)
3.04 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import pandas as pd | |
import numpy as np | |
from scipy.special import comb | |
# Constant whose base-2 logarithm is added to every code length below.
QUASI_UNIFORM_CODE_INITIAL_NUMBER = 2.865064


def quasi_uniform_code(n):
    """Return the iterated-logarithm code length of ``n`` (base-2 logs).

    Starting from ``n``, the value is repeatedly replaced by its base-2
    logarithm for as long as it is greater than 1, and every intermediate
    logarithm is added up.  ``log2(QUASI_UNIFORM_CODE_INITIAL_NUMBER)`` is
    then added to the sum.  For ``n <= 1`` the loop never runs, so only that
    constant term is returned.
    """
    code_length = 0
    remaining = n
    while remaining > 1:
        remaining = math.log(remaining, 2)
        code_length = code_length + remaining
    constant_bits = math.log(QUASI_UNIFORM_CODE_INITIAL_NUMBER, 2)
    return code_length + constant_bits
def break_points_number(macro_bin, IDs, ID_threshold):
    """Count the entries of ``IDs`` inside ``macro_bin`` that exceed the threshold.

    ``IDs`` is indexed with every element of ``macro_bin`` except the last one
    (so it must support list-based indexing, e.g. a NumPy array), and each
    selected value is compared against ``ID_threshold`` with a strict ``>``.

    Returns the number of selected values that are greater than the threshold.
    """
    inner_positions = macro_bin[:-1]
    exceeding = 0
    for value in IDs[inner_positions]:
        if value > ID_threshold:
            exceeding += 1
    return exceeding
def compute_bin_cost(c, l, k, macro_bin, IDs, ID_threshold):
    """Return the cost increment of closing ``macro_bin`` as the k-th macro bin.

    Parameters
    ----------
    c : int
        Number of micro bins covered so far, including ``macro_bin``.
    l : int
        Number of micro bins covered by the preceding ``k - 1`` macro bins
        (``0`` when ``macro_bin`` is the first macro bin).
    k : int
        Number of macro bins, counting ``macro_bin``.
    macro_bin : sequence of int
        Micro bin indices forming the macro bin; its length must be ``c - l``.
    IDs, ID_threshold
        Forwarded to ``break_points_number`` to count the break points
        inside the macro bin.

    Returns
    -------
    float
        Sum of the terms for the current state (``L_disc``, ``L_disc_M_ind``,
        ``L_disc_M_mh``, ``L_errors``) plus the negated ``*_prev`` terms
        computed for the state with ``l`` micro bins.  All logarithms are
        base 2.

    Raises
    ------
    ValueError
        If ``len(macro_bin) != c - l``.
    """
    macro_bin_size = len(macro_bin)
    if macro_bin_size != c - l:
        # c and l are integers: format them explicitly.  Concatenating them
        # with a str raised TypeError instead of the intended ValueError.
        raise ValueError(
            "macro bin size {} != c - l = {} - {}".format(macro_bin_size, c, l))

    macro_bin_size_code = quasi_uniform_code(macro_bin_size)
    break_points_size = break_points_number(macro_bin, IDs, ID_threshold)

    # todo in the original ipd L_disc L_N is computed for (k-1)
    # (the alternative would be quasi_uniform_code(k) here and
    # quasi_uniform_code(k - 1) in L_disc_prev)
    L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)
    # Negated counterpart of L_disc for the first (k - 1) macro bins;
    # zero when there is no previous macro bin.
    L_disc_prev = - (quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)

    L_disc_M_ind = macro_bin_size_code - math.log(macro_bin_size / c, 2) * (macro_bin_size + 1)
    # Negated term for the first l micro bins; skipped for l == 0 because
    # log(0) is undefined.
    L_disc_M_ind_prev = - (math.log(l / c, 2) * (k - 1 + l) if l > 0 else 0)

    # Only charged when the macro bin contains at least one break point.
    L_disc_M_mh = quasi_uniform_code(break_points_size) + math.log(macro_bin_size - 1, 2) * break_points_size \
        if break_points_size > 0 else 0

    L_errors = math.log(macro_bin_size, 2) * macro_bin_size

    return L_disc + L_disc_M_ind + L_disc_M_mh + L_errors + L_disc_prev + L_disc_M_ind_prev
def dynamic_merging(ID_threshold, IDs, initBinsCount):
    """Fill the dynamic-programming table for merging micro bins into macro bins.

    Parameters
    ----------
    ID_threshold, IDs
        Forwarded unchanged to ``compute_bin_cost``.
    initBinsCount : int
        Number of initial (micro) bins.

    Returns
    -------
    F : numpy.ndarray, shape (initBinsCount, initBinsCount)
        ``F[c_, k_]`` is the minimal accumulated cost of covering the first
        ``c_ + 1`` micro bins with ``k_ + 1`` macro bins.  Cells with
        ``c_ < k_`` are never written and stay 0.
    discretizations : list
        ``discretizations[c_][k_]`` is the matching discretization: a list of
        ``k_ + 1`` macro bins, each a list of consecutive micro bin indices.
    """
    F = np.zeros([initBinsCount, initBinsCount])
    discretizations = []

    # Base case: the first c micro bins are merged into a single macro bin.
    for c_ in range(initBinsCount):
        c = c_ + 1
        micro_bins = list(range(c))
        F[c_, 0] = compute_bin_cost(c, 0, 1, micro_bins, IDs, ID_threshold)
        discretizations.append([[micro_bins]])

    for k_ in range(1, initBinsCount):
        k = k_ + 1
        for c_ in range(k_, initBinsCount):
            c = c_ + 1
            min_F = None
            first_l_micro_bins = None
            last_micro_bins = None
            # Search for the best number of micro bins (prefix_size) placed in
            # the first (k - 1) macro bins; the rest form the last macro bin.
            for l_ in range(k_ - 1, c_):
                prefix_size = l_ + 1
                micro_bins = list(range(prefix_size, c))
                temp_F = F[l_, k_ - 1] + compute_bin_cost(
                    c, prefix_size, k, micro_bins, IDs, ID_threshold)
                # Compare against None explicitly: a running minimum of
                # exactly 0.0 is falsy and would otherwise be overwritten by
                # a larger candidate.
                if min_F is None or temp_F < min_F:
                    min_F = temp_F
                    first_l_micro_bins = discretizations[l_][k_ - 1]
                    last_micro_bins = micro_bins
            F[c_, k_] = min_F
            # Shallow copy so the stored prefix discretization is not extended.
            disc = first_l_micro_bins.copy()
            disc.append(last_micro_bins)
            discretizations[c_].append(disc)
    return F, discretizations