# merging.py

import math
import pandas as pd
import numpy as np
from scipy.special import comb

QUASI_UNIFORM_CODE_INITIAL_NUMBER = 2.865064

def quasi_uniform_code(n):
    """Return the iterated-logarithm code length of ``n`` in bits.

    Starting from ``n``, the base-2 logarithm is applied repeatedly for as
    long as the current value is greater than 1, and every logarithm obtained
    is added to the running total.  ``log2(QUASI_UNIFORM_CODE_INITIAL_NUMBER)``
    is then added as a constant offset, so any ``n <= 1`` yields just that
    offset.
    """
    bits = 0
    current = n
    while current > 1:
        current = math.log(current, 2)
        bits += current
    return bits + math.log(QUASI_UNIFORM_CODE_INITIAL_NUMBER, 2)


def break_points_number(macro_bin, IDs, ID_threshold):
    """Count entries of ``macro_bin`` (last one excluded) whose ID exceeds the threshold.

    ``IDs`` is indexed once with the list ``macro_bin[:-1]``, so it has to
    support list-based indexing (a numpy array, for instance).
    """
    candidates = IDs[macro_bin[:-1]]
    return sum(1 for value in candidates if value > ID_threshold)

def compute_bin_cost(c, l, k, macro_bin, IDs, ID_threshold):
    """Return the cost contribution of using ``macro_bin`` as the k-th macro bin.

    The value is the sum of four terms computed for the current state
    (``c`` micro bins, ``k`` macro bins, this macro bin) and two negative
    "previous" terms computed for the state before this macro bin was added
    (``l`` micro bins, ``k - 1`` macro bins).

    Args:
        c: Number of micro bins covered once ``macro_bin`` is included; must
            equal ``l + len(macro_bin)``.
        l: Number of micro bins preceding ``macro_bin`` (0 for the first
            macro bin).
        k: Number of macro bins including this one (1-based).
        macro_bin: Sequence of micro-bin indices forming this macro bin.
        IDs: Per-micro-bin values, forwarded to ``break_points_number``.
        ID_threshold: Threshold forwarded to ``break_points_number``.

    Returns:
        The summed cost as a float.

    Raises:
        ValueError: If ``len(macro_bin) != c - l``.
    """
    macro_bin_size = len(macro_bin)
    if macro_bin_size != c - l:
        # Format the numbers explicitly; concatenating ints with a str would
        # raise TypeError and hide the intended ValueError.
        raise ValueError(
            "macro bin size {} != c - l = {} (c={}, l={})".format(
                macro_bin_size, c - l, c, l
            )
        )

    macro_bin_size_code = quasi_uniform_code(macro_bin_size)
    break_points_size = break_points_number(macro_bin, IDs, ID_threshold)

    # todo in the original ipd L_disc L_N is computed for (k-1)
    # L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
    L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)
    # todo in the original ipd L_disc L_N is computed for (k-1)
    # L_disc_prev = - (quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
    # Same expression as L_disc evaluated for (l, k - 1), negated; it is 0
    # when there is no previous macro bin.
    if k > 1:
        L_disc_prev = -(quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2))
    else:
        L_disc_prev = 0

    L_disc_M_ind = macro_bin_size_code - math.log(macro_bin_size / c, 2) * (macro_bin_size + 1)
    # Only defined when some micro bins precede this macro bin (log of l / c).
    L_disc_M_ind_prev = -(math.log(l / c, 2) * (k - 1 + l)) if l > 0 else 0

    # A break point implies at least two micro bins, so the log argument
    # (macro_bin_size - 1) is >= 1 whenever this branch is taken.
    if break_points_size > 0:
        L_disc_M_mh = (
            quasi_uniform_code(break_points_size)
            + math.log(macro_bin_size - 1, 2) * break_points_size
        )
    else:
        L_disc_M_mh = 0

    L_errors = math.log(macro_bin_size, 2) * macro_bin_size

    return L_disc + L_disc_M_ind + L_disc_M_mh + L_errors + L_disc_prev + L_disc_M_ind_prev


def dynamic_merging(ID_threshold, IDs, initBinsCount):
    """Fill the dynamic-programming cost table for merging micro bins into macro bins.

    ``F[c - 1, k - 1]`` holds the minimal accumulated cost found for splitting
    the first ``c`` micro bins into ``k`` contiguous macro bins, and
    ``discretizations[c - 1][k - 1]`` holds the matching split as a list of
    macro bins, each a list of micro-bin indices.  Entries of ``F`` with
    ``k > c`` are never written and stay 0.

    Args:
        ID_threshold: Threshold forwarded to ``compute_bin_cost``.
        IDs: Per-micro-bin values forwarded to ``compute_bin_cost``.
        initBinsCount: Number of initial (micro) bins.

    Returns:
        Tuple ``(F, discretizations)`` as described above.
    """
    F = np.zeros([initBinsCount, initBinsCount])
    discretizations = []

    # Base case (k = 1): the first c micro bins are merged into one macro bin.
    for c_ in range(initBinsCount):
        c = c_ + 1
        micro_bins = list(range(c))
        F[c_, 0] = compute_bin_cost(c, 0, 1, micro_bins, IDs, ID_threshold)
        discretizations.append([[micro_bins]])

    for k_ in range(1, initBinsCount):
        k = k_ + 1
        for c_ in range(k_, initBinsCount):
            c = c_ + 1
            min_F = None
            first_l_micro_bins = None
            last_micro_bins = None
            # search for the best # of microbins in the first (k - 1) macrobins: l
            for l_ in range(k_ - 1, c_):
                l = l_ + 1
                micro_bins = list(range(l, c))
                temp_F = F[l_, k_ - 1] + compute_bin_cost(c, l, k, micro_bins, IDs, ID_threshold)
                # Test against None explicitly: a current minimum of exactly 0
                # is falsy and would otherwise be replaced by a larger cost.
                if min_F is None or temp_F < min_F:
                    min_F = temp_F
                    first_l_micro_bins = discretizations[l_][k_ - 1]
                    last_micro_bins = micro_bins
            F[c_, k_] = min_F
            # Build a new list so the stored (k - 1)-bin split is not mutated.
            discretizations[c_].append(first_l_micro_bins + [last_micro_bins])
    return F, discretizations
	import math
	import pandas as pd
	import numpy as np
	from scipy.special import comb

	QUASI_UNIFORM_CODE_INITIAL_NUMBER = 2.865064

	def quasi_uniform_code(n):
	    """Return the iterated-logarithm code length of ``n`` in bits.

	    Starting from ``n``, the base-2 logarithm is applied repeatedly for as
	    long as the current value is greater than 1, and every logarithm
	    obtained is added to the running total.
	    ``log2(QUASI_UNIFORM_CODE_INITIAL_NUMBER)`` is then added as a constant
	    offset, so any ``n <= 1`` yields just that offset.
	    """
	    bits = 0
	    current = n
	    while current > 1:
	        current = math.log(current, 2)
	        bits += current
	    return bits + math.log(QUASI_UNIFORM_CODE_INITIAL_NUMBER, 2)


	def break_points_number(macro_bin, IDs, ID_threshold):
	    """Count entries of ``macro_bin`` (last one excluded) whose ID exceeds the threshold.

	    ``IDs`` is indexed once with the list ``macro_bin[:-1]``, so it has to
	    support list-based indexing (a numpy array, for instance).
	    """
	    candidates = IDs[macro_bin[:-1]]
	    return sum(1 for value in candidates if value > ID_threshold)

	def compute_bin_cost(c, l, k, macro_bin, IDs, ID_threshold):
	    """Return the cost contribution of using ``macro_bin`` as the k-th macro bin.

	    The value is the sum of four terms computed for the current state
	    (``c`` micro bins, ``k`` macro bins, this macro bin) and two negative
	    "previous" terms computed for the state before this macro bin was added
	    (``l`` micro bins, ``k - 1`` macro bins).

	    Args:
	        c: Number of micro bins covered once ``macro_bin`` is included;
	            must equal ``l + len(macro_bin)``.
	        l: Number of micro bins preceding ``macro_bin`` (0 for the first
	            macro bin).
	        k: Number of macro bins including this one (1-based).
	        macro_bin: Sequence of micro-bin indices forming this macro bin.
	        IDs: Per-micro-bin values, forwarded to ``break_points_number``.
	        ID_threshold: Threshold forwarded to ``break_points_number``.

	    Returns:
	        The summed cost as a float.

	    Raises:
	        ValueError: If ``len(macro_bin) != c - l``.
	    """
	    macro_bin_size = len(macro_bin)
	    if macro_bin_size != c - l:
	        # Format the numbers explicitly; concatenating ints with a str would
	        # raise TypeError and hide the intended ValueError.
	        raise ValueError(
	            "macro bin size {} != c - l = {} (c={}, l={})".format(
	                macro_bin_size, c - l, c, l
	            )
	        )

	    macro_bin_size_code = quasi_uniform_code(macro_bin_size)
	    break_points_size = break_points_number(macro_bin, IDs, ID_threshold)

	    # todo in the original ipd L_disc L_N is computed for (k-1)
	    # L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
	    L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)
	    # todo in the original ipd L_disc L_N is computed for (k-1)
	    # L_disc_prev = - (quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
	    # Same expression as L_disc evaluated for (l, k - 1), negated; it is 0
	    # when there is no previous macro bin.
	    if k > 1:
	        L_disc_prev = -(quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2))
	    else:
	        L_disc_prev = 0

	    L_disc_M_ind = macro_bin_size_code - math.log(macro_bin_size / c, 2) * (macro_bin_size + 1)
	    # Only defined when some micro bins precede this macro bin (log of l / c).
	    L_disc_M_ind_prev = -(math.log(l / c, 2) * (k - 1 + l)) if l > 0 else 0

	    # A break point implies at least two micro bins, so the log argument
	    # (macro_bin_size - 1) is >= 1 whenever this branch is taken.
	    if break_points_size > 0:
	        L_disc_M_mh = (
	            quasi_uniform_code(break_points_size)
	            + math.log(macro_bin_size - 1, 2) * break_points_size
	        )
	    else:
	        L_disc_M_mh = 0

	    L_errors = math.log(macro_bin_size, 2) * macro_bin_size

	    return L_disc + L_disc_M_ind + L_disc_M_mh + L_errors + L_disc_prev + L_disc_M_ind_prev


	def dynamic_merging(ID_threshold, IDs, initBinsCount):
	    """Fill the dynamic-programming cost table for merging micro bins into macro bins.

	    ``F[c - 1, k - 1]`` holds the minimal accumulated cost found for
	    splitting the first ``c`` micro bins into ``k`` contiguous macro bins,
	    and ``discretizations[c - 1][k - 1]`` holds the matching split as a list
	    of macro bins, each a list of micro-bin indices.  Entries of ``F`` with
	    ``k > c`` are never written and stay 0.

	    Args:
	        ID_threshold: Threshold forwarded to ``compute_bin_cost``.
	        IDs: Per-micro-bin values forwarded to ``compute_bin_cost``.
	        initBinsCount: Number of initial (micro) bins.

	    Returns:
	        Tuple ``(F, discretizations)`` as described above.
	    """
	    F = np.zeros([initBinsCount, initBinsCount])
	    discretizations = []

	    # Base case (k = 1): the first c micro bins are merged into one macro bin.
	    for c_ in range(initBinsCount):
	        c = c_ + 1
	        micro_bins = list(range(c))
	        F[c_, 0] = compute_bin_cost(c, 0, 1, micro_bins, IDs, ID_threshold)
	        discretizations.append([[micro_bins]])

	    for k_ in range(1, initBinsCount):
	        k = k_ + 1
	        for c_ in range(k_, initBinsCount):
	            c = c_ + 1
	            min_F = None
	            first_l_micro_bins = None
	            last_micro_bins = None
	            # search for the best # of microbins in the first (k - 1) macrobins: l
	            for l_ in range(k_ - 1, c_):
	                l = l_ + 1
	                micro_bins = list(range(l, c))
	                temp_F = F[l_, k_ - 1] + compute_bin_cost(c, l, k, micro_bins, IDs, ID_threshold)
	                # Test against None explicitly: a current minimum of exactly
	                # 0 is falsy and would otherwise be replaced by a larger cost.
	                if min_F is None or temp_F < min_F:
	                    min_F = temp_F
	                    first_l_micro_bins = discretizations[l_][k_ - 1]
	                    last_micro_bins = micro_bins
	            F[c_, k_] = min_F
	            # Build a new list so the stored (k - 1)-bin split is not mutated.
	            discretizations[c_].append(first_l_micro_bins + [last_micro_bins])
	    return F, discretizations