# merging.py

import math
import pandas as pd
import numpy as np
from scipy.special import comb

# Constant whose base-2 logarithm quasi_uniform_code adds to every code length.
# NOTE(review): presumably the normalising constant c0 ~ 2.865064 of Rissanen's
# universal code for the integers -- confirm against the reference paper.
QUASI_UNIFORM_CODE_INITIAL_NUMBER = 2.865064


def quasi_uniform_code(n):
    '''
    Return the iterated base-2 logarithm code length of ``n``.

    The base-2 logarithm is applied repeatedly for as long as the current
    value exceeds 1 and every logarithm obtained is accumulated.  Finally
    ``log2(QUASI_UNIFORM_CODE_INITIAL_NUMBER)`` is added, so any ``n <= 1``
    yields just that constant term.

    :param n: number to encode
    :return: accumulated logarithms plus the constant term
    '''
    code_length = 0
    current = n
    while current > 1:
        current = math.log(current, 2)
        code_length += current
    return code_length + math.log(QUASI_UNIFORM_CODE_INITIAL_NUMBER, 2)


def break_points_number(macro_bin, IDs, ID_threshold):
    '''
    Count the break points of ``macro_bin`` whose ID is strictly greater than
    ``ID_threshold``.

    The last entry of ``macro_bin`` is left out; only the ``IDs`` found at the
    remaining positions are examined.

    :param macro_bin: list of micro-bin indices forming the macro bin
    :param IDs: ID values, indexed here with a list of positions
        (assumes a container supporting that, e.g. a numpy array -- TODO confirm)
    :param ID_threshold: value an ID has to exceed in order to be counted
    :return: number of examined IDs that exceed the threshold
    '''
    # NOTE(review): the comparison is strict, i.e. the condition of the original
    # IPD; the previous docstring claimed "greater or equal" -- confirm which
    # one is intended.
    inner_positions = macro_bin[:-1]
    return sum(1 for value in IDs[inner_positions] if value > ID_threshold)


def compute_bin_cost(c, l, k, macro_bin, IDs, ID_threshold):
    '''
    Return the cost increment of using ``macro_bin`` as the k-th macro bin of a
    discretization of the first ``c`` micro bins, when the first ``l`` micro
    bins are already covered by the preceding ``k - 1`` macro bins.

    The value is a sum of base-2 code lengths: terms computed for the
    configuration ``(c, k)`` and for ``macro_bin`` itself, plus the negated
    terms of the prefix configuration ``(l, k - 1)``.
    NOTE(review): the negated terms presumably cancel what the caller's
    accumulated cost of the prefix already contains -- confirm.

    :param c: number of micro bins covered once ``macro_bin`` is appended
    :param l: number of micro bins covered by the preceding macro bins
    :param k: number of macro bins including ``macro_bin``
    :param macro_bin: list of micro-bin indices; its length must equal ``c - l``
    :param IDs: ID values, passed on to ``break_points_number``
    :param ID_threshold: threshold passed on to ``break_points_number``
    :return: the cost as a float
    :raises ValueError: if ``len(macro_bin) != c - l``
    '''
    macro_bin_size = len(macro_bin)
    if macro_bin_size != c - l:
        # The message is built with format(): concatenating the integers with a
        # string raised TypeError instead of the intended ValueError.
        raise ValueError("macro bin size {} != c - l = {} - {}".format(macro_bin_size, c, l))

    macro_bin_size_code = quasi_uniform_code(macro_bin_size)
    break_points_size = break_points_number(macro_bin, IDs, ID_threshold)

    # todo old in the original ipd L_disc L_N is computed for (k-1)
    # L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
    L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)
    # todo old in the original ipd L_disc L_N is computed for (k-1)
    # L_disc_prev = - (quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
    # Negated counterpart of L_disc for the prefix (l, k - 1); absent for k == 1.
    if k > 1:
        L_disc_prev = -(quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2))
    else:
        L_disc_prev = 0

    L_disc_M_ind = macro_bin_size_code - math.log(macro_bin_size / c, 2) * (macro_bin_size + 1)
    # Negated prefix term; skipped for l == 0, where log(l / c) is undefined.
    if l > 0:
        L_disc_M_ind_prev = -(math.log(l / c, 2) * (k - 1 + l))
    else:
        L_disc_M_ind_prev = 0

    # Only charged when at least one break point exceeds the threshold. That
    # also keeps log(macro_bin_size - 1) away from a single-element macro bin,
    # which has no break points at all.
    if break_points_size > 0:
        L_disc_M_mh = (quasi_uniform_code(break_points_size)
                       + math.log(macro_bin_size - 1, 2) * break_points_size)
    else:
        L_disc_M_mh = 0

    L_errors = math.log(macro_bin_size, 2) * macro_bin_size

    return L_disc + L_disc_M_ind + L_disc_M_mh + L_errors + L_disc_prev + L_disc_M_ind_prev


def dynamic_merging(ID_threshold, IDs, init_bins_count):
    '''
    Build the dynamic-programming table of merging costs for every prefix of
    the micro bins and every number of macro bins.

    ``F[c - 1, k - 1]`` receives the smallest accumulated cost found for
    grouping the first ``c`` micro bins into ``k`` macro bins (``k <= c``);
    entries with ``k > c`` are never written and keep their initial 0.
    ``discretizations[c - 1][k - 1]`` holds the matching grouping as a list of
    ``k`` macro bins, each one a list of consecutive micro-bin indices.

    :param ID_threshold: threshold passed on to ``compute_bin_cost``
    :param IDs: ID values passed on to ``compute_bin_cost``
    :param init_bins_count: number of micro bins
    :return: tuple ``(F, discretizations)``
    '''
    F = np.zeros([init_bins_count, init_bins_count])
    discretizations = []
    # compute when we merge first c initial dist_bins into 1 and #macro dist_bins k = 1
    k_ = 0
    k = k_ + 1
    for c_ in range(init_bins_count):
        c = c_ + 1
        micro_bins = list(range(c))
        F[c_, k_] = compute_bin_cost(c, 0, k, micro_bins, IDs, ID_threshold)
        discretizations.append([[micro_bins]])
    for k_ in range(1, init_bins_count):
        k = k_ + 1
        for c_ in range(k_, init_bins_count):
            c = c_ + 1
            min_F = None
            first_l_micro_bins = None
            last_micro_bins = None
            # search for the best # of microbins in the first (k - 1) macrobins: l
            for l_ in range(k_ - 1, c_):
                l = l_ + 1
                micro_bins = list(range(l, c))
                temp_F = F[l_, k_ - 1] + compute_bin_cost(c, l, k, micro_bins, IDs, ID_threshold)
                # Compare against None explicitly: a truthiness test would treat
                # a best cost of exactly 0.0 as "not set yet" and let a worse
                # candidate overwrite it.
                if min_F is None or temp_F < min_F:
                    min_F = temp_F
                    first_l_micro_bins = discretizations[l_][k_ - 1]
                    last_micro_bins = micro_bins
            F[c_, k_] = min_F
            # Copy the outer list so the stored (k - 1)-bin grouping is not
            # extended in place; the macro-bin lists themselves are shared.
            disc = first_l_micro_bins.copy()
            disc.append(last_micro_bins)
            discretizations[c_].append(disc)
    return F, discretizations
	import math
	import pandas as pd
	import numpy as np
	from scipy.special import comb

	# Constant whose base-2 logarithm quasi_uniform_code adds to every code length.
	# NOTE(review): presumably the normalising constant c0 ~ 2.865064 of Rissanen's
	# universal code for the integers -- confirm against the reference paper.
	QUASI_UNIFORM_CODE_INITIAL_NUMBER = 2.865064


	def quasi_uniform_code(n):
	    '''
	    Return the iterated base-2 logarithm code length of ``n``.

	    The base-2 logarithm is applied repeatedly for as long as the current
	    value exceeds 1 and every logarithm obtained is accumulated.  Finally
	    ``log2(QUASI_UNIFORM_CODE_INITIAL_NUMBER)`` is added, so any ``n <= 1``
	    yields just that constant term.

	    :param n: number to encode
	    :return: accumulated logarithms plus the constant term
	    '''
	    # Body indentation restored: the loop and its two statements had been
	    # flattened to a single level, which does not parse.
	    l = 0
	    while n > 1:
	        n = math.log(n, 2)
	        l += n
	    return l + math.log(QUASI_UNIFORM_CODE_INITIAL_NUMBER, 2)


	def break_points_number(macro_bin, IDs, ID_threshold):
	    '''
	    Count the break points of ``macro_bin`` whose ID is strictly greater than
	    ``ID_threshold``.

	    The last entry of ``macro_bin`` is left out; only the ``IDs`` found at the
	    remaining positions are examined.

	    :param macro_bin: list of micro-bin indices forming the macro bin
	    :param IDs: ID values, indexed here with a list of positions
	        (assumes a container supporting that, e.g. a numpy array -- TODO confirm)
	    :param ID_threshold: value an ID has to exceed in order to be counted
	    :return: number of examined IDs that exceed the threshold
	    '''
	    # NOTE(review): the comparison is strict, i.e. the condition of the original
	    # IPD; the previous docstring claimed "greater or equal" -- confirm which
	    # one is intended.
	    return sum(1 for ID in IDs[macro_bin[:-1]] if ID > ID_threshold)


	def compute_bin_cost(c, l, k, macro_bin, IDs, ID_threshold):
	    '''
	    Return the cost increment of using ``macro_bin`` as the k-th macro bin of a
	    discretization of the first ``c`` micro bins, when the first ``l`` micro
	    bins are already covered by the preceding ``k - 1`` macro bins.

	    The value is a sum of base-2 code lengths: terms computed for the
	    configuration ``(c, k)`` and for ``macro_bin`` itself, plus the negated
	    terms of the prefix configuration ``(l, k - 1)``.
	    NOTE(review): the negated terms presumably cancel what the caller's
	    accumulated cost of the prefix already contains -- confirm.

	    :param c: number of micro bins covered once ``macro_bin`` is appended
	    :param l: number of micro bins covered by the preceding macro bins
	    :param k: number of macro bins including ``macro_bin``
	    :param macro_bin: list of micro-bin indices; its length must equal ``c - l``
	    :param IDs: ID values, passed on to ``break_points_number``
	    :param ID_threshold: threshold passed on to ``break_points_number``
	    :return: the cost as a float
	    :raises ValueError: if ``len(macro_bin) != c - l``
	    '''
	    macro_bin_size = len(macro_bin)
	    if macro_bin_size != c - l:
	        # The message is built with format(): concatenating the integers with a
	        # string raised TypeError instead of the intended ValueError.
	        raise ValueError("macro bin size {} != c - l = {} - {}".format(macro_bin_size, c, l))

	    macro_bin_size_code = quasi_uniform_code(macro_bin_size)
	    break_points_size = break_points_number(macro_bin, IDs, ID_threshold)

	    # todo old in the original ipd L_disc L_N is computed for (k-1)
	    # L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
	    L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)
	    # todo old in the original ipd L_disc L_N is computed for (k-1)
	    # L_disc_prev = - (quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
	    # Negated counterpart of L_disc for the prefix (l, k - 1); absent for k == 1.
	    if k > 1:
	        L_disc_prev = -(quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2))
	    else:
	        L_disc_prev = 0

	    L_disc_M_ind = macro_bin_size_code - math.log(macro_bin_size / c, 2) * (macro_bin_size + 1)
	    # Negated prefix term; skipped for l == 0, where log(l / c) is undefined.
	    if l > 0:
	        L_disc_M_ind_prev = -(math.log(l / c, 2) * (k - 1 + l))
	    else:
	        L_disc_M_ind_prev = 0

	    # Only charged when at least one break point exceeds the threshold. That
	    # also keeps log(macro_bin_size - 1) away from a single-element macro bin,
	    # which has no break points at all.
	    if break_points_size > 0:
	        L_disc_M_mh = (quasi_uniform_code(break_points_size)
	                       + math.log(macro_bin_size - 1, 2) * break_points_size)
	    else:
	        L_disc_M_mh = 0

	    L_errors = math.log(macro_bin_size, 2) * macro_bin_size

	    return L_disc + L_disc_M_ind + L_disc_M_mh + L_errors + L_disc_prev + L_disc_M_ind_prev


	def dynamic_merging(ID_threshold, IDs, init_bins_count):
	    '''
	    Build the dynamic-programming table of merging costs for every prefix of
	    the micro bins and every number of macro bins.

	    ``F[c - 1, k - 1]`` receives the smallest accumulated cost found for
	    grouping the first ``c`` micro bins into ``k`` macro bins (``k <= c``);
	    entries with ``k > c`` are never written and keep their initial 0.
	    ``discretizations[c - 1][k - 1]`` holds the matching grouping as a list of
	    ``k`` macro bins, each one a list of consecutive micro-bin indices.

	    :param ID_threshold: threshold passed on to ``compute_bin_cost``
	    :param IDs: ID values passed on to ``compute_bin_cost``
	    :param init_bins_count: number of micro bins
	    :return: tuple ``(F, discretizations)``
	    '''
	    F = np.zeros([init_bins_count, init_bins_count])
	    discretizations = []
	    # compute when we merge first c initial dist_bins into 1 and #macro dist_bins k = 1
	    k_ = 0
	    k = k_ + 1
	    for c_ in range(init_bins_count):
	        c = c_ + 1
	        micro_bins = list(range(c))
	        F[c_, k_] = compute_bin_cost(c, 0, k, micro_bins, IDs, ID_threshold)
	        discretizations.append([[micro_bins]])
	    for k_ in range(1, init_bins_count):
	        k = k_ + 1
	        for c_ in range(k_, init_bins_count):
	            c = c_ + 1
	            min_F = None
	            first_l_micro_bins = None
	            last_micro_bins = None
	            # search for the best # of microbins in the first (k - 1) macrobins: l
	            for l_ in range(k_ - 1, c_):
	                l = l_ + 1
	                micro_bins = list(range(l, c))
	                temp_F = F[l_, k_ - 1] + compute_bin_cost(c, l, k, micro_bins, IDs, ID_threshold)
	                # Compare against None explicitly: a truthiness test would treat
	                # a best cost of exactly 0.0 as "not set yet" and let a worse
	                # candidate overwrite it.
	                if min_F is None or temp_F < min_F:
	                    min_F = temp_F
	                    first_l_micro_bins = discretizations[l_][k_ - 1]
	                    last_micro_bins = micro_bins
	            F[c_, k_] = min_F
	            # Copy the outer list so the stored (k - 1)-bin grouping is not
	            # extended in place; the macro-bin lists themselves are shared.
	            disc = first_l_micro_bins.copy()
	            disc.append(last_micro_bins)
	            discretizations[c_].append(disc)
	    return F, discretizations