# ipd_extended/merging.py
import math
from enum import Enum

import numpy as np
from scipy.special import comb

QUASI_UNIFORM_CODE_INITIAL_NUMBER = 2.865064  # normalizing constant c0 of Rissanen's universal code for the integers


def quasi_uniform_code(n):
    '''
    Code length (in bits) of the quasi-uniform / universal integer code for n > 0:
    the sum of the iterated base-2 logarithms of n, plus log2(c0).
    '''
    l = 0
    while n > 1:
        n = math.log(n, 2)
        l += n
    return l + math.log(QUASI_UNIFORM_CODE_INITIAL_NUMBER, 2)
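
# Worked example (illustrative): quasi_uniform_code(8) accumulates
# log2(8) = 3, log2(3) ~ 1.585, log2(1.585) ~ 0.664, then stops once n <= 1,
# giving 3 + 1.585 + 0.664 + log2(2.865064) ~ 6.77 bits.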


class BreakPointsStrategy(Enum):
    DEFAULT = 1
    MULTIPLE_ONE_PLACE = 2
    EXPONENTIAL_ONE_PLACE = 3

def break_points_number(macro_bin, IDs, ID_threshold, break_points_strategy):
    '''
    Counts the break points of a macro-bin, i.e. the positions whose ID is GREATER than ID_threshold.
    :param macro_bin: list of micro-bin indices forming the macro-bin (the last index carries no break point)
    :param IDs: array of ID values, one per micro-bin boundary
    :param ID_threshold: threshold above which an ID counts as a break point
    :param break_points_strategy: how to weight IDs relative to the threshold
    :return: the (weighted) number of break points
    '''
    if break_points_strategy is BreakPointsStrategy.DEFAULT:
        # one break point per ID strictly above the threshold
        return sum(1 for ID in IDs[macro_bin[:-1]] if ID > ID_threshold)
    elif break_points_strategy is BreakPointsStrategy.MULTIPLE_ONE_PLACE:
        # assigns a number of break points proportional to the height of the ID
        return sum(int(ID / ID_threshold) for ID in IDs[macro_bin[:-1]]) if ID_threshold > 0 else 0
    elif break_points_strategy is BreakPointsStrategy.EXPONENTIAL_ONE_PLACE:
        # assigns a number of break points growing exponentially with the height of the ID
        return sum(int(pow(2, ID / ID_threshold - 1)) for ID in IDs[macro_bin[:-1]]) if ID_threshold > 0 else 0
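
# Worked example (illustrative values): with IDs = np.array([0.2, 0.9, 0.5, 1.4]),
# macro_bin = [0, 1, 2, 3] and ID_threshold = 0.4, only IDs[[0, 1, 2]] are inspected:
#   DEFAULT               -> 2   (0.9 and 0.5 exceed 0.4)
#   MULTIPLE_ONE_PLACE    -> 3   (int(0.2/0.4) + int(0.9/0.4) + int(0.5/0.4) = 0 + 2 + 1)
#   EXPONENTIAL_ONE_PLACE -> 3   (int(2**-0.5) + int(2**1.25) + int(2**0.25) = 0 + 2 + 1)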


def break_points_number_footprints(macro_bin, min_footprint, footprint_diffs=None):
    '''
    Counts the break points of a macro-bin from the footprints of its micro-bins:
    micro-bins whose footprint differs from the majority footprint count as break
    points, optionally down-weighted by their distance to the majority footprint.
    '''
    total = len(macro_bin[:-1])
    if total == 0:
        return 0
    uniq, counts = np.unique(min_footprint[macro_bin[:-1]], return_counts=True)
    if footprint_diffs is None:
        # every micro-bin outside the majority footprint is a full break point
        return total - max(counts)
    majority_f_id = np.argmax(counts)
    majority_footprint = uniq[majority_f_id]
    max_diff = max(footprint_diffs[majority_footprint].values())
    # down-weight minority footprints by their similarity to the majority footprint
    return total - counts[majority_f_id] \
        - sum(counts[f_id] * (1 - footprint_diffs[majority_footprint][footprint] / max_diff)
              for f_id, footprint in enumerate(uniq) if footprint != majority_footprint)
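
# Worked example (illustrative): with min_footprint = np.array([7, 3, 7, 9]) and
# macro_bin = [0, 1, 2, 3], only min_footprint[[0, 1, 2]] = [7, 3, 7] is inspected;
# footprint 7 is the majority (count 2), so with footprint_diffs=None the
# function returns 3 - 2 = 1.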


def compute_bin_cost(c, l, k, macro_bin, IDs, ID_threshold, break_points_strategy):
    '''
    MDL cost of making micro-bins l..c-1 the k-th macro-bin, given that the
    first l micro-bins are already merged into k - 1 macro-bins.
    '''
    macro_bin_size = len(macro_bin)
    if macro_bin_size != c - l:
        raise ValueError(f"macro_bin size {macro_bin_size} != c - l = {c - l}")
    macro_bin_size_code = quasi_uniform_code(macro_bin_size)
    break_points_size = break_points_number(macro_bin, IDs, ID_threshold, break_points_strategy)
    # TODO: in the original IPD, the L_N term of L_disc is computed for (k - 1); see the commented alternatives
    L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
    # L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)
    L_disc_prev = -((quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2)) if k > 1 else 0)
    # L_disc_prev = -((quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2)) if k > 1 else 0)
    L_disc_M_ind = macro_bin_size_code - math.log(macro_bin_size / c, 2) * (macro_bin_size + 1)
    L_disc_M_ind_prev = -(math.log(l / c, 2) * (k - 1 + l) if l > 0 else 0)
    L_disc_M_mh = (quasi_uniform_code(break_points_size) + math.log(macro_bin_size - 1, 2) * break_points_size) \
        if break_points_size > 0 else 0
    L_errors = math.log(macro_bin_size, 2) * macro_bin_size
    return L_disc + L_disc_M_ind + L_disc_M_mh + L_errors + L_disc_prev + L_disc_M_ind_prev


def compute_bin_cost_footprints(c, l, k, macro_bin, footprints, footprint_diffs=None):
    '''
    Same MDL cost as compute_bin_cost, but with the break points derived from footprints.
    '''
    macro_bin_size = len(macro_bin)
    if macro_bin_size != c - l:
        raise ValueError(f"macro_bin size {macro_bin_size} != c - l = {c - l}")
    macro_bin_size_code = quasi_uniform_code(macro_bin_size)
    break_points_size = break_points_number_footprints(macro_bin, footprints, footprint_diffs)
    # TODO: in the original IPD, the L_N term of L_disc is computed for (k - 1); see the commented alternatives
    L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
    # L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)
    L_disc_prev = -((quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2)) if k > 1 else 0)
    # L_disc_prev = -((quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2)) if k > 1 else 0)
    L_disc_M_ind = macro_bin_size_code - math.log(macro_bin_size / c, 2) * (macro_bin_size + 1)
    L_disc_M_ind_prev = -(math.log(l / c, 2) * (k - 1 + l) if l > 0 else 0)
    L_disc_M_mh = (quasi_uniform_code(break_points_size) + math.log(macro_bin_size - 1, 2) * break_points_size) \
        if break_points_size > 0 else 0
    L_errors = math.log(macro_bin_size, 2) * macro_bin_size
    return L_disc + L_disc_M_ind + L_disc_M_mh + L_errors + L_disc_prev + L_disc_M_ind_prev


def dynamic_merging(ID_threshold, IDs, init_bins_count, break_points_strategy=BreakPointsStrategy.DEFAULT):
    '''
    Dynamic-programming merge of init_bins_count initial micro-bins into macro-bins.
    F[c_, k_] is the minimal MDL cost of merging the first c_ + 1 micro-bins into
    k_ + 1 macro-bins; discretizations[c_][k_] is the corresponding discretization.
    '''
    F = np.zeros([init_bins_count, init_bins_count])
    discretizations = []
    # base case: merge the first c micro-bins into a single macro-bin (k = 1)
    k_ = 0
    k = k_ + 1
    for c_ in range(init_bins_count):
        c = c_ + 1
        micro_bins = list(range(c))
        F[c_, k_] = compute_bin_cost(c, 0, k, micro_bins, IDs, ID_threshold, break_points_strategy)
        discretizations.append([[micro_bins]])
    for k_ in range(1, init_bins_count):
        k = k_ + 1
        for c_ in range(k_, init_bins_count):
            c = c_ + 1
            min_F = None
            first_l_micro_bins = None
            last_micro_bins = None
            # search for the best number of micro-bins l in the first (k - 1) macro-bins
            for l_ in range(k_ - 1, c_):
                l = l_ + 1
                micro_bins = list(range(l, c))
                temp_F = F[l_, k_ - 1] + compute_bin_cost(c, l, k, micro_bins, IDs, ID_threshold, break_points_strategy)
                if min_F is None or temp_F < min_F:
                    min_F = temp_F
                    first_l_micro_bins = discretizations[l_][k_ - 1]
                    last_micro_bins = micro_bins
            F[c_, k_] = min_F
            disc = first_l_micro_bins.copy()
            disc.append(last_micro_bins)
            discretizations[c_].append(disc)
    return F, discretizations
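
# Note: the last row of F holds the cost of merging ALL initial bins into
# k_ + 1 macro-bins, so the cheapest overall discretization is recovered as
# discretizations[-1][np.argmin(F[-1])].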


def dynamic_merging_footprints(footprints, init_bins_count, footprint_diffs=None):
    '''
    Same dynamic-programming merge as dynamic_merging, with the bin costs
    computed from footprints via compute_bin_cost_footprints.
    '''
    F = np.zeros([init_bins_count, init_bins_count])
    discretizations = []
    # base case: merge the first c micro-bins into a single macro-bin (k = 1)
    k_ = 0
    k = k_ + 1
    for c_ in range(init_bins_count):
        c = c_ + 1
        micro_bins = list(range(c))
        F[c_, k_] = compute_bin_cost_footprints(c, 0, k, micro_bins, footprints, footprint_diffs)
        discretizations.append([[micro_bins]])
    for k_ in range(1, init_bins_count):
        k = k_ + 1
        for c_ in range(k_, init_bins_count):
            c = c_ + 1
            min_F = None
            first_l_micro_bins = None
            last_micro_bins = None
            # search for the best number of micro-bins l in the first (k - 1) macro-bins
            for l_ in range(k_ - 1, c_):
                l = l_ + 1
                micro_bins = list(range(l, c))
                temp_F = F[l_, k_ - 1] + compute_bin_cost_footprints(c, l, k, micro_bins, footprints, footprint_diffs)
                if min_F is None or temp_F < min_F:
                    min_F = temp_F
                    first_l_micro_bins = discretizations[l_][k_ - 1]
                    last_micro_bins = micro_bins
            F[c_, k_] = min_F
            disc = first_l_micro_bins.copy()
            disc.append(last_micro_bins)
            discretizations[c_].append(disc)
    return F, discretizations
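

if __name__ == "__main__":
    # Minimal smoke test (a sketch with made-up inputs; the ID values below are
    # synthetic and chosen only to exercise the dynamic program).
    IDs = np.array([0.1, 0.8, 0.3, 1.2, 0.05])
    F, discretizations = dynamic_merging(ID_threshold=0.4, IDs=IDs, init_bins_count=len(IDs))
    best_k_ = int(np.argmin(F[-1]))
    print("best number of macro-bins:", best_k_ + 1)
    print("best discretization:", discretizations[-1][best_k_])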