import math
from enum import Enum

import numpy as np
from scipy.special import comb

# Constant term of the universal code for positive integers
# (Rissanen's c0 ~= 2.865064), used by quasi_uniform_code below.
QUASI_UNIFORM_CODE_INITIAL_NUMBER = 2.865064


def quasi_uniform_code(n):
    """Code length in bits of the quasi-uniform (Elias-style) universal code
    for an integer n >= 1: log2(n) + log2(log2(n)) + ... + log2(c0)."""
    l = 0
    while n > 1:
        n = math.log(n, 2)
        l += n
    return l + math.log(QUASI_UNIFORM_CODE_INITIAL_NUMBER, 2)
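
# Quick sanity check (values rounded; illustrative only):
#   quasi_uniform_code(1) ~= 1.52, quasi_uniform_code(2) ~= 2.52,
#   quasi_uniform_code(8) ~= 6.77 -- the length grows roughly like log*(n).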


class BreakPointsStrategy(Enum):
    """How interaction distances (IDs) are converted into break-point counts."""
    DEFAULT = 1
    MULTIPLE_ONE_PLACE = 2
    EXPONENTIAL_ONE_PLACE = 3


def break_points_number(macro_bin, IDs, ID_threshold, break_points_strategy):
    """
    Returns the (possibly weighted) count of break points inside the macro bin,
    i.e. positions whose ID is GREATER than ID_threshold.
    :param macro_bin: list of micro-bin indices forming the macro bin
    :param IDs: array of interaction distances between consecutive micro bins
    :param ID_threshold: threshold above which an ID counts as a break point
    :param break_points_strategy: member of BreakPointsStrategy
    :return: break-point count
    """
    if break_points_strategy is BreakPointsStrategy.DEFAULT:
        # one break point per ID above the threshold
        return sum(1 if ID > ID_threshold else 0 for ID in IDs[macro_bin[:-1]])
    elif break_points_strategy is BreakPointsStrategy.MULTIPLE_ONE_PLACE:
        # assigns a number of break points proportional to the height of the ID
        return sum(int(ID / ID_threshold) for ID in IDs[macro_bin[:-1]]) if ID_threshold > 0 else 0
    elif break_points_strategy is BreakPointsStrategy.EXPONENTIAL_ONE_PLACE:
        # break points grow exponentially with the height of the ID
        return sum(int(pow(2, ID / ID_threshold - 1)) for ID in IDs[macro_bin[:-1]]) if ID_threshold > 0 else 0
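
# Illustrative call (hypothetical values, a sketch only): with
#   IDs = np.array([0.2, 0.9, 1.8, 0.4]), macro_bin = [0, 1, 2, 3], ID_threshold = 0.5
# only IDs[[0, 1, 2]] are inspected, giving
#   DEFAULT               -> 2   (0.9 and 1.8 exceed 0.5)
#   MULTIPLE_ONE_PLACE    -> 4   (int(0.4) + int(1.8) + int(3.6) = 0 + 1 + 3)
#   EXPONENTIAL_ONE_PLACE -> 7   (int(2**-0.6) + int(2**0.8) + int(2**2.6) = 0 + 1 + 6)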


def break_points_number_footprints(macro_bin, min_footprint, footprint_diffs=None):
    """Counts break points as disagreement with the majority footprint in the
    macro bin; with footprint_diffs, minority footprints are discounted by
    their normalized distance to the majority footprint."""
    total = len(macro_bin[:-1])
    if total == 0:
        return 0
    uniq, counts = np.unique(min_footprint[macro_bin[:-1]], return_counts=True)
    if footprint_diffs is None:
        # every point disagreeing with the majority footprint is a break point
        return total - max(counts)
    majority_f_id = np.argmax(counts)
    majority_footprint = uniq[majority_f_id]
    max_diff = max(footprint_diffs[majority_footprint].values())
    # minority footprints close to the majority contribute only fractional break points
    return total - counts[majority_f_id] - sum(
        counts[f_id] * (1 - footprint_diffs[majority_footprint][footprint] / max_diff)
        for f_id, footprint in enumerate(uniq) if footprint != majority_footprint)
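
# Illustrative call (hypothetical footprints, a sketch only): with
#   min_footprint = np.array(["a", "a", "b", "a"]), macro_bin = [0, 1, 2, 3]
# the majority footprint among indices [0, 1, 2] is "a" (count 2), so without
# diffs the result is 3 - 2 = 1. With
#   footprint_diffs = {"a": {"a": 0.0, "b": 0.5, "c": 1.0}}
# "b" is discounted by 1 - 0.5 / 1.0, giving 3 - 2 - 0.5 = 0.5.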


def compute_bin_cost(c, l, k, macro_bin, IDs, ID_threshold, break_points_strategy):
    """MDL cost of making micro bins [l, c) the k-th macro bin."""
    macro_bin_size = len(macro_bin)
    if macro_bin_size != c - l:
        raise ValueError(f"macro bin size {macro_bin_size} != {c} - {l}")
    macro_bin_size_code = quasi_uniform_code(macro_bin_size)
    break_points_size = break_points_number(macro_bin, IDs, ID_threshold, break_points_strategy)
    # todo: in the original IPD, the L_N term of L_disc is computed for (k - 1):
    # L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)
    L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
    # todo: in the original IPD, the L_N term of L_disc_prev is computed for (k - 2):
    # L_disc_prev = -(quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
    L_disc_prev = -(quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
    L_disc_M_ind = macro_bin_size_code - math.log(macro_bin_size / c, 2) * (macro_bin_size + 1)
    L_disc_M_ind_prev = -(math.log(l / c, 2) * (k - 1 + l) if l > 0 else 0)
    L_disc_M_mh = quasi_uniform_code(break_points_size) + math.log(macro_bin_size - 1, 2) * break_points_size \
        if break_points_size > 0 else 0
    L_errors = math.log(macro_bin_size, 2) * macro_bin_size
    return L_disc + L_disc_M_ind + L_disc_M_mh + L_errors + L_disc_prev + L_disc_M_ind_prev
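
# Sanity-check call (hypothetical inputs, a sketch only): the cost of keeping
# c = 3 micro bins as a single (k = 1) macro bin with no preceding bins (l = 0):
#   compute_bin_cost(3, 0, 1, [0, 1, 2], np.array([0.2, 0.9, 1.8]),
#                    0.5, BreakPointsStrategy.DEFAULT)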


def compute_bin_cost_footprints(c, l, k, macro_bin, footprints, footprint_diffs=None):
    """MDL cost of making micro bins [l, c) the k-th macro bin, with break
    points derived from footprints instead of interaction distances."""
    macro_bin_size = len(macro_bin)
    if macro_bin_size != c - l:
        raise ValueError(f"macro bin size {macro_bin_size} != {c} - {l}")
    macro_bin_size_code = quasi_uniform_code(macro_bin_size)
    break_points_size = break_points_number_footprints(macro_bin, footprints, footprint_diffs)
    # todo: in the original IPD, the L_N term of L_disc is computed for (k - 1):
    # L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)
    L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
    # todo: in the original IPD, the L_N term of L_disc_prev is computed for (k - 2):
    # L_disc_prev = -(quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
    L_disc_prev = -(quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
    L_disc_M_ind = macro_bin_size_code - math.log(macro_bin_size / c, 2) * (macro_bin_size + 1)
    L_disc_M_ind_prev = -(math.log(l / c, 2) * (k - 1 + l) if l > 0 else 0)
    L_disc_M_mh = quasi_uniform_code(break_points_size) + math.log(macro_bin_size - 1, 2) * break_points_size \
        if break_points_size > 0 else 0
    L_errors = math.log(macro_bin_size, 2) * macro_bin_size
    return L_disc + L_disc_M_ind + L_disc_M_mh + L_errors + L_disc_prev + L_disc_M_ind_prev


def dynamic_merging(ID_threshold, IDs, init_bins_count, break_points_strategy=BreakPointsStrategy.DEFAULT):
    """Dynamic-programming merge of init_bins_count micro bins into macro bins.
    Returns the cost matrix F, where F[c_, k_] is the minimal cost of splitting
    the first c_ + 1 micro bins into k_ + 1 macro bins, together with the
    corresponding discretizations."""
    F = np.zeros([init_bins_count, init_bins_count])
    discretizations = []
    # base case: merge the first c initial bins into a single macro bin (k = 1)
    k_ = 0
    k = k_ + 1
    for c_ in range(init_bins_count):
        c = c_ + 1
        micro_bins = list(range(c))
        F[c_, k_] = compute_bin_cost(c, 0, k, micro_bins, IDs, ID_threshold, break_points_strategy)
        discretizations.append([[micro_bins]])
    for k_ in range(1, init_bins_count):
        k = k_ + 1
        for c_ in range(k_, init_bins_count):
            c = c_ + 1
            min_F = None
            first_l_micro_bins = None
            last_micro_bins = None
            # search for the best number l of micro bins in the first (k - 1) macro bins
            for l_ in range(k_ - 1, c_):
                l = l_ + 1
                micro_bins = list(range(l, c))
                temp_F = F[l_, k_ - 1] + compute_bin_cost(c, l, k, micro_bins, IDs, ID_threshold, break_points_strategy)
                if min_F is None or temp_F < min_F:
                    min_F = temp_F
                    first_l_micro_bins = discretizations[l_][k_ - 1]
                    last_micro_bins = micro_bins
            F[c_, k_] = min_F
            disc = first_l_micro_bins.copy()
            disc.append(last_micro_bins)
            discretizations[c_].append(disc)
    return F, discretizations
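
# Reading the result (illustrative): the last row of F covers all
# init_bins_count micro bins, so the best discretization is recovered with
#   k_best_ = int(np.argmin(F[-1]))
#   best_disc = discretizations[-1][k_best_]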


def dynamic_merging_footprints(footprints, init_bins_count, footprint_diffs=None):
    """Same dynamic-programming merge as dynamic_merging, but the bin costs
    are driven by footprints (see compute_bin_cost_footprints)."""
    F = np.zeros([init_bins_count, init_bins_count])
    discretizations = []
    # base case: merge the first c initial bins into a single macro bin (k = 1)
    k_ = 0
    k = k_ + 1
    for c_ in range(init_bins_count):
        c = c_ + 1
        micro_bins = list(range(c))
        F[c_, k_] = compute_bin_cost_footprints(c, 0, k, micro_bins, footprints, footprint_diffs)
        discretizations.append([[micro_bins]])
    for k_ in range(1, init_bins_count):
        k = k_ + 1
        for c_ in range(k_, init_bins_count):
            c = c_ + 1
            min_F = None
            first_l_micro_bins = None
            last_micro_bins = None
            # search for the best number l of micro bins in the first (k - 1) macro bins
            for l_ in range(k_ - 1, c_):
                l = l_ + 1
                micro_bins = list(range(l, c))
                temp_F = F[l_, k_ - 1] + compute_bin_cost_footprints(c, l, k, micro_bins, footprints, footprint_diffs)
                if min_F is None or temp_F < min_F:
                    min_F = temp_F
                    first_l_micro_bins = discretizations[l_][k_ - 1]
                    last_micro_bins = micro_bins
            F[c_, k_] = min_F
            disc = first_l_micro_bins.copy()
            disc.append(last_micro_bins)
            discretizations[c_].append(disc)
    return F, discretizations
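

if __name__ == "__main__":
    # Minimal smoke test on synthetic data (demo_IDs, demo_footprints and the
    # bin count are hypothetical, illustrative values, not from any dataset).
    demo_IDs = np.array([0.1, 0.1, 2.0, 0.1, 0.1, 0.1])
    F_ids, discs_ids = dynamic_merging(0.5, demo_IDs, 6)
    k_best_ids = int(np.argmin(F_ids[-1]))
    print("IDs: best #macro bins =", k_best_ids + 1)
    print("IDs: best discretization =", discs_ids[-1][k_best_ids])

    demo_footprints = np.array(["a", "a", "a", "b", "b", "b"])
    F_fp, discs_fp = dynamic_merging_footprints(demo_footprints, 6)
    k_best_fp = int(np.argmin(F_fp[-1]))
    print("footprints: best #macro bins =", k_best_fp + 1)
    print("footprints: best discretization =", discs_fp[-1][k_best_fp])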