# ipd_extended/merging.py
import math
from enum import Enum

import numpy as np
from scipy.special import comb

QUASI_UNIFORM_CODE_INITIAL_NUMBER = 2.865064  # normalizing constant c0 of Rissanen's universal code for the integers


def quasi_uniform_code(n):
    '''
    Code length (in bits) of the quasi-uniform / universal integer code for n > 0:
    the sum of the iterated base-2 logarithms of n, plus log2(c0).
    '''
    l = 0
    while n > 1:
        n = math.log(n, 2)
        l += n
    return l + math.log(QUASI_UNIFORM_CODE_INITIAL_NUMBER, 2)
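
# Worked example (illustrative): quasi_uniform_code(8) accumulates
# log2(8) = 3, log2(3) ~ 1.585, log2(1.585) ~ 0.664, then stops once n <= 1,
# giving 3 + 1.585 + 0.664 + log2(2.865064) ~ 6.77 bits.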


class BreakPointsStrategy(Enum):
    DEFAULT = 1
    MULTIPLE_ONE_PLACE = 2
    EXPONENTIAL_ONE_PLACE = 3

def break_points_number(macro_bin, IDs, ID_threshold, break_points_strategy):
    '''
    Counts the break points of a macro-bin, i.e. the positions whose ID is GREATER than ID_threshold.
    :param macro_bin: list of micro-bin indices forming the macro-bin (the last index carries no break point)
    :param IDs: array of ID values, one per micro-bin boundary
    :param ID_threshold: threshold above which an ID counts as a break point
    :param break_points_strategy: how to weight IDs relative to the threshold
    :return: the (weighted) number of break points
    '''
    if break_points_strategy is BreakPointsStrategy.DEFAULT:
        # one break point per ID strictly above the threshold
        return sum(1 for ID in IDs[macro_bin[:-1]] if ID > ID_threshold)
    elif break_points_strategy is BreakPointsStrategy.MULTIPLE_ONE_PLACE:
        # assigns a number of break points proportional to the height of the ID
        return sum(int(ID / ID_threshold) for ID in IDs[macro_bin[:-1]]) if ID_threshold > 0 else 0
    elif break_points_strategy is BreakPointsStrategy.EXPONENTIAL_ONE_PLACE:
        # assigns a number of break points growing exponentially with the height of the ID
        return sum(int(pow(2, ID / ID_threshold - 1)) for ID in IDs[macro_bin[:-1]]) if ID_threshold > 0 else 0
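
# Worked example (illustrative values): with IDs = np.array([0.2, 0.9, 0.5, 1.4]),
# macro_bin = [0, 1, 2, 3] and ID_threshold = 0.4, only IDs[[0, 1, 2]] are inspected:
#   DEFAULT               -> 2   (0.9 and 0.5 exceed 0.4)
#   MULTIPLE_ONE_PLACE    -> 3   (int(0.2/0.4) + int(0.9/0.4) + int(0.5/0.4) = 0 + 2 + 1)
#   EXPONENTIAL_ONE_PLACE -> 3   (int(2**-0.5) + int(2**1.25) + int(2**0.25) = 0 + 2 + 1)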


def break_points_number_footprints(macro_bin, min_footprint, footprint_diffs=None):
    '''
    Counts the break points of a macro-bin from the footprints of its micro-bins:
    micro-bins whose footprint differs from the majority footprint count as break
    points, optionally down-weighted by their distance to the majority footprint.
    '''
    total = len(macro_bin[:-1])
    if total == 0:
        return 0
    uniq, counts = np.unique(min_footprint[macro_bin[:-1]], return_counts=True)
    if footprint_diffs is None:
        # every micro-bin outside the majority footprint is a full break point
        return total - max(counts)
    majority_f_id = np.argmax(counts)
    majority_footprint = uniq[majority_f_id]
    max_diff = max(footprint_diffs[majority_footprint].values())
    # down-weight minority footprints by their similarity to the majority footprint
    return total - counts[majority_f_id] \
        - sum(counts[f_id] * (1 - footprint_diffs[majority_footprint][footprint] / max_diff)
              for f_id, footprint in enumerate(uniq) if footprint != majority_footprint)
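
# Worked example (illustrative): with min_footprint = np.array([7, 3, 7, 9]) and
# macro_bin = [0, 1, 2, 3], only min_footprint[[0, 1, 2]] = [7, 3, 7] is inspected;
# footprint 7 is the majority (count 2), so with footprint_diffs=None the
# function returns 3 - 2 = 1.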


def compute_bin_cost(c, l, k, macro_bin, IDs, ID_threshold, break_points_strategy):
    '''
    MDL cost of making micro-bins l..c-1 the k-th macro-bin, given that the
    first l micro-bins are already merged into k - 1 macro-bins.
    '''
    macro_bin_size = len(macro_bin)
    if macro_bin_size != c - l:
        raise ValueError(f"macro_bin size {macro_bin_size} != c - l = {c - l}")
    macro_bin_size_code = quasi_uniform_code(macro_bin_size)
    break_points_size = break_points_number(macro_bin, IDs, ID_threshold, break_points_strategy)
    # TODO: in the original IPD, the L_N term of L_disc is computed for (k - 1); see the commented alternatives
    L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
    # L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)
    L_disc_prev = -((quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2)) if k > 1 else 0)
    # L_disc_prev = -((quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2)) if k > 1 else 0)
    L_disc_M_ind = macro_bin_size_code - math.log(macro_bin_size / c, 2) * (macro_bin_size + 1)
    L_disc_M_ind_prev = -(math.log(l / c, 2) * (k - 1 + l) if l > 0 else 0)
    L_disc_M_mh = (quasi_uniform_code(break_points_size) + math.log(macro_bin_size - 1, 2) * break_points_size) \
        if break_points_size > 0 else 0
    L_errors = math.log(macro_bin_size, 2) * macro_bin_size
    return L_disc + L_disc_M_ind + L_disc_M_mh + L_errors + L_disc_prev + L_disc_M_ind_prev


def compute_bin_cost_footprints(c, l, k, macro_bin, footprints, footprint_diffs=None):
    '''
    Same MDL cost as compute_bin_cost, but with the break points derived from footprints.
    '''
    macro_bin_size = len(macro_bin)
    if macro_bin_size != c - l:
        raise ValueError(f"macro_bin size {macro_bin_size} != c - l = {c - l}")
    macro_bin_size_code = quasi_uniform_code(macro_bin_size)
    break_points_size = break_points_number_footprints(macro_bin, footprints, footprint_diffs)
    # TODO: in the original IPD, the L_N term of L_disc is computed for (k - 1); see the commented alternatives
    L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
    # L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)
    L_disc_prev = -((quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2)) if k > 1 else 0)
    # L_disc_prev = -((quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2)) if k > 1 else 0)
    L_disc_M_ind = macro_bin_size_code - math.log(macro_bin_size / c, 2) * (macro_bin_size + 1)
    L_disc_M_ind_prev = -(math.log(l / c, 2) * (k - 1 + l) if l > 0 else 0)
    L_disc_M_mh = (quasi_uniform_code(break_points_size) + math.log(macro_bin_size - 1, 2) * break_points_size) \
        if break_points_size > 0 else 0
    L_errors = math.log(macro_bin_size, 2) * macro_bin_size
    return L_disc + L_disc_M_ind + L_disc_M_mh + L_errors + L_disc_prev + L_disc_M_ind_prev


def dynamic_merging(ID_threshold, IDs, init_bins_count, break_points_strategy=BreakPointsStrategy.DEFAULT):
    '''
    Dynamic-programming merge of init_bins_count initial micro-bins into macro-bins.
    F[c_, k_] is the minimal MDL cost of merging the first c_ + 1 micro-bins into
    k_ + 1 macro-bins; discretizations[c_][k_] is the corresponding discretization.
    '''
    F = np.zeros([init_bins_count, init_bins_count])
    discretizations = []
    # base case: merge the first c micro-bins into a single macro-bin (k = 1)
    k_ = 0
    k = k_ + 1
    for c_ in range(init_bins_count):
        c = c_ + 1
        micro_bins = list(range(c))
        F[c_, k_] = compute_bin_cost(c, 0, k, micro_bins, IDs, ID_threshold, break_points_strategy)
        discretizations.append([[micro_bins]])
    for k_ in range(1, init_bins_count):
        k = k_ + 1
        for c_ in range(k_, init_bins_count):
            c = c_ + 1
            min_F = None
            first_l_micro_bins = None
            last_micro_bins = None
            # search for the best number of micro-bins l in the first (k - 1) macro-bins
            for l_ in range(k_ - 1, c_):
                l = l_ + 1
                micro_bins = list(range(l, c))
                temp_F = F[l_, k_ - 1] + compute_bin_cost(c, l, k, micro_bins, IDs, ID_threshold, break_points_strategy)
                if min_F is None or temp_F < min_F:
                    min_F = temp_F
                    first_l_micro_bins = discretizations[l_][k_ - 1]
                    last_micro_bins = micro_bins
            F[c_, k_] = min_F
            disc = first_l_micro_bins.copy()
            disc.append(last_micro_bins)
            discretizations[c_].append(disc)
    return F, discretizations
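
# Note: the last row of F holds the cost of merging ALL initial bins into
# k_ + 1 macro-bins, so the cheapest overall discretization is recovered as
# discretizations[-1][np.argmin(F[-1])].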


def dynamic_merging_footprints(footprints, init_bins_count, footprint_diffs=None):
    '''
    Same dynamic-programming merge as dynamic_merging, with the bin costs
    computed from footprints via compute_bin_cost_footprints.
    '''
    F = np.zeros([init_bins_count, init_bins_count])
    discretizations = []
    # base case: merge the first c micro-bins into a single macro-bin (k = 1)
    k_ = 0
    k = k_ + 1
    for c_ in range(init_bins_count):
        c = c_ + 1
        micro_bins = list(range(c))
        F[c_, k_] = compute_bin_cost_footprints(c, 0, k, micro_bins, footprints, footprint_diffs)
        discretizations.append([[micro_bins]])
    for k_ in range(1, init_bins_count):
        k = k_ + 1
        for c_ in range(k_, init_bins_count):
            c = c_ + 1
            min_F = None
            first_l_micro_bins = None
            last_micro_bins = None
            # search for the best number of micro-bins l in the first (k - 1) macro-bins
            for l_ in range(k_ - 1, c_):
                l = l_ + 1
                micro_bins = list(range(l, c))
                temp_F = F[l_, k_ - 1] + compute_bin_cost_footprints(c, l, k, micro_bins, footprints, footprint_diffs)
                if min_F is None or temp_F < min_F:
                    min_F = temp_F
                    first_l_micro_bins = discretizations[l_][k_ - 1]
                    last_micro_bins = micro_bins
            F[c_, k_] = min_F
            disc = first_l_micro_bins.copy()
            disc.append(last_micro_bins)
            discretizations[c_].append(disc)
    return F, discretizations
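

if __name__ == "__main__":
    # Minimal smoke test (a sketch with made-up inputs; the ID values below are
    # synthetic and chosen only to exercise the dynamic program).
    IDs = np.array([0.1, 0.8, 0.3, 1.2, 0.05])
    F, discretizations = dynamic_merging(ID_threshold=0.4, IDs=IDs, init_bins_count=len(IDs))
    best_k_ = int(np.argmin(F[-1]))
    print("best number of macro-bins:", best_k_ + 1)
    print("best discretization:", discretizations[-1][best_k_])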