Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ipd_extended/footprint.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
192 lines (145 sloc)
8.94 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import interaction_distance as id | |
def _compute_footprint_seeds(bin_widths, footprints_number, background_seed=True):
    """Pick the indices of the bins that will serve as footprint seeds.

    The narrowest bin is always the first seed.  Further seeds are chosen
    greedily: every bin gets a score made of its index distance from the mean
    of the already chosen seed indices (normalised by the number of bins) plus
    a width term, already chosen bins are scored 0, and the best scoring bin
    becomes the next seed.

    Args:
        bin_widths: indexable sequence with the width of every bin.
        footprints_number: requested number of seeds, must be >= 2.
        background_seed: when true, the last seed is selected with a width
            term that favours wide bins (``width / max_width``) instead of
            narrow ones (``min_width / width``).

    Returns:
        list of bin indices (the values returned by ``np.argmin`` /
        ``np.argmax``).  It holds ``footprints_number`` entries when
        ``background_seed`` is true and ``footprints_number - 1`` otherwise.

    Raises:
        ValueError: if ``footprints_number`` is smaller than 2.
    """
    if footprints_number < 2:
        raise ValueError("footprints_number should be >= 2")

    total_bins = len(bin_widths)
    narrowest = np.argmin(bin_widths)
    narrowest_width = bin_widths[narrowest]
    widest_width = max(bin_widths)
    seeds = [narrowest]

    def pick_next(width_term):
        # Distance is measured from the mean index of the seeds chosen so far.
        centre = np.mean(seeds)
        scores = [abs(centre - position) / total_bins + width_term(width)
                  for position, width in enumerate(bin_widths)]
        # A bin that is already a seed must not win again.
        for taken in seeds:
            scores[taken] = 0
        return np.argmax(scores)

    # All seeds but the last one prefer narrow bins.
    while len(seeds) < footprints_number - 1:
        seeds.append(pick_next(lambda width: narrowest_width / width))

    # The optional last seed prefers wide bins.
    if background_seed:
        seeds.append(pick_next(lambda width: width / widest_width))

    return seeds
def _compute_ID_minfootprints(footprint_IDs):
    """Return, for every row, the column index of the smallest value.

    Args:
        footprint_IDs: 2-D array-like; in this module one row per bin and one
            column per footprint.

    Returns:
        1-D integer array with one entry per row of ``footprint_IDs``.
    """
    # Native axis reduction: no Python callback per row, and an input with
    # zero rows yields an empty result instead of raising.
    return np.argmin(footprint_IDs, axis=1)
def compute_ID_footprints(bin_map, curr, data, dim_maxes, footprints_number, bins_right_bounds, seed_heuristic=False, tradeoff=False):
    """Compute footprint IDs of all bins and the pairwise footprint differences.

    The column ``curr`` is removed from a copy of ``data`` before the footprint
    IDs are computed with ``compute_footprint_IDs1``; its minimum is passed on
    as the left bound of the first bin.

    Args:
        bin_map: bin assignment of the rows of ``data`` (forwarded unchanged).
        curr: name of the column the bins were built on.
        data: DataFrame holding ``curr`` and the remaining columns.
        dim_maxes: forwarded unchanged to ``compute_footprint_IDs1``.
        footprints_number: number of footprints to use.
        bins_right_bounds: right bounds of the bins (forwarded unchanged).
        seed_heuristic: forwarded unchanged.
        tradeoff: forwarded unchanged.

    Returns:
        tuple ``(min_footprints, footprint_IDs, footprint_diffs)`` where
        ``min_footprints`` is the per-row result of
        ``_compute_ID_minfootprints``, ``footprint_IDs`` is the 2-D array from
        ``compute_footprint_IDs1`` and ``footprint_diffs[a][b]`` is the sum
        over all rows of ``abs(footprint_IDs[row][a] - footprint_IDs[row][b])``
        (stored symmetrically).
    """
    curr_min = data[curr].min()
    remaining = data.copy()
    if curr in data.columns:
        remaining.pop(curr)

    footprint_IDs = compute_footprint_IDs1(bin_map, remaining, curr_min, dim_maxes, footprints_number,
                                           bins_right_bounds, seed_heuristic, tradeoff)

    footprint_count = footprint_IDs.shape[1]
    footprint_diffs = {column: dict() for column in range(footprint_count)}
    for left in range(footprint_count):
        for right in range(left + 1, footprint_count):
            # Column-wise L1 distance between two footprints, stored both ways.
            distance = sum([abs(row[left] - row[right]) for row in footprint_IDs])
            footprint_diffs[left][right] = distance
            footprint_diffs[right][left] = distance

    return _compute_ID_minfootprints(footprint_IDs), footprint_IDs, footprint_diffs
def compute_footprint_IDs1(bin_map, data_wo_curr, curr_min, dim_maxes, footprints_number, bins_right_bounds, seed_heuristic=False, tradeoff=False):
    """Compute the interaction distance of every bin to every footprint bin.

    For a bin ``b`` and a footprint ``f`` the reported value is
    ``inner(b) - 2 * cross(b, f) + inner(f)``, where ``inner`` / ``cross`` are
    the means of the element-wise product, over all columns of
    ``data_wo_curr``, of the matrices returned by ``id.compute_ID_elem``.
    (NOTE(review): ``compute_ID_elem`` is assumed to return a
    ``len(first) x len(second)`` matrix -- confirm in ``interaction_distance``.)

    Args:
        bin_map: categorical Series assigning each row of ``data_wo_curr`` to
            a bin; its categories define the bins and their order.
        data_wo_curr: DataFrame with the columns the distance is computed on.
        curr_min: left bound of the first bin; only used with
            ``seed_heuristic``.
        dim_maxes: mapping from column name to the value passed as third
            argument to ``id.compute_ID_elem``.
        footprints_number: number of footprints.
        bins_right_bounds: right bounds of the bins (any sequence); only used
            with ``seed_heuristic``.
        seed_heuristic: choose footprint bins with ``_compute_footprint_seeds``
            instead of spreading them evenly over the bin indices.
        tradeoff: thin every footprint by dropping each k-th point, k being
            the number of footprints.

    Returns:
        2-D array with one row per bin and one column per footprint.
    """
    footprint_bin_ids = None
    if seed_heuristic:
        # Materialise the bounds so that prepending curr_min concatenates even
        # when an array or Series is passed (``[x] + array`` would broadcast).
        right_bounds = list(bins_right_bounds)
        bin_widths = np.array([upper - lower for lower, upper in zip([curr_min] + right_bounds, right_bounds)])
        footprint_bin_ids = _compute_footprint_seeds(bin_widths, footprints_number, True)

    dist_bins = bin_map.cat.categories
    bins_count = len(dist_bins)

    if footprint_bin_ids is None:
        # todo make equi width instead equi frequency seeding
        if footprints_number > 1:
            footprint_bin_ids = [int(i / (footprints_number - 1) * (bins_count - 1))
                                 for i in range(footprints_number)]
        else:
            footprint_bin_ids = [0]

    footprints_data = [data_wo_curr.loc[bin_map == dist_bins[bin_id]] for bin_id in footprint_bin_ids]

    # With a single footprint ``i % 1`` is always 0, i.e. the mask would drop
    # every point and all results would become NaN, so thinning is skipped.
    stride = len(footprint_bin_ids)
    if tradeoff and stride > 1:
        footprints_data = [footprint_data.loc[[bool(i % stride) for i in range(len(footprint_data))]]
                           for footprint_data in footprints_data]

    footprints_points_count = [footprint_data.shape[0] for footprint_data in footprints_data]

    # inner measure of every footprint
    footprint_inner_bin_measures = []
    for footprint_data, fp_count in zip(footprints_data, footprints_points_count):
        product = np.ones([fp_count, fp_count])
        for dim in footprint_data:
            product = np.multiply(product,
                                  id.compute_ID_elem(footprint_data[dim], footprint_data[dim], dim_maxes[dim]))
        footprint_inner_bin_measures.append(np.sum(product) / fp_count ** 2)

    # inner measure of every bin and its cross measure with every footprint
    inner_bin_measures = []
    footprint_inter_bin_measures = [[] for _ in footprint_bin_ids]
    for binn in dist_bins:
        bin_data = data_wo_curr.loc[bin_map == binn]
        points_count = bin_data.shape[0]
        inner_product = np.ones([points_count, points_count])
        inter_products = [np.ones([points_count, fp_count]) for fp_count in footprints_points_count]
        # product elements for each dimension
        for dim in bin_data:
            inner_product = np.multiply(inner_product,
                                        id.compute_ID_elem(bin_data[dim], bin_data[dim], dim_maxes[dim]))
            for fid, footprint_data in enumerate(footprints_data):
                inter_products[fid] = np.multiply(
                    inter_products[fid],
                    id.compute_ID_elem(bin_data[dim], footprint_data[dim], dim_maxes[dim]))
        inner_bin_measures.append(np.sum(inner_product) / points_count ** 2)
        for fid, fp_count in enumerate(footprints_points_count):
            footprint_inter_bin_measures[fid].append(
                2 * np.sum(inter_products[fid]) / (points_count * fp_count))

    # combine: rows are bins, columns are footprints
    return np.array([
        [inner_measure - inter_measures[bin_id] + footprint_inner
         for inter_measures, footprint_inner in zip(footprint_inter_bin_measures, footprint_inner_bin_measures)]
        for bin_id, inner_measure in enumerate(inner_bin_measures)
    ])
def compute_self_adjusting_footprint_IDs1(bin_map, data_wo_curr, dim_maxes, footprint_number):
    """Compute bin-to-footprint interaction distances with adaptive footprints.

    The first footprint is the first bin.  Each following footprint is the bin
    with the largest distance to the current footprint (the current footprint
    bin itself is excluded).  Distances use the same formula as
    ``compute_footprint_IDs1``: ``inner(bin) - 2 * cross(bin, fp) + inner(fp)``
    built from the matrices returned by ``id.compute_ID_elem``.

    NOTE(review): only the *current* footprint bin is excluded when the next
    one is selected, so an earlier footprint bin can be selected again --
    confirm whether that is intended.

    Args:
        bin_map: categorical Series assigning each row of ``data_wo_curr`` to
            a bin; its categories define the bins and their order.
        data_wo_curr: DataFrame with the columns the distance is computed on.
        dim_maxes: mapping from column name to the value passed as third
            argument to ``id.compute_ID_elem``.
        footprint_number: number of footprints, a plain ``int`` > 0.

    Returns:
        2-D array with one row per bin and one column per footprint.

    Raises:
        ValueError: if ``footprint_number`` is not an ``int`` greater than 0.
    """
    # Check the type first so that non-numeric input gets the documented
    # ValueError instead of a TypeError from the comparison.
    if type(footprint_number) is not int or footprint_number < 1:
        raise ValueError("footprint_number should be integer > 0")

    dist_bins = bin_map.cat.categories
    # The per-bin selections do not depend on the footprint: select them once.
    bins_data = [data_wo_curr.loc[bin_map == binn] for binn in dist_bins]

    # compute inner measures for all bins
    inner_bin_measures = []
    for bin_data in bins_data:
        points_count = bin_data.shape[0]
        inner_prod_matrix = np.ones([points_count, points_count])
        # product elements for each dimension
        for dim in bin_data:
            inner_prod_matrix = np.multiply(
                inner_prod_matrix, id.compute_ID_elem(bin_data[dim], bin_data[dim], dim_maxes[dim]))
        inner_bin_measures.append(np.sum(inner_prod_matrix) / points_count ** 2)

    IDs = []
    # initial footprint
    footprint_bin_id = 0
    for fid in range(footprint_number):
        footprint_data = bins_data[footprint_bin_id]
        # Thin the footprint by dropping every footprint_number-th point.  With
        # a single footprint ``i % 1`` is always 0 and would drop everything.
        if footprint_number > 1:
            footprint_data = footprint_data.loc[
                [bool(i % footprint_number) for i in range(len(footprint_data))]]
        footprint_points_count = footprint_data.shape[0]

        # compute inner measure for the footprint bin
        footprint_inner_prod_matrix = np.ones([footprint_points_count, footprint_points_count])
        for dim in footprint_data:
            footprint_inner_prod_matrix = np.multiply(
                footprint_inner_prod_matrix,
                id.compute_ID_elem(footprint_data[dim], footprint_data[dim], dim_maxes[dim]))
        footprint_inner_bin_measure = np.sum(footprint_inner_prod_matrix) / footprint_points_count ** 2

        # distance of every bin to the footprint bin
        bin_footprint_IDs = []
        for bin_data, inner_measure in zip(bins_data, inner_bin_measures):
            points_count = bin_data.shape[0]
            footprint_inter_prod_matrix = np.ones([points_count, footprint_points_count])
            # product elements for each dimension
            for dim in bin_data:
                footprint_inter_prod_matrix = np.multiply(
                    footprint_inter_prod_matrix,
                    id.compute_ID_elem(bin_data[dim], footprint_data[dim], dim_maxes[dim]))
            inter_measure = 2 * np.sum(footprint_inter_prod_matrix) / (points_count * footprint_points_count)
            bin_footprint_IDs.append(inner_measure - inter_measure + footprint_inner_bin_measure)
        IDs.append(bin_footprint_IDs)

        # Select the next footprint only if one is still needed.  The argmax is
        # taken over the remaining candidates and mapped back to the real bin
        # index; using the position inside the shortened list directly would
        # be off by one for every bin located after the current footprint.
        if fid + 1 < footprint_number:
            candidates = [bin_id for bin_id in range(len(bin_footprint_IDs)) if bin_id != footprint_bin_id]
            best = int(np.argmax([bin_footprint_IDs[bin_id] for bin_id in candidates]))
            footprint_bin_id = candidates[best]

    return np.array(IDs).transpose()
def compute_subspace_interaction_measure(bin_map, curr, data, bins_right_bounds, dim_maxes):
    """Return the difference between the two heuristically seeded footprints.

    Runs ``compute_ID_footprints`` with two footprints and the seed heuristic
    enabled and reports the accumulated difference between footprint 0 and
    footprint 1.

    Args:
        bin_map: bin assignment of the rows of ``data`` (forwarded unchanged).
        curr: name of the column the bins were built on.
        data: DataFrame holding ``curr`` and the remaining columns.
        bins_right_bounds: right bounds of the bins (forwarded unchanged).
        dim_maxes: forwarded unchanged to ``compute_ID_footprints``.

    Returns:
        tuple ``(difference, None)``; the second element is always ``None``.
    """
    footprints_number = 2
    _, _, footprint_diffs = compute_ID_footprints(bin_map, curr, data, dim_maxes, footprints_number,
                                                  bins_right_bounds, seed_heuristic=True)
    # Disabled alternative from the original code (``fn`` is the number of
    # footprints, ``footprints`` the second value returned above):
    # return sum([np.argsort(fp[:fn]).tolist() == np.argsort(footprints[i-1][:fn]).tolist() for i, fp in enumerate(footprints[1:])]), None
    return footprint_diffs[0][1], None