Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import numpy as np
import interaction_distance as id
def _compute_footprint_seeds(bin_widths, footprints_number, background_seed=True):
if footprints_number < 2:
raise ValueError("footprints_number should be >= 2")
min_width_id = np.argmin(bin_widths)
bin_ids = [min_width_id]
min_width = bin_widths[min_width_id]
max_width = max(bin_widths)
# print('min_width', min_width)
# print('max_width', max_width)
while len(bin_ids) < footprints_number - 1:
mean = np.mean(bin_ids)
bin_scores = [abs(mean - bid) / len(bin_widths) + min_width / bw for bid, bw in enumerate(bin_widths)]
for mbid in bin_ids:
bin_scores[mbid] = 0
bin_ids.append(np.argmax(bin_scores))
if background_seed:
mean = np.mean(bin_ids)
bin_scores = [abs(mean - bid) / len(bin_widths) + bw / max_width for bid, bw in
enumerate(bin_widths)]
for mbid in bin_ids:
bin_scores[mbid] = 0
bin_ids.append(np.argmax(bin_scores))
return bin_ids
def _compute_ID_minfootprints(footprint_IDs):
return np.apply_along_axis(lambda a: np.argmin(a), 1, footprint_IDs)
def compute_ID_footprints(bin_map, curr, data, dim_maxes, footprints_number, bins_right_bounds, seed_heuristic=False, tradeoff=False):
    """Compute the footprint ID matrix for ``curr`` plus two summaries of it.

    The column ``curr`` is removed from a copy of ``data`` and the remaining
    columns are handed to ``compute_footprint_IDs1`` together with the minimum
    of ``data[curr]``.

    :return: a 3-tuple ``(min_footprints, footprint_IDs, footprint_diffs)``:
        ``footprint_IDs`` is the matrix from ``compute_footprint_IDs1``,
        ``min_footprints`` is the per-row argmin of that matrix, and
        ``footprint_diffs[a][b]`` is the sum over all rows of the absolute
        difference between columns ``a`` and ``b`` (stored symmetrically).
    """
    lowest_curr_value = data[curr].min()

    remaining = data.copy()
    if curr in data.columns:
        remaining.pop(curr)

    footprint_IDs = compute_footprint_IDs1(bin_map, remaining, lowest_curr_value, dim_maxes, footprints_number,
                                           bins_right_bounds, seed_heuristic, tradeoff)

    footprint_total = footprint_IDs.shape[1]
    footprint_diffs = {}
    for first in range(footprint_total):
        footprint_diffs[first] = {}

    # Every unordered pair of footprint columns is visited once and the
    # distance is recorded in both directions.
    for first in range(footprint_total):
        for second in range(first + 1, footprint_total):
            distance = sum(abs(row[first] - row[second]) for row in footprint_IDs)
            footprint_diffs[first][second] = distance
            footprint_diffs[second][first] = distance

    return _compute_ID_minfootprints(footprint_IDs), footprint_IDs, footprint_diffs
def compute_footprint_IDs1(bin_map, data_wo_curr, curr_min, dim_maxes, footprints_number, bins_right_bounds, seed_heuristic=False, tradeoff=False):
    """Compute an ID value between every bin and every footprint bin.

    Footprint bins are chosen either by ``_compute_footprint_seeds`` (when
    ``seed_heuristic`` is true; bin widths are derived from ``curr_min`` and
    ``bins_right_bounds``) or spread evenly over the bin indices. For a bin
    ``b`` and a footprint ``f`` the value is
    ``inner(b) - inter(b, f) + inner(f)`` where ``inner`` is the mean of the
    per-dimension product of ``id.compute_ID_elem`` over all point pairs and
    ``inter`` is twice the mean of the cross product.

    NOTE(review): ``id.compute_ID_elem(a, b, dim_max)`` is assumed to return a
    ``len(a) x len(b)`` matrix -- confirm against ``interaction_distance``.

    :param bin_map: categorical Series assigning every row of
        ``data_wo_curr`` to a bin (``bin_map.cat.categories`` is used).
    :param tradeoff: if true, every ``k``-th point of each footprint is
        dropped, ``k`` being the number of footprints.
    :return: array with one row per bin and one column per footprint.
    """
    seed_bin_ids = None
    if seed_heuristic:
        left_bounds = [curr_min] + bins_right_bounds
        widths = np.array([right - left for left, right in zip(left_bounds, bins_right_bounds)])
        seed_bin_ids = _compute_footprint_seeds(widths, footprints_number, True)

    categories = bin_map.cat.categories
    if seed_bin_ids is None:
        # todo make equi width instead equi frequency seeding
        if footprints_number > 1:
            last_bin = len(categories) - 1
            seed_bin_ids = [int(k / (footprints_number - 1) * last_bin) for k in range(footprints_number)]
        else:
            seed_bin_ids = [0]

    footprints = [data_wo_curr.loc[bin_map == categories[seed]] for seed in seed_bin_ids]
    if tradeoff:
        # Thin each footprint: positions divisible by the footprint count are dropped.
        stride = len(seed_bin_ids)
        footprints = [fp.loc[[pos % stride != 0 for pos in range(len(fp))]] for fp in footprints]
    footprint_sizes = [fp.shape[0] for fp in footprints]

    # Inner measure of every footprint with itself.
    footprint_self_measures = []
    for fp, size in zip(footprints, footprint_sizes):
        product = np.ones([size, size])
        for dim in fp:
            product = np.multiply(product, id.compute_ID_elem(fp[dim], fp[dim], dim_maxes[dim]))
        footprint_self_measures.append(np.sum(product) / size ** 2)

    rows = []
    for category in categories:
        members = data_wo_curr.loc[bin_map == category]
        member_count = members.shape[0]
        self_product = np.ones([member_count, member_count])
        cross_products = [np.ones([member_count, size]) for size in footprint_sizes]

        # Accumulate the per-dimension products for the bin itself and for
        # the bin against each footprint.
        for dim in members:
            column = members[dim]
            self_product = np.multiply(self_product, id.compute_ID_elem(column, column, dim_maxes[dim]))
            for fid, fp in enumerate(footprints):
                cross_elem = id.compute_ID_elem(column, fp[dim], dim_maxes[dim])
                cross_products[fid] = np.multiply(cross_products[fid], cross_elem)

        self_measure = np.sum(self_product) / member_count ** 2
        row = []
        for fid, cross in enumerate(cross_products):
            cross_measure = 2 * np.sum(cross) / (member_count * footprint_sizes[fid])
            row.append(self_measure - cross_measure + footprint_self_measures[fid])
        rows.append(row)

    return np.array(rows)
def compute_self_adjusting_footprint_IDs1(bin_map, data_wo_curr, dim_maxes, footprint_number):
    """Compute bin-to-footprint ID values with iteratively chosen footprints.

    The first footprint is bin 0. After the ID values of all bins against the
    current footprint are known, the next footprint is the bin (other than
    the current footprint) with the largest ID value. For a bin ``b`` and a
    footprint ``f`` the value is ``inner(b) - inter(b, f) + inner(f)``, built
    from per-dimension products of ``id.compute_ID_elem``.

    NOTE(review): ``id.compute_ID_elem(a, b, dim_max)`` is assumed to return a
    ``len(a) x len(b)`` matrix -- confirm against ``interaction_distance``.

    :param bin_map: categorical Series assigning every row of
        ``data_wo_curr`` to a bin (``bin_map.cat.categories`` is used).
    :param data_wo_curr: DataFrame whose columns are the dimensions to use.
    :param dim_maxes: mapping from column name to the value passed as third
        argument to ``id.compute_ID_elem``.
    :param footprint_number: number of footprints (integer > 0). When it is
        greater than 1, every ``footprint_number``-th point of a footprint is
        dropped before use.
    :return: array with one row per bin and one column per footprint.
    :raises ValueError: if ``footprint_number`` is not a positive integer, or
        if more than one footprint is requested but only one bin exists.
    """
    # Check the type first so a non-numeric argument produces the intended
    # ValueError rather than a TypeError from the `<` comparison.
    if not isinstance(footprint_number, (int, np.integer)) or footprint_number < 1:
        raise ValueError("footprint_number should be integer > 0")

    dist_bins = bin_map.cat.categories
    # The per-bin slices do not depend on the footprint, so build them once.
    bins_data = [data_wo_curr.loc[bin_map == binn] for binn in dist_bins]

    # compute inner measures for all bins
    inner_bin_measures = []
    for bin_data in bins_data:
        points_count = bin_data.shape[0]
        inner_prod_matrix = np.ones([points_count, points_count])
        # product elements for each dimension
        for dim in bin_data:
            inner_elem = id.compute_ID_elem(bin_data[dim], bin_data[dim], dim_maxes[dim])
            inner_prod_matrix = np.multiply(inner_prod_matrix, inner_elem)
        inner_bin_measures.append(np.sum(inner_prod_matrix) / points_count ** 2)

    IDs = []
    # initial footprint
    footprint_bin_id = 0
    for fid in range(footprint_number):
        footprint_data = bins_data[footprint_bin_id]
        # Subsample the footprint. With a single footprint the mask
        # (i % 1 != 0) would discard every point and turn all results into
        # NaN, so the footprint is used unchanged in that case.
        if footprint_number > 1:
            keep = [i % footprint_number != 0 for i in range(len(footprint_data))]
            footprint_data = footprint_data.loc[keep]
        footprint_points_count = footprint_data.shape[0]

        # compute inner measure for the footprint bin
        footprint_inner_prod_matrix = np.ones([footprint_points_count, footprint_points_count])
        for dim in footprint_data:
            footprint_inner_elem = id.compute_ID_elem(footprint_data[dim], footprint_data[dim], dim_maxes[dim])
            footprint_inner_prod_matrix = np.multiply(footprint_inner_prod_matrix, footprint_inner_elem)
        footprint_inner_bin_measure = np.sum(footprint_inner_prod_matrix) / footprint_points_count ** 2

        # compute inter measures between all bins and the footprint bin
        bin_footprint_IDs = []
        for bin_data, inner_measure in zip(bins_data, inner_bin_measures):
            points_count = bin_data.shape[0]
            footprint_inter_prod_matrix = np.ones([points_count, footprint_points_count])
            # product elements for each dimension
            for dim in bin_data:
                footprint_inter_elem = id.compute_ID_elem(bin_data[dim], footprint_data[dim], dim_maxes[dim])
                footprint_inter_prod_matrix = np.multiply(footprint_inter_prod_matrix, footprint_inter_elem)
            inter_measure = 2 * np.sum(footprint_inter_prod_matrix) / (points_count * footprint_points_count)
            bin_footprint_IDs.append(inner_measure - inter_measure + footprint_inner_bin_measure)
        IDs.append(bin_footprint_IDs)

        # The last footprint needs no successor.
        if fid == footprint_number - 1:
            break

        # Choose the next footprint among all bins except the current one.
        # The argmax is taken over the reduced list, so it has to be mapped
        # back to the original bin index (this mapping was missing before,
        # which shifted the selection by one for bins after the current one).
        candidate_ids = [b for b in range(len(bin_footprint_IDs)) if b != footprint_bin_id]
        if not candidate_ids:
            raise ValueError("at least two bins are required to select more than one footprint")
        candidate_IDs = [bin_footprint_IDs[b] for b in candidate_ids]
        footprint_bin_id = candidate_ids[int(np.argmax(candidate_IDs))]

    IDs = np.array(IDs).transpose()
    return IDs
def compute_subspace_interaction_measure(bin_map, curr, data, bins_right_bounds, dim_maxes):
    """Return the difference between the two heuristic footprints of ``curr``.

    Calls ``compute_ID_footprints`` with two footprints and the seed
    heuristic enabled and returns the accumulated absolute difference between
    footprint 0 and footprint 1.

    :return: tuple ``(difference, None)``; the second element is always None.
    """
    footprint_total = 2
    _, _, pairwise_diffs = compute_ID_footprints(bin_map, curr, data, dim_maxes, footprint_total,
                                                 bins_right_bounds, seed_heuristic=True)
    # A disabled alternative counted consecutive bins whose footprint IDs
    # sort in the same order; only the pairwise difference is used now.
    return pairwise_diffs[0][1], None