Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ipd_extended/interaction_distance.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
372 lines (292 sloc)
13.7 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import pandas as pd | |
import numpy as np | |
import constants as cst | |
import ID_sm as idsm | |
from enum import Enum | |
def merge_IDs(IDs1, IDs2):
    """Interleave two ID sequences, starting with ``IDs1``.

    The result is ``[IDs1[0], IDs2[0], IDs1[1], IDs2[1], ...]`` for every
    index of ``IDs2``, followed by the last element of ``IDs1``.

    NOTE(review): presumably ``IDs1`` holds exactly one element more than
    ``IDs2``; with equal lengths the last element of ``IDs1`` appears twice.
    Confirm against the caller.

    Raises:
        IndexError: if ``IDs1`` is empty or shorter than ``IDs2``.
    """
    interleaved = []
    for position in range(len(IDs2)):
        interleaved.append(IDs1[position])
        interleaved.append(IDs2[position])
    interleaved.append(IDs1[-1])
    return interleaved
def compute_IDs(bin_map, curr, data, dim_maxes):
    """Return the interaction distances of neighbouring bins, leaving out ``curr``.

    Args:
        bin_map: categorical pandas Series (read through ``.cat``) assigning
            each row of ``data`` to a bin.
        curr: label of the column to exclude; ignored when ``data`` has no
            such column.
        data: pandas DataFrame holding the points. It is not modified.
        dim_maxes: per-column maxima, indexed by column label.

    Returns:
        ``numpy.ndarray`` of the values produced by ``compute_IDs1``. For an
        empty ``data`` frame, zeros of length ``len(categories) - 1``.
    """
    categories = bin_map.cat.categories
    if data.empty:
        return np.zeros(len(categories) - 1)

    # Work on a copy so the caller's frame keeps the ``curr`` column.
    remaining = data.copy()
    if curr in remaining.columns:
        del remaining[curr]

    # NOTE: a smoothed variant (merging two doubled-bin ID series) and a
    # rescaling of the IDs by bin width were sketched here in the past; both
    # are disabled, so the plain per-bin IDs are returned.
    return np.array(compute_IDs1(bin_map, remaining, dim_maxes))
def compute_IDs1(bin_map, data, dim_maxes):
    """Compute the interaction distance (ID) between each pair of adjacent bins.

    For every bin an "inner" measure is computed: the sum over all ordered
    pairs of its points of ``prod_dim(dim_max - max(x_dim, y_dim))``, divided
    by the squared number of points. For every bin except the first an
    "inter" measure against the preceding bin is computed the same way over
    cross pairs, divided by the product of both point counts and multiplied
    by two. The ID of bins ``c`` and ``c + 1`` is
    ``inner[c] - inter[c] + inner[c + 1]``.

    Args:
        bin_map: categorical pandas Series assigning each row of ``data`` to
            a bin; bins are visited in the order of ``bin_map.cat.categories``.
        data: pandas DataFrame whose columns are the dimensions.
        dim_maxes: per-column maxima, indexed by column label.

    Returns:
        ``numpy.ndarray`` with one value per pair of adjacent categories
        (empty when there are fewer than two categories).

    Note:
        A category without rows leads to a division by zero in the
        normalisation, exactly as before.
    """
    inner_bin_measures = []
    inter_bin_measures = []
    # The previous bin's rows are carried over between iterations instead of
    # being selected again with a second boolean mask.
    prev_bin_data = None
    for binn in bin_map.cat.categories:
        bin_data = data.loc[bin_map == binn]
        points_count = bin_data.shape[0]
        has_prev = prev_bin_data is not None
        prev_points_count = prev_bin_data.shape[0] if has_prev else None

        inner_prod_matrix = np.ones([points_count, points_count])
        inter_prod_matrix = np.ones([points_count, prev_points_count]) if has_prev else None

        # Multiply the per-dimension elements into the pairwise products.
        for dim in bin_data:
            inner_elem = compute_ID_elem(bin_data[dim], bin_data[dim], dim_maxes[dim])
            inner_prod_matrix = np.multiply(inner_prod_matrix, inner_elem)
            if has_prev:
                inter_elem = compute_ID_elem(bin_data[dim], prev_bin_data[dim], dim_maxes[dim])
                inter_prod_matrix = np.multiply(inter_prod_matrix, inter_elem)

        inner_bin_measures.append(np.sum(inner_prod_matrix) / points_count ** 2)
        if has_prev:
            inter_bin_measures.append(2 * np.sum(inter_prod_matrix) / (points_count * prev_points_count))
        prev_bin_data = bin_data

    inner = np.array(inner_bin_measures)
    inter = np.array(inter_bin_measures)
    # inner[c] - inter[c] + inner[c + 1] for every adjacent pair.
    return inner[:-1] - inter + inner[1:]
def compute_point_IDs1(data):
    """Compute a distance between each pair of consecutive points.

    Every row of ``data`` is treated as its own one-point bin. With
    ``m = max(data, axis=0)``:

    * the own measure of a point ``x`` is ``prod_dim(m - x)``;
    * the shared measure of consecutive points ``x`` and ``y`` is
      ``prod_dim(m - max(x, y))``;
    * the result for that pair is
      ``(own(x) - shared(x, y) + own(y)) ** 0.5``.

    NOTE(review): the bin-based functions in this file subtract *twice* the
    inter measure, whereas here it is subtracted once. Confirm whether that
    difference is intended.

    Args:
        data: 2-D array-like, one point per row.

    Returns:
        ``numpy.ndarray`` with ``len(data) - 1`` values.
    """
    points = np.asarray(data)
    dim_maxes = np.max(points, 0)

    own = np.prod(dim_maxes - points, axis=1)
    shared = np.prod(dim_maxes - np.maximum(points[1:], points[:-1]), axis=1)

    radicands = own[:-1] - shared + own[1:]
    return np.array([pow(value, 0.5) for value in radicands])
def compute_l1norm_distances1(data):
    """Return the L1 (Manhattan) distance between each row and its predecessor.

    Args:
        data: sequence of points, each an indexable sequence of coordinates.

    Returns:
        ``numpy.ndarray`` with ``len(data) - 1`` distances; entry ``i`` is the
        sum of absolute coordinate differences between rows ``i + 1`` and ``i``.
    """
    distances = []
    # Pair every row (from the second on) with the row right before it.
    for current, previous in zip(data[1:], data):
        distances.append(
            sum([abs(value - previous[dim]) for dim, value in enumerate(current)])
        )
    return np.array(distances)
def compute_running_avg(data, k):
    """Smooth ``data`` with a running mean over windows of ``k`` rows.

    With ``half = k / 2``:

    * the first ``half`` outputs are all the mean of the first ``half`` rows;
    * for every position ``p`` in ``half .. len(data) - half - 1`` the output
      is the mean of ``data[p - half : p + half]``;
    * the last ``half`` outputs are all the mean of the last ``half`` rows.

    Means are taken along axis 0.

    Args:
        data: sliceable sequence of rows.
        k: window size; must be even (checked with ``assert``).

    Returns:
        ``numpy.ndarray`` of the smoothed rows.
    """
    assert k % 2 == 0
    half = int(k / 2)

    # Leading edge: repeat the mean of the first half-window.
    smoothed = [np.mean(data[:half], 0)] * half

    # Interior positions.
    for center, _ in enumerate(data[half:-half], start=half):
        window = data[center - half: center + half]
        smoothed.append(np.mean(window, 0))

    # Trailing edge: repeat the mean of the last half-window.
    smoothed.extend([np.mean(data[-half:], 0)] * half)
    return np.array(smoothed)
def compute_calibrated_point_IDs1(data):
    """Compute a "calibrated" distance between consecutive points.

    With ``m = max(data, axis=0)`` and, for consecutive points ``x`` (earlier)
    and ``y`` (later), ``shift = m - max(x, y)``:

    * the own measure of the very first point is ``prod_dim(m - x0)``;
    * the own measure of every later point ``y`` is
      ``prod_dim((m - y) - shift)``, using the shift of ``y`` and its
      predecessor;
    * the shared measure is ``prod_dim(shift - shift)``;
    * the result for the pair at positions ``c`` and ``c + 1`` is
      ``(own[c] - shared[c] + own[c + 1]) ** 0.5``.

    NOTE(review): the shared measure subtracts a quantity from itself, so it
    is zero for finite input, and ``own[c]`` was calibrated against the
    *previous* pair (or not at all for the first point). Confirm that this is
    intended.

    Args:
        data: 2-D array-like, one point per row.

    Returns:
        ``numpy.ndarray`` with ``len(data) - 1`` values.
    """
    points = np.asarray(data)
    dim_maxes = np.max(points, 0)

    gaps = dim_maxes - points
    shift = dim_maxes - np.maximum(points[1:], points[:-1])

    # First row stays uncalibrated; the others are shifted by their pair's gap.
    own = np.prod(np.vstack((gaps[:1], gaps[1:] - shift)), axis=1)
    shared = np.prod(shift - shift, axis=1)

    radicands = own[:-1] - shared + own[1:]
    return np.array([pow(value, 0.5) for value in radicands])
def compute_calibrated_IDs1(bin_map, data, dim_maxes):
    """Compute "calibrated" interaction distances between adjacent bins.

    For each pair of adjacent bins and each dimension, both bins are shifted
    by ``dim_max - max(values of both bins)`` so that their joint maximum
    coincides with ``dim_max``. The inner measures of both bins and the
    doubled inter measure are then computed from the shifted values, and the
    result for the pair is ``(inner - inter + prev_inner) ** 0.5``.

    Args:
        bin_map: categorical pandas Series assigning each row of ``data`` to
            a bin; bins are visited in the order of ``bin_map.cat.categories``.
        data: pandas DataFrame whose columns are the dimensions.
        dim_maxes: per-column maxima, indexed by column label.

    Returns:
        ``numpy.ndarray`` with one value per pair of adjacent categories.

    Raises:
        ValueError: from ``math.pow`` when a radicand is negative, or from
            ``max`` when a bin has no rows.
    """
    categories = bin_map.cat.categories
    pair_measures = []

    left = data.loc[bin_map == categories[0]]
    for label in categories[1:]:
        right = data.loc[bin_map == label]
        n_right = right.shape[0]
        n_left = left.shape[0]

        cross = np.ones([n_right, n_left])
        within_right = np.ones([n_right, n_right])
        within_left = np.ones([n_left, n_left])

        for dim in right:
            dim_max = dim_maxes[dim]
            # Move both bins so that their common maximum sits at dim_max.
            shift = dim_max - max(max(right[dim]), max(left[dim]))
            shifted_right = right[dim] + shift
            shifted_left = left[dim] + shift

            within_right = np.multiply(
                within_right, compute_ID_elem(shifted_right, shifted_right, dim_max))
            within_left = np.multiply(
                within_left, compute_ID_elem(shifted_left, shifted_left, dim_max))
            cross = np.multiply(
                cross, compute_ID_elem(shifted_right, shifted_left, dim_max))

        pair_measures.append((
            np.sum(within_right) / n_right ** 2,
            2 * np.sum(cross) / (n_right * n_left),
            np.sum(within_left) / n_left ** 2,
        ))
        left = right

    return np.array([
        math.pow(inner - inter + prev_inner, 0.5)
        for inner, inter, prev_inner in pair_measures
    ])
def compute_doubled_bins_IDs(bin_map, data, dim_maxes, swallow_first=False):
    """Compute interaction distances between merged pairs of neighbouring bins.

    Categories are consumed two at a time; the rows of both are concatenated
    into one "doubled" bin. Inner and (doubled) inter measures are then
    computed for consecutive doubled bins as in ``compute_IDs1``.

    Args:
        bin_map: categorical pandas Series assigning each row of ``data`` to
            a bin.
        data: pandas DataFrame whose columns are the dimensions.
        dim_maxes: per-column maxima, indexed by column label.
        swallow_first: when true, the very first category immediately closes
            a pair whose other half is selected with ``bin_map == None``, so
            the pairing of the remaining categories is offset by one.

    Returns:
        ``numpy.ndarray`` with one value per pair of consecutive doubled bins.

    Note:
        A trailing category that finds no partner is not used.
    """
    within_measures = []
    cross_measures = []

    expecting_partner = swallow_first  # True: the next category closes a pair
    left_label = None
    previous_pair = None

    for label in bin_map.cat.categories:
        if not expecting_partner:
            # Remember the first half and wait for its partner.
            left_label = label
            expecting_partner = True
            continue
        expecting_partner = False

        # Merge the two categories into one doubled bin.
        pair = pd.concat(
            (data.loc[bin_map == left_label], data.loc[bin_map == label]), axis=0)
        n_pair = pair.shape[0]

        has_previous = previous_pair is not None
        n_previous = previous_pair.shape[0] if has_previous else None

        within = np.ones([n_pair, n_pair])
        cross = np.ones([n_pair, n_previous]) if has_previous else None

        for dim in pair:
            within = np.multiply(
                within, compute_ID_elem(pair[dim], pair[dim], dim_maxes[dim]))
            if has_previous:
                cross = np.multiply(
                    cross, compute_ID_elem(pair[dim], previous_pair[dim], dim_maxes[dim]))

        within_measures.append(np.sum(within) / n_pair ** 2)
        if has_previous:
            cross_measures.append(2 * np.sum(cross) / (n_pair * n_previous))
        previous_pair = pair

    return np.array([
        within_measures[pos] - cross_measure + within_measures[pos + 1]
        for pos, cross_measure in enumerate(cross_measures)
    ])
def sample(data, size):
    """Draw ``size`` random elements from ``data`` via ``np.random.choice``.

    The call uses NumPy's defaults, i.e. uniform sampling with replacement
    from the global NumPy random state.
    """
    drawn = np.random.choice(data, size=size)
    return drawn
def _compute_sampled_IDs(bin_map, data, dim_maxes, dist_bins, sample_count=100):
    """Estimate adjacent-bin interaction distances from random subsamples.

    For every bin, ``sample_count`` rounds are run. In each round
    ``floor(sqrt(n))`` values are drawn per dimension (with ``sample``) from
    the bin, and likewise from the preceding bin, and the inner / doubled
    inter measures are computed from those draws. The measures are averaged
    over the rounds and combined as ``inner[c] - inter[c] + inner[c + 1]``.

    NOTE(review): values are drawn independently for every dimension, so the
    coordinates of one sampled "point" generally come from different rows.
    Confirm that this is intended.

    Args:
        bin_map: pandas Series assigning each row of ``data`` to a bin.
        data: pandas DataFrame whose columns are the dimensions.
        dim_maxes: per-column maxima, indexed by column label.
        dist_bins: bin labels in the order in which they are to be compared.
        sample_count: number of sampling rounds per bin (default 100, the
            value that used to be hard-coded).

    Returns:
        ``numpy.ndarray`` with one value per pair of adjacent bins.

    Raises:
        ValueError: if ``sample_count`` is smaller than 1.
    """
    if sample_count < 1:
        raise ValueError("sample_count must be at least 1")

    inner_bin_measures = []
    inter_bin_measures = []
    # Rows of the preceding bin; selected once per bin instead of once per
    # sampling round.
    prev_bin_data = None
    for binn in dist_bins:
        bin_data = data.loc[bin_map == binn]
        sample_points_count = int(math.floor(math.sqrt(bin_data.shape[0])))

        has_prev = prev_bin_data is not None
        prev_sample_points_count = (
            int(math.floor(math.sqrt(prev_bin_data.shape[0]))) if has_prev else None
        )

        inner_bin_measure = 0
        inter_bin_measure = 0
        for _ in range(sample_count):
            inner_prod_matrix = np.ones([sample_points_count, sample_points_count])
            inter_prod_matrix = (
                np.ones([sample_points_count, prev_sample_points_count]) if has_prev else None
            )

            # Multiply the per-dimension elements of freshly drawn samples.
            for dim in bin_data:
                inner_elem = compute_ID_elem(sample(bin_data[dim], sample_points_count),
                                             sample(bin_data[dim], sample_points_count),
                                             dim_maxes[dim])
                inner_prod_matrix = np.multiply(inner_prod_matrix, inner_elem)
                if has_prev:
                    inter_elem = compute_ID_elem(sample(bin_data[dim], sample_points_count),
                                                 sample(prev_bin_data[dim], prev_sample_points_count),
                                                 dim_maxes[dim])
                    inter_prod_matrix = np.multiply(inter_prod_matrix, inter_elem)

            inner_bin_measure += np.sum(inner_prod_matrix) / sample_points_count ** 2
            if has_prev:
                inter_bin_measure += (2 * np.sum(inter_prod_matrix)
                                      / (sample_points_count * prev_sample_points_count))

        inner_bin_measures.append(inner_bin_measure / sample_count)
        if has_prev:
            inter_bin_measures.append(inter_bin_measure / sample_count)
        prev_bin_data = bin_data

    inner = np.array(inner_bin_measures)
    inter = np.array(inter_bin_measures)
    return inner[:-1] - inter + inner[1:]
def compute_ID(bin1, bin2, dim_maxes):
    """Compute the interaction distance between two bins of points.

    The value is ``inner(bin1) - inter(bin1, bin2) + inner(bin2)`` where

    * ``inner(b)`` is the sum over all ordered point pairs of ``b`` of
      ``prod_dim(dim_max - max(x_dim, y_dim))`` divided by ``len(b) ** 2``;
    * ``inter`` is twice the analogous sum over cross pairs divided by
      ``len(bin1) * len(bin2)``.

    Args:
        bin1: pandas DataFrame with the points of the first bin.
        bin2: pandas DataFrame with the points of the second bin; it is
            indexed with the column labels of ``bin1``.
        dim_maxes: per-column maxima, indexed by column label.

    Returns:
        The interaction distance as a NumPy scalar.
    """
    n1 = bin1.shape[0]
    n2 = bin2.shape[0]

    within1 = np.ones([n1, n1])
    within2 = np.ones([n2, n2])
    cross = np.ones([n1, n2])

    # Accumulate the per-dimension factors of every pairwise product.
    for dim in bin1:
        dim_max = dim_maxes[dim]
        first, second = bin1[dim], bin2[dim]
        within1 = np.multiply(within1, compute_ID_elem(first, first, dim_max))
        within2 = np.multiply(within2, compute_ID_elem(second, second, dim_max))
        cross = np.multiply(cross, compute_ID_elem(first, second, dim_max))

    inner1 = np.sum(within1) / n1 ** 2
    inner2 = np.sum(within2) / n2 ** 2
    inter = 2 * np.sum(cross) / (n1 * n2)
    return inner1 - inter + inner2
def compute_ID_elem(bin1, bin2, dim_max):
    """Return the matrix ``dim_max - max(x, y)`` for every ``x`` in ``bin1``, ``y`` in ``bin2``.

    This is the single-dimension factor of the interaction-distance
    products: ``max_i - max(R^i_{j_1}, R^i_{j_2})``.

    Args:
        bin1: 1-D values of the first bin (pandas Series, ndarray or list).
        bin2: 1-D values of the second bin.
        dim_max: maximum of the dimension.

    Returns:
        ``numpy.ndarray`` of dtype ``float64`` and shape
        ``(len(bin1), len(bin2))``.
    """
    # pandas objects do not implement the ``outer`` method of ufuncs
    # (pandas >= 1.0 raises NotImplementedError), so work on plain arrays.
    values1 = np.asarray(bin1)
    values2 = np.asarray(bin2)
    outer_max = np.maximum.outer(values1, values2)
    # Broadcasting replaces the explicit matrix filled with dim_max; the
    # float64 dtype keeps the result type of the previous implementation.
    return np.subtract(dim_max, outer_max, dtype=np.float64)
class IDThresholdStrategy(Enum):
    """Named strategies for choosing an interaction-distance threshold.

    NOTE(review): none of the members is referenced by the functions visible
    in this part of the file; their exact meaning is presumably defined by
    the code that selects a threshold. Verify against the callers.
    """

    LOW = 1
    HIGH = 2
    BALANCED_AVG = 3
    AVG = 4
    UNIFORM_1_BIN = 5
def compute_ID_threshold(IDs, dist_attr):
    """Return the ``dist_attr`` quantile of ``IDs`` (nearest-rank method).

    The IDs are sorted ascending and the element of rank
    ``ceil(len(IDs) * dist_attr)`` (1-based) is returned.

    The previous expression ``math.ceil(int(...))`` truncated before taking
    the ceiling, which made the ceiling a no-op. A product below 1 then
    produced index ``-1`` and silently returned the *largest* ID. The rank is
    now rounded up first and never drops below 1.

    Args:
        IDs: list or ``numpy.ndarray`` of IDs; it is not modified.
        dist_attr: fraction selecting the quantile, expected in ``(0, 1]``.

    Returns:
        The selected element of ``IDs``.

    Raises:
        IndexError: if ``IDs`` is empty or ``dist_attr`` is greater than 1.
    """
    ordered = IDs.copy()
    ordered.sort()
    rank = max(int(math.ceil(len(ordered) * dist_attr)), 1)
    return ordered[rank - 1]
def compute_max_ID_threshold(IDs):
    """Return the largest value of ``IDs`` without modifying the input.

    Args:
        IDs: list or ``numpy.ndarray`` of IDs.

    Returns:
        ``max`` of a sorted copy of ``IDs``.
    """
    # The maximum is taken from a sorted copy; the sort is kept so that the
    # element order seen by ``max`` is the same as before.
    ordered = IDs.copy()
    ordered.sort()
    largest = max(ordered)
    return largest
def compute_sliding_count(IDs, ID_threshold):
    """Count, per sliding window, how many IDs exceed ``ID_threshold``.

    With ``w = cst.ID_SLIDING_WINDOW``, one count is produced for every
    ``end`` in ``w .. len(IDs) - 1``; it covers the window
    ``IDs[end - w:end]``.

    An average of all IDs used to be computed here but was never used; it
    raised ``ZeroDivisionError`` for empty input and has been removed, so an
    empty ``IDs`` now yields an empty list.

    NOTE(review): because ``end`` stops at ``len(IDs) - 1``, the last element
    of ``IDs`` never falls inside a window. Confirm that this is intended.

    Args:
        IDs: sliceable sequence of IDs.
        ID_threshold: value an ID must exceed to be counted.

    Returns:
        List of integer counts, one per window.
    """
    window = cst.ID_SLIDING_WINDOW
    return [
        sum(1 for value in IDs[end - window:end] if value > ID_threshold)
        for end in range(window, len(IDs))
    ]