Skip to content
Permalink
b1a05852fb
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
372 lines (292 sloc) 13.7 KB
import math
import pandas as pd
import numpy as np
import constants as cst
import ID_sm as idsm
from enum import Enum
def merge_IDs(IDs1, IDs2):
    """Interleave two ID sequences, starting with ``IDs1``.

    For every position of ``IDs2`` the element of ``IDs1`` at that position is
    emitted first, followed by the element of ``IDs2``. The last element of
    ``IDs1`` is appended at the end.

    :param IDs1: indexable sequence; must be non-empty and at least as long
        as ``IDs2``.
    :param IDs2: indexable sequence.
    :return: a new list with the interleaved values.
    :raises IndexError: if ``IDs1`` is empty or shorter than ``IDs2``.
    """
    interleaved = []
    for position in range(len(IDs2)):
        interleaved.append(IDs1[position])
        interleaved.append(IDs2[position])
    interleaved.append(IDs1[-1])
    return interleaved
def compute_IDs(bin_map, curr, data, dim_maxes):
    """Compute the IDs between consecutive bins, ignoring column ``curr``.

    :param bin_map: categorical pandas Series assigning each row of ``data``
        to a bin; its categories define the bins.
    :param curr: label of the column to leave out of the computation (it is
        skipped silently when ``data`` has no such column).
    :param data: pandas DataFrame holding the points.
    :param dim_maxes: per-column maxima, indexed by column label.
    :return: numpy array with one value per pair of neighbouring bins; all
        zeros when ``data`` is empty.
    """
    categories = bin_map.cat.categories
    if data.empty:
        # Nothing to measure: one zero per pair of neighbouring bins.
        return np.zeros(len(categories) - 1)

    # Work on a separate frame so the caller's ``data`` keeps all its columns.
    if curr in data.columns:
        remaining = data.drop(columns=[curr])
    else:
        remaining = data.copy()

    return np.array(compute_IDs1(bin_map, remaining, dim_maxes))
def compute_IDs1(bin_map, data, dim_maxes):
    """Compute one ID value for every pair of neighbouring bins.

    For each bin an "inner" measure is built: the per-dimension matrices
    returned by ``compute_ID_elem(bin, bin, max)`` are multiplied elementwise,
    summed, and divided by the squared number of points. For each bin but the
    first an "inter" measure against the preceding bin is built the same way,
    scaled by 2 and divided by the product of both point counts. The value
    for a pair is ``inner[previous] - inter + inner[current]``.

    :param bin_map: categorical pandas Series assigning each row of ``data``
        to a bin; bins are visited in the order of its categories.
    :param data: pandas DataFrame holding the points (one column per
        dimension).
    :param dim_maxes: per-column maxima, indexed by column label.
    :return: numpy array of length ``number_of_bins - 1`` (empty for fewer
        than two bins).
    """
    inner_bin_measures = []
    inter_bin_measures = []
    # The rows of the previous bin are carried over between iterations
    # instead of being filtered out of ``data`` a second time.
    prev_bin_data = None
    for binn in bin_map.cat.categories:
        bin_data = data.loc[bin_map == binn]
        points_count = bin_data.shape[0]
        has_prev = prev_bin_data is not None

        inner_prod_matrix = np.ones([points_count, points_count])
        if has_prev:
            prev_points_count = prev_bin_data.shape[0]
            inter_prod_matrix = np.ones([points_count, prev_points_count])

        # product elements for each dimension
        for dim in bin_data:
            inner_elem = compute_ID_elem(bin_data[dim], bin_data[dim], dim_maxes[dim])
            inner_prod_matrix = np.multiply(inner_prod_matrix, inner_elem)
            if has_prev:
                inter_elem = compute_ID_elem(bin_data[dim], prev_bin_data[dim], dim_maxes[dim])
                inter_prod_matrix = np.multiply(inter_prod_matrix, inter_elem)

        inner_bin_measures.append(np.sum(inner_prod_matrix) / points_count ** 2)
        if has_prev:
            inter_bin_measures.append(
                2 * np.sum(inter_prod_matrix) / (points_count * prev_points_count))
        prev_bin_data = bin_data

    inner = np.array(inner_bin_measures)
    inter = np.array(inter_bin_measures)
    # inner[c] - inter[c] + inner[c + 1] for every neighbouring pair c.
    return inner[:-1] - inter + inner[1:]
def compute_point_IDs1(data):
    """Compute an ID value between each pair of consecutive points.

    Every point gets an "inner" term: the product over dimensions of
    ``column_max - coordinate``. Every point after the first also gets an
    "inter" term against its predecessor: the product over dimensions of
    ``column_max - max(coordinate, predecessor_coordinate)``. The result for
    a pair is ``(inner[previous] - inter + inner[current]) ** 0.5``.

    :param data: sequence of points (rows), each a sequence of coordinates;
        column maxima are taken with ``np.max(data, 0)``.
    :return: numpy array with one value per consecutive pair of points.
    """
    # NOTE(review): unlike compute_IDs1, the inter term is not doubled here --
    # confirm this is intended.
    dim_maxes = np.max(data, 0)

    previous = data[0]
    volume = 1
    for axis, value in enumerate(previous):
        volume = volume * (dim_maxes[axis] - value)
    own_terms = [volume]
    shared_terms = []

    for current in data[1:]:
        shared = 1
        own = 1
        # product elements for each dimension
        for axis, value in enumerate(current):
            ceiling = dim_maxes[axis]
            own = own * (ceiling - value)
            shared = shared * (ceiling - max(value, previous[axis]))
        own_terms.append(own)
        shared_terms.append(shared)
        previous = current

    return np.array([
        pow(own_terms[pair] - shared + own_terms[pair + 1], 0.5)
        for pair, shared in enumerate(shared_terms)
    ])
def compute_l1norm_distances1(data):
    """Return the L1 (sum of absolute differences) distance between each
    point and the point preceding it.

    :param data: indexable sequence of points, each a sequence of coordinates.
    :return: numpy array with ``len(data) - 1`` distances.
    """
    distances = []
    for position, current in enumerate(data[1:], start=1):
        previous = data[position - 1]
        total = sum([abs(value - previous[axis]) for axis, value in enumerate(current)])
        distances.append(total)
    return np.array(distances)
def compute_running_avg(data, k):
    """Smooth ``data`` with a running mean over windows of ``k`` rows.

    With ``half = k / 2``: the first ``half`` outputs are the mean of the
    first ``half`` rows, the last ``half`` outputs are the mean of the last
    ``half`` rows, and every row index ``i`` in between yields the mean of
    ``data[i - half : i + half]``. Means are taken along axis 0.

    :param data: sliceable sequence of rows.
    :param k: window length; must be even.
    :return: numpy array of the smoothed rows.
    :raises AssertionError: if ``k`` is odd.
    """
    assert k % 2 == 0
    half = int(k / 2)

    # Leading edge: a single mean repeated ``half`` times.
    head_mean = np.mean(data[:half], 0)
    smoothed = [head_mean] * half

    # Interior: window of ``k`` rows starting ``half`` rows before the index.
    for center, _ in enumerate(data[half:-half], start=half):
        smoothed.append(np.mean(data[center - half: center + half], 0))

    # Trailing edge: a single mean repeated ``half`` times.
    tail_mean = np.mean(data[-half:], 0)
    smoothed.extend([tail_mean] * half)

    return np.array(smoothed)
def compute_calibrated_point_IDs1(data):
    """Compute a "calibrated" ID value between consecutive points.

    Works like ``compute_point_IDs1`` but, for every point after the first,
    each per-dimension factor is reduced by
    ``shift = column_max - max(coordinate, predecessor_coordinate)``.
    The first point's inner term is computed without any shift.

    :param data: sequence of points (rows), each a sequence of coordinates;
        column maxima are taken with ``np.max(data, 0)``.
    :return: numpy array with one value per consecutive pair of points.
    """
    dim_maxes = np.max(data, 0)

    previous = data[0]
    volume = 1
    for axis, value in enumerate(previous):
        volume = volume * (dim_maxes[axis] - value)
    own_terms = [volume]
    shared_terms = []

    for current in data[1:]:
        shared = 1
        own = 1
        # product elements for each dimension
        for axis, value in enumerate(current):
            ceiling = dim_maxes[axis]
            upper = max(value, previous[axis])
            shift = ceiling - upper
            own = own * (ceiling - value - shift)
            # NOTE(review): this factor is ``(ceiling - upper) - shift`` with
            # ``shift == ceiling - upper``, i.e. zero for finite inputs, so
            # the shared term vanishes -- confirm this is intended.
            shared = shared * (ceiling - upper - shift)
        own_terms.append(own)
        shared_terms.append(shared)
        previous = current

    return np.array([
        pow(own_terms[pair] - shared + own_terms[pair + 1], 0.5)
        for pair, shared in enumerate(shared_terms)
    ])
def compute_calibrated_IDs1(bin_map, data, dim_maxes):
    """Compute a "calibrated" ID value for every pair of neighbouring bins.

    For each pair (previous bin, current bin) and each dimension, both bins
    are shifted by ``dim_max - max(largest value in either bin)`` before the
    per-dimension matrices are obtained from ``compute_ID_elem``. The inner
    measures of both bins and the doubled inter measure are then combined as
    ``sqrt(inner_current - inter + inner_previous)``.

    :param bin_map: categorical pandas Series assigning each row of ``data``
        to a bin; bins are visited in the order of its categories.
    :param data: pandas DataFrame holding the points.
    :param dim_maxes: per-column maxima, indexed by column label.
    :return: numpy array of length ``number_of_bins - 1``.
    """
    categories = bin_map.cat.categories
    current_terms = []
    previous_terms = []
    cross_terms = []

    # prepare previous bin data
    previous = data.loc[bin_map == categories[0]]
    for label in categories[1:]:
        current = data.loc[bin_map == label]
        n_current = current.shape[0]
        n_previous = previous.shape[0]

        cross = np.ones([n_current, n_previous])
        own = np.ones([n_current, n_current])
        previous_own = np.ones([n_previous, n_previous])

        # product elements for each dimension
        for dim in current:
            ceiling = dim_maxes[dim]
            # Shift both bins so their joint maximum lands on the ceiling.
            shift = ceiling - max(max(current[dim]), max(previous[dim]))
            current_shifted = current[dim] + shift
            previous_shifted = previous[dim] + shift

            own = np.multiply(
                own, compute_ID_elem(current_shifted, current_shifted, ceiling))
            previous_own = np.multiply(
                previous_own, compute_ID_elem(previous_shifted, previous_shifted, ceiling))
            cross = np.multiply(
                cross, compute_ID_elem(current_shifted, previous_shifted, ceiling))

        current_terms.append(np.sum(own) / n_current ** 2)
        previous_terms.append(np.sum(previous_own) / n_previous ** 2)
        cross_terms.append(2 * np.sum(cross) / (n_current * n_previous))
        previous = current

    return np.array([
        math.pow(own_term - cross_term + previous_term, 0.5)
        for own_term, cross_term, previous_term
        in zip(current_terms, cross_terms, previous_terms)
    ])
def compute_doubled_bins_IDs(bin_map, data, dim_maxes, swallow_first=False):
    """Compute ID values between neighbouring *pairs* of bins.

    Bins are walked in category order and merged two at a time; the measures
    are then computed between consecutive merged bins exactly as in
    ``compute_IDs1``. A trailing bin without a partner is ignored.

    :param bin_map: categorical pandas Series assigning each row of ``data``
        to a bin.
    :param data: pandas DataFrame holding the points.
    :param dim_maxes: per-column maxima, indexed by column label.
    :param swallow_first: when true, the first bin is not held back waiting
        for a partner, which shifts the pairing by one bin.
    :return: numpy array with one value per pair of consecutive merged bins.
    """
    categories = bin_map.cat.categories
    own_terms = []
    cross_terms = []

    hold_next = not swallow_first   # whether the next bin only gets remembered
    held_label = None               # label waiting for its partner
    previous = None                 # merged bin produced last

    for label in categories:
        if hold_next:
            held_label = label
            hold_next = False
            continue
        hold_next = True

        # merge 2 bins
        merged = pd.concat(
            (data.loc[bin_map == held_label], data.loc[bin_map == label]), axis=0)
        n_merged = merged.shape[0]
        has_previous = previous is not None

        own = np.ones([n_merged, n_merged])
        if has_previous:
            n_previous = previous.shape[0]
            cross = np.ones([n_merged, n_previous])

        # product elements for each dimension
        for dim in merged:
            own = np.multiply(
                own, compute_ID_elem(merged[dim], merged[dim], dim_maxes[dim]))
            if has_previous:
                cross = np.multiply(
                    cross, compute_ID_elem(merged[dim], previous[dim], dim_maxes[dim]))

        own_terms.append(np.sum(own) / n_merged ** 2)
        if has_previous:
            cross_terms.append(2 * np.sum(cross) / (n_merged * n_previous))
        previous = merged

    return np.array([
        own_terms[pair] - cross_term + own_terms[pair + 1]
        for pair, cross_term in enumerate(cross_terms)
    ])
def sample(data, size):
    """Draw ``size`` random values from ``data`` using ``np.random.choice``
    with its default settings.

    :param data: 1-D array-like to draw from.
    :param size: number of values to draw.
    :return: numpy array holding the drawn values.
    """
    drawn = np.random.choice(data, size)
    return drawn
def _compute_sampled_IDs(bin_map, data, dim_maxes, dist_bins):
sample_count = 100
inner_bin_measures = []
inter_bin_measures = []
for bin_id, binn in enumerate(dist_bins):
bin_data = data.loc[bin_map == binn]
points_count = bin_data.shape[0]
sample_points_count = int(math.floor(math.sqrt(points_count)))
inner_bin_measure = 0
if bin_id > 0:
inter_bin_measure = 0
for i in range(sample_count):
prev_bin_data = None
inter_prod_matrix = None
prev_points_count = None
if bin_id > 0:
prev_bin_data = data.loc[bin_map == dist_bins[bin_id - 1]]
prev_points_count = prev_bin_data.shape[0]
prev_sample_points_count = math.floor(math.sqrt(prev_points_count))
inter_prod_matrix = np.ones([sample_points_count, prev_sample_points_count])
inner_prod_matrix = np.ones([sample_points_count, sample_points_count])
# product elements for each dimension
for dim in bin_data:
inner_elem = compute_ID_elem(sample(bin_data[dim], sample_points_count),
sample(bin_data[dim], sample_points_count), dim_maxes[dim])
inner_prod_matrix = np.multiply(inner_prod_matrix, inner_elem)
if bin_id > 0:
inter_elem = compute_ID_elem(sample(bin_data[dim], sample_points_count),
sample(prev_bin_data[dim], prev_sample_points_count), dim_maxes[dim])
inter_prod_matrix = np.multiply(inter_prod_matrix, inter_elem)
inner_bin_measure += np.sum(inner_prod_matrix) / sample_points_count ** 2
if bin_id > 0:
inter_bin_measure += 2 * np.sum(inter_prod_matrix) / (sample_points_count * prev_sample_points_count)
inner_bin_measures.append(inner_bin_measure / sample_count)
if bin_id > 0:
inter_bin_measures.append(inter_bin_measure / sample_count)
IDs = []
for c, inter_measure in enumerate(inter_bin_measures):
IDs.append(inner_bin_measures[c] - inter_measure + inner_bin_measures[c + 1])
IDs = np.array(IDs)
return IDs
def compute_ID(bin1, bin2, dim_maxes):
    """Compute the ID value between two bins.

    For each column of ``bin1`` the matrices from ``compute_ID_elem`` are
    multiplied elementwise into three running products: ``bin1`` against
    itself, ``bin2`` against itself, and ``bin1`` against ``bin2``. The result
    is ``inner(bin1) - 2 * inter(bin1, bin2) + inner(bin2)``, where each term
    is the summed product divided by the corresponding product of point
    counts.

    :param bin1: pandas DataFrame with the points of the first bin.
    :param bin2: pandas DataFrame with the points of the second bin; must
        provide every column of ``bin1``.
    :param dim_maxes: per-column maxima, indexed by column label.
    :return: the ID value as a scalar.
    """
    n_first = bin1.shape[0]
    n_second = bin2.shape[0]

    cross = np.ones([n_first, n_second])
    own_first = np.ones([n_first, n_first])
    own_second = np.ones([n_second, n_second])

    # product elements for each dimension
    for dim in bin1:
        first_col = bin1[dim]
        second_col = bin2[dim]
        ceiling = dim_maxes[dim]
        own_first = np.multiply(own_first, compute_ID_elem(first_col, first_col, ceiling))
        own_second = np.multiply(own_second, compute_ID_elem(second_col, second_col, ceiling))
        cross = np.multiply(cross, compute_ID_elem(first_col, second_col, ceiling))

    first_term = np.sum(own_first) / n_first ** 2
    second_term = np.sum(own_second) / n_second ** 2
    cross_term = 2 * np.sum(cross) / (n_first * n_second)
    return first_term - cross_term + second_term
def compute_ID_elem(bin1, bin2, dim_max):
    """Return the matrix ``dim_max - max(bin1[j1], bin2[j2])`` for one
    dimension.

    Inputs are converted with ``np.asarray`` before the outer maximum is
    taken, because pandas Series do not support ``ufunc.outer`` (pandas >= 1.0
    raises ``NotImplementedError``). Plain lists are accepted as well.

    :param bin1: 1-D array-like (numpy array, pandas Series, list) with the
        values of the first bin in this dimension.
    :param bin2: 1-D array-like with the values of the second bin.
    :param dim_max: maximum of the dimension (scalar).
    :return: float64 numpy array of shape ``(len(bin1), len(bin2))``.
    """
    first = np.asarray(bin1)
    second = np.asarray(bin2)
    # max(R^i_{j_1}, R^i_{j_2}) for every pair (j_1, j_2)
    outer_max = np.maximum.outer(first, second)
    # max_i array, kept as float64 so the result dtype does not depend on
    # the type of ``dim_max``
    max_array = np.full(outer_max.shape, dim_max, dtype=np.float64)
    # max_i - max(R^i_{j_1}, R^i_{j_2})
    return max_array - outer_max
class IDThresholdStrategy(Enum):
    """Named strategies for choosing an ID threshold.

    NOTE(review): no member is referenced in the visible part of this file;
    presumably each one selects a thresholding policy in the calling code --
    confirm against the callers.
    """
    LOW = 1
    HIGH = 2
    BALANCED_AVG = 3
    AVG = 4
    UNIFORM_1_BIN = 5
def compute_ID_threshold(IDs, dist_attr):
    """Return the ID value at the ``dist_attr`` quantile (nearest-rank rule).

    The values are sorted on a copy and the element at rank
    ``ceil(len(IDs) * dist_attr)`` (1-based) is returned. The rank is clamped
    to the first element, so a very small ``dist_attr`` yields the minimum.
    Previously the product was truncated before the ceiling was applied, so a
    product below 1 produced index ``-1`` and returned the maximum.

    :param IDs: list or numpy array of values (must offer ``copy`` and
        ``sort``); it is not modified.
    :param dist_attr: quantile, expected to lie in ``[0, 1]``.
    :return: the selected element of ``IDs``.
    :raises IndexError: if ``IDs`` is empty or ``dist_attr`` exceeds 1.
    """
    ordered = IDs.copy()
    ordered.sort()
    rank = int(math.ceil(len(ordered) * dist_attr))
    # rank is 1-based; never fall through to the negative index -1.
    return ordered[max(rank - 1, 0)]
def compute_max_ID_threshold(IDs):
    """Return the largest value in ``IDs`` without modifying the input.

    :param IDs: list or numpy array of values (must offer ``copy`` and
        ``sort``).
    :return: ``max`` of the sorted copy.
    """
    # The sort is kept on purpose: for numpy arrays it moves NaN entries to
    # the end, which affects what the builtin ``max`` returns.
    ordered = IDs.copy()
    ordered.sort()
    largest = max(ordered)
    return largest
def compute_sliding_count(IDs, ID_threshold):
    """Count, per sliding window, how many IDs exceed ``ID_threshold``.

    A window of ``cst.ID_SLIDING_WINDOW`` consecutive values is moved over
    ``IDs`` one position at a time; window number ``j`` covers
    ``IDs[j : j + cst.ID_SLIDING_WINDOW]``.

    An unused average of ``IDs`` used to be computed here and raised
    ``ZeroDivisionError`` for empty input; empty input now yields ``[]``.

    :param IDs: sliceable sequence of values.
    :param ID_threshold: values strictly greater than this are counted.
    :return: list of counts, one per window.
    """
    window = cst.ID_SLIDING_WINDOW
    # NOTE(review): ``end`` stops at ``len(IDs) - 1``, so the last element of
    # ``IDs`` never falls inside any window -- confirm this is intended.
    return [
        sum(1 for value in IDs[end - window:end] if value > ID_threshold)
        for end in range(window, len(IDs))
    ]