Skip to content
Permalink
b1a05852fb
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
307 lines (257 sloc) 15.8 KB
import numpy as np
import pandas as pd
import interaction_distance as id
import correlation_measures.binning as bn
import data_generator as dg
import main
import math
import re
def _fractal_pair_measures(bin_map, data, dim_maxes):
    """Return one sub-bin-averaged measure per pair of consecutive bins.

    Shared implementation behind ``compute_fractalIDs1`` and
    ``compute_avg_fractalIDs1``.

    Every bin of ``bin_map`` is split into sub-bins along the first column of
    ``data`` (two sub-bins when ``data`` has more than one column, otherwise
    one).  For each sub-bin the per-column matrices returned by
    ``id.compute_ID_elem`` are multiplied element-wise and averaged, once for
    the sub-bin against itself ("inner") and once against the sub-bin with the
    same index that was seen most recently in an earlier bin ("inter").  The
    value for a pair of neighbouring bins is the average over sub-bins of
    ``inner(left) - inter + inner(right)``.

    Args:
        bin_map: categorical Series; ``bin_map == category`` is used as a row
            mask for ``data``.
        data: non-empty DataFrame.  Its first column drives the sub-binning;
            the remaining columns (or the only column) are the measured ones.
        dim_maxes: mapping indexed by column label, forwarded as the third
            argument of ``id.compute_ID_elem``.

    Returns:
        A list with ``len(bin_map.cat.categories) - 1`` values as produced by
        ``np.average``.
    """
    dist_bins = bin_map.cat.categories
    subdim = data.columns[0]
    if len(data.columns) > 1:
        subbins_count = 2
        new_subdims = data.columns.delete(0)
    else:
        subbins_count = 1
        new_subdims = data.columns

    inner_bin_measures = [[] for _ in dist_bins]
    inter_bin_measures = [[] for _ in range(len(dist_bins) - 1)]
    # Most recently seen data of every sub-bin index, used for the inter term.
    prev_bin_data = [None] * subbins_count

    for bin_id, binn in enumerate(dist_bins):
        bin_data = data.loc[bin_map == binn]
        subbinning = bn.Binning(bin_data, subdim, subbins_count)
        subbin_map = subbinning.equal_frequency_binning_by_rank()

        for subbin_id, subbinn in enumerate(subbin_map.cat.categories):
            subbin_data = bin_data.loc[subbin_map == subbinn, new_subdims]
            prev_subbin_data = prev_bin_data[subbin_id]
            points_count = subbin_data.shape[0]

            inner_prod_matrix = np.ones([points_count, points_count])
            inter_prod_matrix = None
            prev_points_count = None
            if prev_subbin_data is not None:
                prev_points_count = prev_subbin_data.shape[0]
                inter_prod_matrix = np.ones([points_count, prev_points_count])

            # Accumulate the product of the per-dimension elements.
            for dim in subbin_data:
                subbin_data_dim = subbin_data[dim]
                inner_elem = id.compute_ID_elem(subbin_data_dim, subbin_data_dim, dim_maxes[dim])
                inner_prod_matrix = np.multiply(inner_prod_matrix, inner_elem)
                if inter_prod_matrix is not None:
                    inter_elem = id.compute_ID_elem(subbin_data_dim, prev_subbin_data[dim], dim_maxes[dim])
                    inter_prod_matrix = np.multiply(inter_prod_matrix, inter_elem)

            inner_bin_measures[bin_id].append(np.sum(inner_prod_matrix) / points_count ** 2)
            if inter_prod_matrix is not None:
                # The inter term belongs to the pair (bin_id - 1, bin_id).
                inter_bin_measures[bin_id - 1].append(
                    2 * np.sum(inter_prod_matrix) / (points_count * prev_points_count))
            prev_bin_data[subbin_id] = subbin_data

    return [
        np.average([
            inner_bin_measures[bin_id][subbin_id] - inter + inner_bin_measures[bin_id + 1][subbin_id]
            for subbin_id, inter in enumerate(inter_measures)
        ])
        for bin_id, inter_measures in enumerate(inter_bin_measures)
    ]


def compute_fractalIDs1(bin_map, data, dim_maxes):
    """Compute the square-rooted measure between each pair of consecutive bins.

    Args:
        bin_map: categorical Series assigning a bin to every row of ``data``.
        data: DataFrame; the first column is used for sub-binning, the other
            columns (or the only column) are measured.
        dim_maxes: per-column values forwarded to ``id.compute_ID_elem``.

    Returns:
        ``np.ndarray`` of length ``len(bin_map.cat.categories) - 1``.  All
        zeros when ``data`` is empty; otherwise the square root of each value
        returned by ``compute_avg_fractalIDs1`` for the same arguments.
    """
    if data.empty:
        return np.zeros(len(bin_map.cat.categories) - 1)
    return np.array([pow(measure, 0.5) for measure in _fractal_pair_measures(bin_map, data, dim_maxes)])


def compute_avg_fractalIDs1(bin_map, data, dim_maxes):
    """Compute the sub-bin-averaged measure between each pair of consecutive bins.

    Same as ``compute_fractalIDs1`` but without taking the square root.

    Args:
        bin_map: categorical Series assigning a bin to every row of ``data``.
        data: DataFrame; the first column is used for sub-binning, the other
            columns (or the only column) are measured.
        dim_maxes: per-column values forwarded to ``id.compute_ID_elem``.

    Returns:
        ``np.ndarray`` of length ``len(bin_map.cat.categories) - 1``; all
        zeros when ``data`` is empty.
    """
    if data.empty:
        return np.zeros(len(bin_map.cat.categories) - 1)
    return np.array(_fractal_pair_measures(bin_map, data, dim_maxes))
def _calibrated_subbins(bin_data, subdim, subbins_count, new_subdims):
    """Split ``bin_data`` into sub-bins along ``subdim``, keeping only ``new_subdims``."""
    subbinning = bn.Binning(bin_data, subdim, subbins_count)
    subbin_map = subbinning.equal_frequency_binning_by_rank()
    return [bin_data.loc[subbin_map == subbinn, new_subdims]
            for subbinn in subbin_map.cat.categories]


def _calibrated_pair_measure(subbin_data, prev_subbin_data, dim_maxes):
    """Return ``inner(prev) - inter + inner(curr)`` for two shifted sub-bins.

    In every column both sub-bins are shifted by the same offset so that their
    joint maximum equals ``dim_maxes[column]`` before ``id.compute_ID_elem``
    is evaluated.
    """
    points_count = subbin_data.shape[0]
    prev_points_count = prev_subbin_data.shape[0]
    inner_prod_matrix = np.ones([points_count, points_count])
    prev_inner_prod_matrix = np.ones([prev_points_count, prev_points_count])
    inter_prod_matrix = np.ones([points_count, prev_points_count])

    # Accumulate the product of the per-dimension elements.
    for dim in subbin_data:
        dim_max = dim_maxes[dim]
        subbin_data_dim = subbin_data[dim]
        prev_bin_data_dim = prev_subbin_data[dim]
        # Vectorised Series.max() instead of iterating the Series in Python.
        diff = dim_max - max(subbin_data_dim.max(), prev_bin_data_dim.max())
        shifted = subbin_data_dim + diff
        prev_shifted = prev_bin_data_dim + diff

        inner_prod_matrix = np.multiply(
            inner_prod_matrix, id.compute_ID_elem(shifted, shifted, dim_max))
        prev_inner_prod_matrix = np.multiply(
            prev_inner_prod_matrix, id.compute_ID_elem(prev_shifted, prev_shifted, dim_max))
        inter_prod_matrix = np.multiply(
            inter_prod_matrix, id.compute_ID_elem(shifted, prev_shifted, dim_max))

    return (np.sum(prev_inner_prod_matrix) / prev_points_count ** 2
            - 2 * np.sum(inter_prod_matrix) / (points_count * prev_points_count)
            + np.sum(inner_prod_matrix) / points_count ** 2)


def compute_fractal_calibratedIDs1(bin_map, data, dim_maxes):
    """Compute the calibrated measure between each pair of consecutive bins.

    Every bin is split into sub-bins along the first column of ``data`` (four
    sub-bins when ``data`` has more than one column, otherwise one).  Each
    sub-bin is compared with the sub-bin of the same index in the preceding
    bin, and the per-sub-bin results are averaged.

    Args:
        bin_map: categorical Series; ``bin_map == category`` is used as a row
            mask for ``data``.
        data: DataFrame; the first column is used for sub-binning, the other
            columns (or the only column) are measured.
        dim_maxes: mapping indexed by column label; used as the calibration
            target and forwarded to ``id.compute_ID_elem``.

    Returns:
        ``np.ndarray`` of length ``len(bin_map.cat.categories) - 1``; all
        zeros when ``data`` is empty.
    """
    dist_bins = bin_map.cat.categories
    if data.empty:
        return np.zeros(len(dist_bins) - 1)

    subdim = data.columns[0]
    if len(data.columns) > 1:
        subbins_count = 4
        new_subdims = data.columns.delete(0)
    else:
        subbins_count = 1
        new_subdims = data.columns

    # Sub-bins of the first bin seed the "previous bin" state.
    prev_bin_data = _calibrated_subbins(
        data.loc[bin_map == dist_bins[0]], subdim, subbins_count, new_subdims)

    IDs = []
    for binn in dist_bins[1:]:
        curr_bin_data = _calibrated_subbins(
            data.loc[bin_map == binn], subdim, subbins_count, new_subdims)
        measures = []
        for subbin_id, subbin_data in enumerate(curr_bin_data):
            measures.append(
                _calibrated_pair_measure(subbin_data, prev_bin_data[subbin_id], dim_maxes))
            prev_bin_data[subbin_id] = subbin_data
        IDs.append(np.average(measures))
    return np.array(IDs)
if __name__ == '__main__':
    # Demo run: generate a data set, bin it along one column and print the
    # calibrated measures for that column's subspace.
    data_gen = dg.produce_xor_generator(4, 3, 'bla', distribution='uniform', rows=6000)
    subspaces = data_gen.subspaces
    print(subspaces)
    subspace_map = main.get_map_from_subspace_set(subspaces)
    data = pd.DataFrame(data_gen.build()[0])
    dim_maxes = data.max(0)

    init_bins_count = int(math.ceil(math.sqrt(data.shape[0])))  # ceil in original ipd...
    print('init_bins_count', init_bins_count)

    curr = 0
    print('discretization', data_gen.perf_disc[curr])
    binning = bn.Binning(data, curr, init_bins_count)
    bin_map = binning.equal_frequency_binning_by_rank()
    dist_bins = bin_map.cat.categories

    # Raw string: '\d' and '\.' are invalid escape sequences in a plain literal.
    # Captures the number following ", " in a bin label (presumably the upper
    # edge of an interval label -- verify against bn.Binning).
    edge_pattern = re.compile(r', (-*\d+\.*\d*e*[+-]*\d*)')
    # Value of column `curr` at the first row whose rank equals the floored
    # captured number, for every bin except the last one.
    # NOTE(review): curr_points is not used further down in this script.
    curr_points = []
    for bin_label in dist_bins[:-1]:
        edge_rank = math.floor(float(edge_pattern.search(bin_label).group(1)))
        matching_rows = binning.rank_data[binning.rank_data[curr] == edge_rank]
        curr_points.append(data.loc[matching_rows.index.tolist()[0], curr])

    curr_subspace = list(subspace_map[curr])
    print('curr_subspace', curr_subspace)
    new_data = data.copy().loc[:, curr_subspace]
    new_dim_maxes = dim_maxes[curr_subspace]

    IDs = compute_fractal_calibratedIDs1(bin_map, new_data, new_dim_maxes)
    print(IDs)