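"""Exploratory plots for interaction distance (ID) computation variants.

Summary inferred from the code below: generate cube / XOR data, bin the
current dimension by equal frequency, then compare plain, smoothed,
bin-width-rescaled, fractal, and fractal-calibrated IDs at the bin borders,
together with their ID thresholds and subspace scores.
"""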
import cjs
import data_generator as dg
import pandas as pd
import math
import interaction_distance as id
import re
import matplotlib.pyplot as plt
import main
import ID_sm as idsm
from correlation_measures.binning import Binning
import numpy as np
import fractal_interaction_distance as fr
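
# cjs, data_generator, interaction_distance, main, ID_sm,
# correlation_measures.binning and fractal_interaction_distance are
# project-local modules (presumably from the ipd_extended project whose
# path appears in a comment below); `interaction_distance` is aliased
# to `id`, which shadows the builtin of the same name.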


def ID_computation_improvements():
    cube_rows = 6000
    # data_gen = dg.CubesGenerator(8, 0, 2, 'bla')
    # data_gen.add_cube_parameter(dg.CubeParameters(int(cube_rows / 2)))
    # data_gen.add_cube_parameter(dg.CubeParameters(int(cube_rows / 2),
    #                                               {0: [-0.5, 1], 1: [-0.5, 1],
    #                                                2: [-0.6, 1], 3: [-0.7, 1],
    #                                                4: [0.5, 1], 5: [-0.5, 1],
    #                                                6: [0.3, 1], 7: [-0.3, 1]}))
    data_gen = dg.produce_cube_generator(8, 0, 1, 'c', 1, 'bla', cube_rows)
    # data_gen = dg.produce_xor_generator(3, 0, 'bla')
    # data_gen = dg.XorGenerator(3, 0, 2, cube_rows, 0.1, 'bla')
    data = pd.DataFrame(data_gen.build()[0])
    # data = pd.read_csv("/Users/tatyanadembelova/Documents/study/thesis/ipd_extended/new_cubes/cubes_08_01_c.csv",
    #                    header=None, delimiter=";")
    # data = data.loc[:, :7]
    print(data.shape)
    # data_gen = dg.XorGenerator(3, 0, 2, cube_rows * 3, 0.1, 'bla')
    # data = pd.DataFrame(data_gen.build()[0])
    dim_maxes = data.max(0)
    curr = 0  # the dimension under inspection
    init_bins_count = int(math.ceil(math.sqrt(data.shape[0])))  # ceil as in original IPD
    print('init_bins_count', init_bins_count)
    # init_bins_count = int(math.pow(cube_rows * 2, 0.6)) * 2
    # init_bins_count = int(math.ceil(math.pow(data.shape[0], 0.4)))
    binning = Binning(data, curr, init_bins_count)
    bin_map = binning.equal_frequency_binning_by_rank()
    dist_bins = bin_map.cat.categories
    # recover the data value at the right border of each bin from the interval label
    curr_points = [data.loc[binning.rank_data[binning.rank_data[curr] == math.floor(
        float(re.search(r', (-*\d+\.*\d*e*[+-]*\d*)', dist_bins[i]).group(1)))].index.tolist()[0], curr]
                   for i in range(len(dist_bins) - 1)]
    # widths of consecutive bins, padded with the (assumed) outer data borders -2 and 2
    bin_widths = [j - i for i, j in zip([-2] + curr_points, curr_points + [2])]
    print('max bin width', max(bin_widths))
    min_width = min(bin_widths)
    print('min bin width', min_width)
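    # Rescaling idea used below: multiply each ID by
    # min_width / min(width of the two adjacent bins), a factor <= 1, so IDs
    # sitting between two wide bins are damped while IDs next to the narrowest
    # bin keep their full value (see the standalone sketch after this function).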
    IDs = id.compute_IDs(bin_map, curr, data, dim_maxes)
    new_IDs = [ID * float(min_width / min(bin_widths[i], bin_widths[i + 1])) for i, ID in enumerate(IDs)]
    # IDs = cjs.compute_CJSs(bin_map, curr, data, dim_maxes)
    # four panels: plain IDs, rescaled IDs, then the same pair recomputed with
    # the extra flag; each with its ID threshold as a red horizontal line
    ax = plt.subplot(141)
    # ax.set_ylim([0, 400])
    plt.plot(curr_points, IDs)
    # plt.scatter(curr_points, IDs)
    plt.plot(curr_points, [id.compute_ID_threshold(curr_points, IDs)] * len(curr_points), color='r')
    ax = plt.subplot(142)
    # ax.set_ylim([0, 400])
    plt.plot(curr_points, new_IDs)
    # plt.scatter(curr_points, new_IDs)
    plt.plot(curr_points, [id.compute_ID_threshold(curr_points, new_IDs)] * len(curr_points), color='r')
    IDs = id.compute_IDs(bin_map, curr, data, dim_maxes, True)  # True: presumably the smoothed variant
    new_IDs = [ID * float(min_width / min(bin_widths[i], bin_widths[i + 1]))
               for i, ID in enumerate(IDs)]
    plt.subplot(143)
    plt.plot(curr_points, IDs)
    # plt.scatter(curr_points, IDs)
    plt.plot(curr_points, [id.compute_ID_threshold(curr_points, IDs)] * len(curr_points), color='r')
    plt.subplot(144)
    plt.plot(curr_points, new_IDs)
    # plt.scatter(curr_points, new_IDs)
    plt.plot(curr_points, [id.compute_ID_threshold(curr_points, new_IDs)] * len(curr_points), color='r')
    plt.show()
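

# A minimal, self-contained sketch of the bin-width rescaling applied twice in
# ID_computation_improvements() (hypothetical helper, not part of the original
# pipeline): each ID lies on the border between bins i and i + 1 and is scaled
# by the globally narrowest width over the narrower of its two adjacent bins.
def rescale_ids_by_bin_width(IDs, bin_widths):
    # expects len(bin_widths) == len(IDs) + 1
    min_width = min(bin_widths)
    return [ID * min_width / min(bin_widths[i], bin_widths[i + 1])
            for i, ID in enumerate(IDs)]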


# global state for the plot grid used by naive_IDs(): each call to plot()
# draws into the next cell
plot_rows = 3
plot_cols = 5
plot_id = 1


def plot(curr_points, IDs, scatter=False, scaled_x=True):
    global plot_id
    # with scaled_x=False, plot against the bin index instead of the data value
    curr_points = [i for i in range(len(IDs))] if not scaled_x else curr_points
    ax = plt.subplot(plot_rows, plot_cols, plot_id)
    if scatter:
        plt.scatter(curr_points, IDs, s=0.5)
        # for p in curr_points:
        #     plt.vlines(p, -2, 2)
    else:
        # ax.set_ylim([-0.01, 0.12])
        plt.plot(curr_points, IDs)
        # plt.scatter(curr_points, IDs)
        plt.plot(curr_points, [id.compute_ID_threshold(curr_points, IDs)] * len(curr_points), color='r')
    plot_id += 1
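
# Usage note (inferred): each run_for_subspace() call below fills one row of
# the 3 x 5 grid -- three ID curves (plain, fractal, fractal-calibrated)
# followed by two scatter plots (raw data and rank data against the partner
# dimension).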


def naive_IDs():
    cube_rows = 6000
    # data_gen = dg.produce_cube_generator(3, 3, 1, 'c', 1, 'bla', cube_rows)
    data_gen = dg.produce_xor_generator(3, 3, 'bla', distribution='uniform', rows=10000)
    subspaces = data_gen.subspaces
    print(subspaces)
    subspace_map = main.get_map_from_subspace_set(subspaces)
    data = pd.DataFrame(data_gen.build()[0])
    dim_maxes = data.max(0)
    # init_bins_count = int(math.pow(cube_rows * 2, 0.6)) * 2
    # init_bins_count = int(math.ceil(math.pow(data.shape[0], 0.4)))
    init_bins_count = int(math.ceil(math.sqrt(data.shape[0])))  # ceil as in original IPD
    print('init_bins_count', init_bins_count)
    curr = 0
    print('discretization', data_gen.perf_disc[curr])
    binning = Binning(data, curr, init_bins_count)
    # bin_map = binning.equal_frequency_binning_by_rank()
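
    # run_for_subspace: one experiment row -- compute plain / fractal /
    # fractal-calibrated IDs of curr against the given subspace, plot them,
    # and print the balID and ID scores for comparison.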
    def run_for_subspace(binning, curr_subspace, new_dim_maxes, new_data):
        bin_map = binning.equal_frequency_binning_by_rank()
        dist_bins = bin_map.cat.categories
        # recover the data value at the right border of each bin from the interval label
        curr_points = [data.loc[binning.rank_data[binning.rank_data[curr] == math.floor(
            float(re.search(r', (-*\d+\.*\d*e*[+-]*\d*)', dist_bins[i]).group(1)))].index.tolist()[0], curr]
                       for i in range(len(dist_bins) - 1)]
        # curr_points = [float(re.search(r', (-*\d+\.*\d*e*[+-]*\d*)', dist_bins[i]).group(1))
        #                for i in range(len(dist_bins) - 1)]
        data_wo_curr = new_data.copy()
        if curr in data_wo_curr.columns:
            data_wo_curr.pop(curr)
        IDs = id.compute_IDs1(bin_map, data_wo_curr, new_dim_maxes)
        # bin_widths = [j - i for i, j in zip([-2] + curr_points, curr_points + [2])]
        # min_width = min(bin_widths)
        # smoothed_IDs = id.compute_IDs(bin_map, curr, new_data, new_dim_maxes, curr_points, True)
        scaled_x = True
        plot(curr_points, IDs, scaled_x=scaled_x)
        # plot(curr_points, smoothed_IDs, scaled_x=scaled_x)
        # bin_map = binning.equal_width_binning()
        # dist_bins = bin_map.cat.categories
        # curr_points = [float(re.search(r', (-*\d+\.*\d*e*[+-]*\d*)', dist_bins[i]).group(1))
        #                for i in range(len(dist_bins) - 1)]
        # smoothed_IDs = id.compute_IDs(bin_map, curr, new_data, new_dim_maxes, True)
        fractal_IDs = fr.compute_fractalIDs1(bin_map, data_wo_curr, new_dim_maxes)
        fractal_calibrated_IDs = fr.compute_fractal_calibratedIDs1(bin_map, data_wo_curr, new_dim_maxes)
        # bin_width_IDs = [ID * float(min_width / min(bin_widths[i], bin_widths[i + 1]))
        #                  for i, ID in enumerate(fractal_IDs)]
        plot(curr_points, fractal_IDs, scaled_x=scaled_x)
        plot(curr_points, fractal_calibrated_IDs, scaled_x=scaled_x)
        plot(data[curr], data[curr_subspace[1]], True)
        plot(binning.rank_data[curr], data[curr_subspace[1]], True)
        print('balID score ID', idsm.compute_ID_subspace_set_score(curr_points, IDs)[0])
        print('balID score fractal_IDs', idsm.compute_ID_subspace_set_score(curr_points, fractal_IDs)[0])
        print('balID score fractal_calibrated_IDs',
              idsm.compute_ID_subspace_set_score(curr_points, fractal_calibrated_IDs)[0])

        def IDscoreID(IDs):
            # fraction of IDs that fall below the average ID
            avg = np.average(IDs)
            return sum(1 for ID in IDs if ID < avg) / len(IDs)

        print('ID score ID', IDscoreID(IDs))
        print('ID score fractal_IDs', IDscoreID(fractal_IDs))
        print('ID score fractal_calibrated_IDs', IDscoreID(fractal_calibrated_IDs))
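
    # Three runs, one grid row each: the full interacting subspace of curr,
    # curr paired with a single interacting dimension, and curr paired with a
    # completely irrelevant dimension (column 5).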
    curr_subspace = list(subspace_map[curr])
    curr_subspace.append(curr)
    print('curr_subspace', curr_subspace)
    new_data = data.copy().loc[:, curr_subspace]
    new_dim_maxes = dim_maxes[curr_subspace]
    run_for_subspace(binning, curr_subspace, new_dim_maxes, new_data)

    curr_subspace = [curr, subspace_map[curr].pop()]
    print('curr_subspace', curr_subspace)
    new_data = data.copy().loc[:, curr_subspace]
    new_dim_maxes = dim_maxes[curr_subspace]
    run_for_subspace(binning, curr_subspace, new_dim_maxes, new_data)

    # # irrelevant feature from another interaction
    # subspace = subspaces[0]
    # if curr in subspace:
    #     subspace = subspaces[1]
    # irrelevant = subspace_map[subspace[0]].pop()
    # curr_subspace = [curr, irrelevant]
    # print('curr_subspace', curr_subspace)
    # new_data = data.copy().loc[:, curr_subspace]
    # new_dim_maxes = dim_maxes[curr_subspace]
    # run_for_subspace(binning, curr_subspace, new_dim_maxes, new_data)
    # # plot(data[curr], data[curr_subspace[1]], True, False)

    # completely irrelevant feature
    curr_subspace = [curr, 5]
    print('curr_subspace', curr_subspace)
    new_data = data.copy().loc[:, curr_subspace]
    new_dim_maxes = dim_maxes[curr_subspace]
    run_for_subspace(binning, curr_subspace, new_dim_maxes, new_data)
    # plot(data[curr], data[curr_subspace[1]], True, False)

    plt.show()


if __name__ == '__main__':
    # ID_computation_improvements()
    naive_IDs()