Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import numpy as np
import experiments_logging as log
import pandas as pd
import interaction_distance as id
import data_generator as dg
import matplotlib.pyplot as plt
import math
import re
from correlation_measures.binning import Binning
def evidence_ID():
    """Ad-hoc experiment probing how the interaction distance (ID) behaves
    on synthetic 2-d data.

    Builds 8000 points where the first 4000 rows form a "cube" interaction
    (both coordinates confined to sub-ranges) and the remaining 4000 rows
    are uniform background, sorts by the first coordinate, then prints the
    ID between the two extreme slices and between two adjacent slices and
    plots each compared pair of slices.

    NOTE(review): uses np.random without a fixed seed and opens plots, so
    runs are non-deterministic and interactive. `id` / `log` are the
    project's interaction_distance / experiments_logging modules.
    """
    # # no interaction
    # b = np.matrix(np.random.uniform(1, 2, (4000, 1)))
    # back = np.matrix(np.random.uniform(0, 2, (4000, 1)))
    # res = np.append(b, back, axis=0)
    # b1 = np.matrix(np.random.uniform(0, 2, (8000, 1)))
    # #
    # # either horizontal or vertical tube
    # all = np.append(b1, res, axis=1)
    # df = pd.DataFrame(all)
    # df = df.sort_values(by=0).reset_index(drop=True)
    # print(id.compute_ID(df.loc[:200, 1].to_frame(), df.loc[7800:8000, 1].to_frame(), [0, 2]))
    # log.plot_data_2d(df)
    # all = np.append(res, b1, axis=1)
    # df = pd.DataFrame(all)
    # df = df.sort_values(by=0).reset_index(drop=True)
    # print(id.compute_ID(df.loc[:200, 1].to_frame(), df.loc[7800:8000, 1].to_frame(), [0, 2]))
    # log.plot_data_2d(df)
    # # log.plot_data_2d(pd.concat([df.loc[:100], df.loc[7900:8000]], axis=0))
    # cube interaction in the middle
    # middle_id_avg = 0
    # top_id_avg = 0
    # for i in range(100):
    # First coordinate: half confined to [1, 2], half uniform background on [0, 2].
    b = np.matrix(np.random.uniform(1, 2, (4000, 1)))
    back = np.matrix(np.random.uniform(0, 2, (4000, 1)))
    res = np.append(b, back, axis=0)
    # Second coordinate: half confined to [0.2, 1.2], half background.
    b1 = np.matrix(np.random.uniform(0.2, 1.2, (4000, 1)))
    back1 = np.matrix(np.random.uniform(0, 2, (4000, 1)))
    res1 = np.append(b1, back1, axis=0)
    all = np.append(res, res1, axis=1)
    df = pd.DataFrame(all)
    # Sort by the first coordinate so row position orders the slices.
    df = df.sort_values(by=0).reset_index(drop=True)
    # print(id.compute_ID(df.loc[7800:7900, 1].to_frame(), df.loc[7900:8000, 1].to_frame(), [0, 2]))
    # middle_id_avg += id.compute_ID(df.loc[:400, 1].to_frame(), df.loc[7600:8000, 1].to_frame(), [0, 2])
    # ID between the lowest and the highest slice along dimension 0.
    ID = id.compute_ID(df.loc[:400, 1].to_frame(), df.loc[7600:8000, 1].to_frame(), [0, 2])
    print(ID)
    # ID between two adjacent slices at the low end.
    # NOTE(review): pandas .loc slicing is end-inclusive, so row 400 appears
    # in both operands here — confirm the one-row overlap is intended.
    ID = id.compute_ID(df.loc[:400, 1].to_frame(), df.loc[400:800, 1].to_frame(), [0, 2])
    print(ID)
    # log.plot_data_2d(df)
    log.plot_data_2d(pd.concat([df.loc[:400], df.loc[7600:8000]], axis=0))
    log.plot_data_2d(pd.concat([df.loc[:400], df.loc[400:800]], axis=0))
    #
    # # cube interaction in the top part
    # b = np.matrix(np.random.uniform(1, 2, (4000, 1)))
    # back = np.matrix(np.random.uniform(0, 2, (4000, 1)))
    # res = np.append(b, back, axis=0)
    # b1 = np.matrix(np.random.uniform(0.8, 1.8, (4000, 1)))
    # back1 = np.matrix(np.random.uniform(0, 2, (4000, 1)))
    # res1 = np.append(b1, back1, axis=0)
    #
    # all = np.append(res, res1, axis=1)
    # df = pd.DataFrame(all)
    # df = df.sort_values(by=0).reset_index(drop=True)
    # # print(id.compute_ID(df.loc[7800:7900, 1].to_frame(), df.loc[7900:8000, 1].to_frame(), [0, 2]))
    # top_id_avg += id.compute_ID(df.loc[:400, 1].to_frame(), df.loc[7600:8000, 1].to_frame(), [0, 2])
    # # log.plot_data_2d(df)
    # log.plot_data_2d(pd.concat([df.loc[:400], df.loc[7600:8000]], axis=0))
    # print('middle ID', middle_id_avg / 100)
    # print('top ID', top_id_avg / 100)
# evidence_ID()
# exit(1)
def balance_ids(dim_x, ids):
    """Resample an ID curve onto a roughly uniform x-grid.

    Wherever two consecutive x-positions are further apart than the average
    spacing, linearly interpolated (x, ID) points are inserted so sparse
    regions are not under-represented when the curve is averaged.

    :param dim_x: x-positions (list, or array-like exposing ``tolist()``);
        the final entry is discarded before processing.
    :param ids: ID values (list, or array-like exposing ``tolist()``).
    :return: tuple ``(new_dim_x, new_ids)`` including the inserted points.
    """
    if type(ids) is not list:
        ids = ids.tolist()
    if type(dim_x) is not list:
        dim_x = dim_x.tolist()
    xs = dim_x[:-1]  # last x is dropped by convention of the caller
    # Target spacing: total covered range divided by the number of IDs.
    step = (xs[-1] - xs[0]) / len(ids)
    out_x = [xs[0]]
    out_ids = [ids[0]]
    for i in range(1, len(xs)):
        gap = xs[i] - xs[i - 1]
        if gap > step:
            # Fill the oversized gap with evenly spaced interpolated samples.
            fill_count = int(gap / step)
            for j in range(1, fill_count + 1):
                out_x.append(xs[i - 1] + step * j)
                out_ids.append((ids[i] - ids[i - 1]) * step * j / gap + ids[i - 1])
        out_x.append(xs[i])
        out_ids.append(ids[i])
    return out_x, out_ids
def reduce_ids(dim_x, ids):
    """Halve the sampling of an ID curve.

    Keeps every even-indexed point (index 2, 4, ...), shifting each kept
    value forward by half the distance to its predecessor.

    :param dim_x: x-positions of the samples.
    :param ids: ID values of the samples.
    :return: tuple ``(reduced_x, reduced_ids)`` of the thinned-out curve.
    """
    reduced_x = []
    for i in range(2, len(dim_x), 2):
        reduced_x.append(dim_x[i] + (dim_x[i] - dim_x[i - 1]) / 2)
    reduced_ids = []
    for i in range(2, len(ids), 2):
        reduced_ids.append(ids[i] + (ids[i] - ids[i - 1]) / 2)
    return reduced_x, reduced_ids
# def experiment1():
# cg = dg.produce_cube_generator(2, 2, 1, "c", 1, ".csv")
# data, filname = cg.build()
# # print(cg.subspaces)
# # print(cg.perf_disc)
# if type(data) is not pd.DataFrame:
# data = pd.DataFrame(data)
# # f1 = plt.figure(1)
# # ax = f1.add_subplot(111, projection='3d')
# # log.build_plot_data_3d(ax, data)
# # fig, axes = plt.subplots(3, 2, sharex='col', sharey='row')
# plot_id = 0
# dim_count = data.shape[1]
#
# bests = []
# init_bests = []
# for curr in range(dim_count - 1):
# dims = data.columns.tolist()
# dims.remove(curr)
# dims.remove(dim_count - 1)
# projected_data = data.sort_values(by=curr).reset_index()
# curr_index = projected_data['index']
# projected_data = projected_data.loc[:, dims]
# rows = projected_data.shape[0]
# # print('curr dimension', curr)
# best_score = None
# best = None
# init_best_score = None
# init_best = None
# for dim in dims:
# counter = 0
# ids = []
# dim_x = []
# while (True):
# if counter + 140 > rows:
# break
# ids.append(id.compute_ID(projected_data.loc[counter:counter + 70, dim].to_frame(),
# projected_data.loc[counter + 70: counter + 140, dim].to_frame(),
# [2] * dim_count))
# dim_x.append(data.loc[curr_index.loc[counter + 70], curr])
# # todo check if the smoothed binning improves results counter += 70
# counter += 140
# # needs data normalization todo
# bal_dim_x, bal_ids = balance_ids(dim_x, ids)
# # print('interaction with', dim)
# # print('average ID', np.average(ids))
# # print('average balanced ID', np.average(bal_ids))
# # print('under average ID', sum([1 if ID < np.average(ids) else 0 for ID in ids]) / len(ids))
# score = sum([1 if ID < np.average(bal_ids) else 0 for ID in ids]) / len(ids)
# init_score = sum([1 if ID < np.average(ids) else 0 for ID in ids]) / len(ids)
# # print('under bal average ID', score)
# if not best_score or best_score < score:
# best_score = score
# best = dim
# if not init_best_score or init_best_score < init_score:
# init_best_score = init_score
# init_best = dim
# # ax = axes[int(plot_id / 2), int(plot_id % 2)]
# # ax.set_title('curr: ' + str(curr) + ', dim: ' + str(dim))
# # ax.set_ylim([0, 0.2])
# # ax.plot(dim_x, [np.average(bal_ids)] * len(dim_x))
# # ax.plot(dim_x, ids)
# plot_id += 1
# # print('\n')
# bests.append(best)
# init_bests.append(init_best)
# # print(bests)
# # plt.show()
# return bests[0] == 1 and bests[1] == 0, init_bests[0] == 1 and init_bests[1] == 0
def compute_ID_subspace_set_score(curr_x, IDs):
    """Score an ID curve by how much of it lies below its balanced average.

    :param curr_x: x-positions corresponding to the ID samples.
    :param IDs: ID values along the current dimension.
    :return: tuple ``(score, IDs)`` where ``score`` is the fraction of IDs
        strictly below the average of the gap-balanced curve.
    """
    _, bal_ids = balance_ids(curr_x, IDs)
    threshold = np.average(bal_ids)
    below = [1 if ID < threshold else 0 for ID in IDs]
    return sum(below) / len(IDs), IDs
def compute_subspace_interaction_measure(bin_map, curr, data, curr_x, dim_maxes):
    """Measure the interaction of dimension ``curr`` with the rest of the data.

    Computes interaction distances per bin via ``id.compute_IDs`` and turns
    them into a score with :func:`compute_ID_subspace_set_score`.

    :param bin_map: bin assignment for the rows of ``data`` along ``curr``.
    :param curr: the dimension (column of ``data``) under consideration.
    :param data: the full dataset as a DataFrame.
    :param curr_x: x-positions matching the computed IDs.
    :param dim_maxes: per-dimension maxima, forwarded to ``id.compute_IDs``.
    :return: tuple ``(score, IDs)`` as produced by the scorer.
    :raises ValueError: if ``curr`` is not a column of ``data``.
    """
    columns = data.columns.tolist()
    if curr not in columns:
        raise ValueError("no current dimension in data!")
    IDs = id.compute_IDs(bin_map, curr, data, dim_maxes)
    return compute_ID_subspace_set_score(curr_x, IDs)
# res = 0
# init_res = 0
# for i in range(100):
# ex = experiment1()
# res += int(ex[0])
# init_res += int(ex[1])
# print("bal accuracy", res / 100)
# print("init accuracy", init_res / 100)