Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ipd_extended/ID_sm.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
206 lines (190 sloc)
8.31 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import experiments_logging as log | |
import pandas as pd | |
import interaction_distance as id | |
import data_generator as dg | |
import matplotlib.pyplot as plt | |
import math | |
import re | |
from correlation_measures.binning import Binning | |
def evidence_ID():
    """Empirically inspect interaction-distance (ID) values for a "cube"
    interaction placed in the middle of a dimension's range.

    Builds an 8000x2 dataset where the first 4000 rows of column 0 are
    confined to [1, 2] (the interacting band) while the rest of the data is
    uniform background.  After sorting by column 0 it prints the ID between
    the two extreme slices (rows 0:400 vs. 7600:8000) and between two
    adjacent slices (rows 0:400 vs. 400:800), then plots both slice pairs.

    Side effects: prints two ID values and opens two plots via
    ``log.plot_data_2d``; returns ``None``.

    NOTE(review): earlier experiment variants (no interaction; cube in the
    top part of the range; 100-run averaging) were removed as dead
    commented-out code — recover them from version control if needed.
    """
    # Interacting band in column 0, uniform background below it.
    b = np.matrix(np.random.uniform(1, 2, (4000, 1)))
    back = np.matrix(np.random.uniform(0, 2, (4000, 1)))
    res = np.append(b, back, axis=0)
    # Column 1: a band shifted to the middle of the range, plus background.
    b1 = np.matrix(np.random.uniform(0.2, 1.2, (4000, 1)))
    back1 = np.matrix(np.random.uniform(0, 2, (4000, 1)))
    res1 = np.append(b1, back1, axis=0)
    # Renamed from `all` — the original shadowed the builtin of that name.
    combined = np.append(res, res1, axis=1)
    df = pd.DataFrame(combined)
    df = df.sort_values(by=0).reset_index(drop=True)
    # ID between the two extreme slices of the sorted data.
    ID = id.compute_ID(df.loc[:400, 1].to_frame(), df.loc[7600:8000, 1].to_frame(), [0, 2])
    print(ID)
    # ID between two adjacent slices, for comparison.
    ID = id.compute_ID(df.loc[:400, 1].to_frame(), df.loc[400:800, 1].to_frame(), [0, 2])
    print(ID)
    log.plot_data_2d(pd.concat([df.loc[:400], df.loc[7600:8000]], axis=0))
    log.plot_data_2d(pd.concat([df.loc[:400], df.loc[400:800]], axis=0))

# evidence_ID()
# exit(1)
def balance_ids(dim_x, ids):
    """Resample an (x, ID) curve onto a roughly uniform x-spacing.

    Computes an average step ``b`` from the x range; wherever two consecutive
    x positions are farther apart than ``b``, intermediate points are inserted
    with linearly interpolated ID values, so sparse regions of the curve do
    not get under-weighted when the IDs are later averaged.

    :param dim_x: x coordinates; a list, or an array-like with ``.tolist()``
    :param ids: ID values aligned with ``dim_x``; list or array-like
    :return: tuple ``(new_dim_x, new_ids)`` with interpolated points inserted
    """
    # Idiomatic type checks (was `type(x) is not list`).
    if not isinstance(ids, list):
        ids = ids.tolist()
    if not isinstance(dim_x, list):
        dim_x = dim_x.tolist()
    # NOTE(review): the last x coordinate is dropped before computing the
    # average step — presumably because the final bin is incomplete; confirm
    # against the callers (they append one x per ID window).
    dim_x = dim_x[:-1]
    new_dim_x = [dim_x[0]]
    new_ids = [ids[0]]
    # Average spacing used as the insertion threshold.
    b = (dim_x[-1] - dim_x[0]) / len(ids)
    for i, x in enumerate(dim_x[1:], start=1):
        dist = x - dim_x[i - 1]
        if dist > b:
            # Gap wider than the average step: fill it with evenly spaced
            # x positions and linearly interpolated ID values.
            add = []
            add_ids = []
            for j in range(1, int(dist / b) + 1):
                add.append(dim_x[i - 1] + b * j)
                add_ids.append((ids[i] - ids[i - 1]) * b * j / dist + ids[i - 1])
            new_ids.extend(add_ids)
            new_dim_x.extend(add)
        new_ids.append(ids[i])
        new_dim_x.append(x)
    return new_dim_x, new_ids
def reduce_ids(dim_x, ids):
    """Halve the resolution of an (x, ID) curve.

    Keeps every second point (even indices >= 2), extrapolating each kept
    value by half of the step from its predecessor.

    :param dim_x: x coordinates (indexable sequence)
    :param ids: ID values (indexable sequence)
    :return: tuple ``(reduced_x, reduced_ids)``
    """
    reduced_x = []
    for k in range(2, len(dim_x), 2):
        reduced_x.append(dim_x[k] + (dim_x[k] - dim_x[k - 1]) / 2)
    reduced_ids = []
    for k in range(2, len(ids), 2):
        reduced_ids.append(ids[k] + (ids[k] - ids[k - 1]) / 2)
    return reduced_x, reduced_ids
# def experiment1(): | |
# cg = dg.produce_cube_generator(2, 2, 1, "c", 1, ".csv") | |
# data, filname = cg.build() | |
# # print(cg.subspaces) | |
# # print(cg.perf_disc) | |
# if type(data) is not pd.DataFrame: | |
# data = pd.DataFrame(data) | |
# # f1 = plt.figure(1) | |
# # ax = f1.add_subplot(111, projection='3d') | |
# # log.build_plot_data_3d(ax, data) | |
# # fig, axes = plt.subplots(3, 2, sharex='col', sharey='row') | |
# plot_id = 0 | |
# dim_count = data.shape[1] | |
# | |
# bests = [] | |
# init_bests = [] | |
# for curr in range(dim_count - 1): | |
# dims = data.columns.tolist() | |
# dims.remove(curr) | |
# dims.remove(dim_count - 1) | |
# projected_data = data.sort_values(by=curr).reset_index() | |
# curr_index = projected_data['index'] | |
# projected_data = projected_data.loc[:, dims] | |
# rows = projected_data.shape[0] | |
# # print('curr dimension', curr) | |
# best_score = None | |
# best = None | |
# init_best_score = None | |
# init_best = None | |
# for dim in dims: | |
# counter = 0 | |
# ids = [] | |
# dim_x = [] | |
# while (True): | |
# if counter + 140 > rows: | |
# break | |
# ids.append(id.compute_ID(projected_data.loc[counter:counter + 70, dim].to_frame(), | |
# projected_data.loc[counter + 70: counter + 140, dim].to_frame(), | |
# [2] * dim_count)) | |
# dim_x.append(data.loc[curr_index.loc[counter + 70], curr]) | |
# # todo check if the smoothed binning improves results counter += 70 | |
# counter += 140 | |
# # needs data normalization todo | |
# bal_dim_x, bal_ids = balance_ids(dim_x, ids) | |
# # print('interaction with', dim) | |
# # print('average ID', np.average(ids)) | |
# # print('average balanced ID', np.average(bal_ids)) | |
# # print('under average ID', sum([1 if ID < np.average(ids) else 0 for ID in ids]) / len(ids)) | |
# score = sum([1 if ID < np.average(bal_ids) else 0 for ID in ids]) / len(ids) | |
# init_score = sum([1 if ID < np.average(ids) else 0 for ID in ids]) / len(ids) | |
# # print('under bal average ID', score) | |
# if not best_score or best_score < score: | |
# best_score = score | |
# best = dim | |
# if not init_best_score or init_best_score < init_score: | |
# init_best_score = init_score | |
# init_best = dim | |
# # ax = axes[int(plot_id / 2), int(plot_id % 2)] | |
# # ax.set_title('curr: ' + str(curr) + ', dim: ' + str(dim)) | |
# # ax.set_ylim([0, 0.2]) | |
# # ax.plot(dim_x, [np.average(bal_ids)] * len(dim_x)) | |
# # ax.plot(dim_x, ids) | |
# plot_id += 1 | |
# # print('\n') | |
# bests.append(best) | |
# init_bests.append(init_best) | |
# # print(bests) | |
# # plt.show() | |
# return bests[0] == 1 and bests[1] == 0, init_bests[0] == 1 and init_bests[1] == 0 | |
def compute_ID_subspace_set_score(curr_x, IDs):
    """Score a subspace by the share of IDs below the balanced average.

    The IDs are first rebalanced onto a uniform x-spacing (see
    ``balance_ids``) so the threshold is not biased by uneven sampling.

    :param curr_x: x coordinates associated with the ID values
    :param IDs: interaction-distance values
    :return: tuple ``(score, IDs)`` where score is the fraction of raw IDs
        strictly below the balanced average
    """
    _, bal_ids = balance_ids(curr_x, IDs)
    threshold = np.average(bal_ids)
    score = sum(1 for value in IDs if value < threshold) / len(IDs)
    return score, IDs
def compute_subspace_interaction_measure(bin_map, curr, data, curr_x, dim_maxes):
    """Compute the ID-based interaction score of ``curr`` within ``data``.

    Delegates ID computation to ``id.compute_IDs`` and the scoring to
    ``compute_ID_subspace_set_score``.

    :param bin_map: binning of the current dimension (per-row bin labels)
    :param curr: the dimension being scored; must be a column of ``data``
    :param data: the dataset as a DataFrame
    :param curr_x: x coordinates aligned with the computed IDs
    :param dim_maxes: per-dimension maxima used by the ID computation
    :return: tuple ``(score, IDs)`` as produced by
        ``compute_ID_subspace_set_score``
    :raises ValueError: if ``curr`` is not a column of ``data``
    """
    columns = data.columns.tolist()
    if curr not in columns:
        raise ValueError("no current dimension in data!")
    IDs = id.compute_IDs(bin_map, curr, data, dim_maxes)
    return compute_ID_subspace_set_score(curr_x, IDs)
# res = 0 | |
# init_res = 0 | |
# for i in range(100): | |
# ex = experiment1() | |
# res += int(ex[0]) | |
# init_res += int(ex[1]) | |
# print("bal accuracy", res / 100) | |
# print("init accuracy", init_res / 100) |