Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ipd_extended/ID_sm.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
206 lines (190 sloc)
8.31 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import experiments_logging as log | |
import pandas as pd | |
import interaction_distance as id | |
import data_generator as dg | |
import matplotlib.pyplot as plt | |
import math | |
import re | |
from correlation_measures.binning import Binning | |
def evidence_ID():
    """Empirically inspect interaction-distance (ID) values for a "cube"
    interaction placed in the middle of a dimension's range.

    Builds an 8000x2 dataset where the first 4000 rows of column 0 are
    confined to [1, 2] (the interacting band) while the rest of the data is
    uniform background.  After sorting by column 0 it prints the ID between
    the two extreme slices (rows 0:400 vs. 7600:8000) and between two
    adjacent slices (rows 0:400 vs. 400:800), then plots both slice pairs.

    Side effects: prints two ID values and opens two plots via
    ``log.plot_data_2d``; returns ``None``.

    NOTE(review): earlier experiment variants (no interaction; cube in the
    top part of the range; 100-run averaging) were removed as dead
    commented-out code — recover them from version control if needed.
    """
    # Interacting band in column 0, uniform background below it.
    b = np.matrix(np.random.uniform(1, 2, (4000, 1)))
    back = np.matrix(np.random.uniform(0, 2, (4000, 1)))
    res = np.append(b, back, axis=0)
    # Column 1: a band shifted to the middle of the range, plus background.
    b1 = np.matrix(np.random.uniform(0.2, 1.2, (4000, 1)))
    back1 = np.matrix(np.random.uniform(0, 2, (4000, 1)))
    res1 = np.append(b1, back1, axis=0)
    # Renamed from `all` — the original shadowed the builtin of that name.
    combined = np.append(res, res1, axis=1)
    df = pd.DataFrame(combined)
    df = df.sort_values(by=0).reset_index(drop=True)
    # ID between the two extreme slices of the sorted data.
    ID = id.compute_ID(df.loc[:400, 1].to_frame(), df.loc[7600:8000, 1].to_frame(), [0, 2])
    print(ID)
    # ID between two adjacent slices, for comparison.
    ID = id.compute_ID(df.loc[:400, 1].to_frame(), df.loc[400:800, 1].to_frame(), [0, 2])
    print(ID)
    log.plot_data_2d(pd.concat([df.loc[:400], df.loc[7600:8000]], axis=0))
    log.plot_data_2d(pd.concat([df.loc[:400], df.loc[400:800]], axis=0))

# evidence_ID()
# exit(1)
def balance_ids(dim_x, ids):
    """Resample an (x, ID) curve onto a roughly uniform x-spacing.

    Computes an average step ``b`` from the x range; wherever two consecutive
    x positions are farther apart than ``b``, intermediate points are inserted
    with linearly interpolated ID values, so sparse regions of the curve do
    not get under-weighted when the IDs are later averaged.

    :param dim_x: x coordinates; a list, or an array-like with ``.tolist()``
    :param ids: ID values aligned with ``dim_x``; list or array-like
    :return: tuple ``(new_dim_x, new_ids)`` with interpolated points inserted
    """
    # Idiomatic type checks (was `type(x) is not list`).
    if not isinstance(ids, list):
        ids = ids.tolist()
    if not isinstance(dim_x, list):
        dim_x = dim_x.tolist()
    # NOTE(review): the last x coordinate is dropped before computing the
    # average step — presumably because the final bin is incomplete; confirm
    # against the callers (they append one x per ID window).
    dim_x = dim_x[:-1]
    new_dim_x = [dim_x[0]]
    new_ids = [ids[0]]
    # Average spacing used as the insertion threshold.
    b = (dim_x[-1] - dim_x[0]) / len(ids)
    for i, x in enumerate(dim_x[1:], start=1):
        dist = x - dim_x[i - 1]
        if dist > b:
            # Gap wider than the average step: fill it with evenly spaced
            # x positions and linearly interpolated ID values.
            add = []
            add_ids = []
            for j in range(1, int(dist / b) + 1):
                add.append(dim_x[i - 1] + b * j)
                add_ids.append((ids[i] - ids[i - 1]) * b * j / dist + ids[i - 1])
            new_ids.extend(add_ids)
            new_dim_x.extend(add)
        new_ids.append(ids[i])
        new_dim_x.append(x)
    return new_dim_x, new_ids
def reduce_ids(dim_x, ids):
    """Halve the resolution of an (x, ID) curve.

    Keeps every second point (even indices >= 2), extrapolating each kept
    value by half of the step from its predecessor.

    :param dim_x: x coordinates (indexable sequence)
    :param ids: ID values (indexable sequence)
    :return: tuple ``(reduced_x, reduced_ids)``
    """
    reduced_x = []
    for k in range(2, len(dim_x), 2):
        reduced_x.append(dim_x[k] + (dim_x[k] - dim_x[k - 1]) / 2)
    reduced_ids = []
    for k in range(2, len(ids), 2):
        reduced_ids.append(ids[k] + (ids[k] - ids[k - 1]) / 2)
    return reduced_x, reduced_ids
# def experiment1(): | |
# cg = dg.produce_cube_generator(2, 2, 1, "c", 1, ".csv") | |
# data, filname = cg.build() | |
# # print(cg.subspaces) | |
# # print(cg.perf_disc) | |
# if type(data) is not pd.DataFrame: | |
# data = pd.DataFrame(data) | |
# # f1 = plt.figure(1) | |
# # ax = f1.add_subplot(111, projection='3d') | |
# # log.build_plot_data_3d(ax, data) | |
# # fig, axes = plt.subplots(3, 2, sharex='col', sharey='row') | |
# plot_id = 0 | |
# dim_count = data.shape[1] | |
# | |
# bests = [] | |
# init_bests = [] | |
# for curr in range(dim_count - 1): | |
# dims = data.columns.tolist() | |
# dims.remove(curr) | |
# dims.remove(dim_count - 1) | |
# projected_data = data.sort_values(by=curr).reset_index() | |
# curr_index = projected_data['index'] | |
# projected_data = projected_data.loc[:, dims] | |
# rows = projected_data.shape[0] | |
# # print('curr dimension', curr) | |
# best_score = None | |
# best = None | |
# init_best_score = None | |
# init_best = None | |
# for dim in dims: | |
# counter = 0 | |
# ids = [] | |
# dim_x = [] | |
# while (True): | |
# if counter + 140 > rows: | |
# break | |
# ids.append(id.compute_ID(projected_data.loc[counter:counter + 70, dim].to_frame(), | |
# projected_data.loc[counter + 70: counter + 140, dim].to_frame(), | |
# [2] * dim_count)) | |
# dim_x.append(data.loc[curr_index.loc[counter + 70], curr]) | |
# # todo check if the smoothed binning improves results counter += 70 | |
# counter += 140 | |
# # needs data normalization todo | |
# bal_dim_x, bal_ids = balance_ids(dim_x, ids) | |
# # print('interaction with', dim) | |
# # print('average ID', np.average(ids)) | |
# # print('average balanced ID', np.average(bal_ids)) | |
# # print('under average ID', sum([1 if ID < np.average(ids) else 0 for ID in ids]) / len(ids)) | |
# score = sum([1 if ID < np.average(bal_ids) else 0 for ID in ids]) / len(ids) | |
# init_score = sum([1 if ID < np.average(ids) else 0 for ID in ids]) / len(ids) | |
# # print('under bal average ID', score) | |
# if not best_score or best_score < score: | |
# best_score = score | |
# best = dim | |
# if not init_best_score or init_best_score < init_score: | |
# init_best_score = init_score | |
# init_best = dim | |
# # ax = axes[int(plot_id / 2), int(plot_id % 2)] | |
# # ax.set_title('curr: ' + str(curr) + ', dim: ' + str(dim)) | |
# # ax.set_ylim([0, 0.2]) | |
# # ax.plot(dim_x, [np.average(bal_ids)] * len(dim_x)) | |
# # ax.plot(dim_x, ids) | |
# plot_id += 1 | |
# # print('\n') | |
# bests.append(best) | |
# init_bests.append(init_best) | |
# # print(bests) | |
# # plt.show() | |
# return bests[0] == 1 and bests[1] == 0, init_bests[0] == 1 and init_bests[1] == 0 | |
def compute_ID_subspace_set_score(curr_x, IDs):
    """Score a subspace by the share of IDs below the balanced average.

    The IDs are first rebalanced onto a uniform x-spacing (see
    ``balance_ids``) so the threshold is not biased by uneven sampling.

    :param curr_x: x coordinates associated with the ID values
    :param IDs: interaction-distance values
    :return: tuple ``(score, IDs)`` where score is the fraction of raw IDs
        strictly below the balanced average
    """
    _, bal_ids = balance_ids(curr_x, IDs)
    threshold = np.average(bal_ids)
    score = sum(1 for value in IDs if value < threshold) / len(IDs)
    return score, IDs
def compute_subspace_interaction_measure(bin_map, curr, data, curr_x, dim_maxes):
    """Compute the ID-based interaction score of ``curr`` within ``data``.

    Delegates ID computation to ``id.compute_IDs`` and the scoring to
    ``compute_ID_subspace_set_score``.

    :param bin_map: binning of the current dimension (per-row bin labels)
    :param curr: the dimension being scored; must be a column of ``data``
    :param data: the dataset as a DataFrame
    :param curr_x: x coordinates aligned with the computed IDs
    :param dim_maxes: per-dimension maxima used by the ID computation
    :return: tuple ``(score, IDs)`` as produced by
        ``compute_ID_subspace_set_score``
    :raises ValueError: if ``curr`` is not a column of ``data``
    """
    columns = data.columns.tolist()
    if curr not in columns:
        raise ValueError("no current dimension in data!")
    IDs = id.compute_IDs(bin_map, curr, data, dim_maxes)
    return compute_ID_subspace_set_score(curr_x, IDs)
# res = 0 | |
# init_res = 0 | |
# for i in range(100): | |
# ex = experiment1() | |
# res += int(ex[0]) | |
# init_res += int(ex[1]) | |
# print("bal accuracy", res / 100) | |
# print("init accuracy", init_res / 100) |