Skip to content
Permalink
b1a05852fb
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
227 lines (190 sloc) 9.48 KB
import time
import data_generator as dg
import pandas as pd
import math
import interaction_distance as id
import footprint as f
import re
import matplotlib.pyplot as plt
import main
from correlation_measures.binning import Binning
import numpy as np
import merging as m
import subspace_mining as sm
import constants as cst
import operator
import functools
def plot(curr_x, points, scatter=False, threshold=None, scaled_x=True, vert_lines=None):
    """Draw ``points`` into the next subplot of the module-level grid.

    The subplot is addressed through the module globals ``plot_rows``,
    ``plot_cols`` and ``plot_id``; ``plot_id`` is incremented before returning
    so that the next plotting call lands in the following cell.

    :param curr_x: x coordinates. Ignored when ``scaled_x`` is false. For a
        line plot with ``scaled_x`` true the last entry is dropped, i.e. the
        caller is expected to pass one more x value than there are points.
    :param points: y values (anything matplotlib accepts as ``y``).
    :param scatter: draw a scatter plot instead of a line plot.
    :param threshold: optional y value at which a red horizontal line is drawn
        from ``min(curr_x)`` to ``max(curr_x)``.
    :param scaled_x: when false, the x axis is simply ``0 .. len(points) - 1``.
    :param vert_lines: optional x positions of red vertical lines; only drawn
        when ``scaled_x`` is true.
    """
    global plot_id
    if not scaled_x:
        curr_x = list(range(len(points)))
    ax = plt.subplot(plot_rows, plot_cols, plot_id)
    if scatter:
        ax.scatter(curr_x, points, s=0.5)
    else:
        # Only caller-supplied x values carry the extra trailing entry; the
        # generated index axis already has exactly len(points) entries, and
        # trimming it would make x and y lengths disagree.
        line_x = curr_x[:-1] if scaled_x else curr_x
        ax.plot(line_x, points)
    if threshold is not None:
        # hlines takes data coordinates. axhline's xmin/xmax are fractions of
        # the axes width (0..1), so passing data values there is wrong.
        ax.hlines(threshold, min(curr_x), max(curr_x), colors='r')
    if scaled_x and vert_lines is not None:
        for line in vert_lines:
            ax.axvline(line, color='r')
    plot_id += 1
def plot_data_2d(data):
    """Scatter columns 0 and 1 of ``data``, coloured by the value in column 2.

    Rows whose column 2 equals 0 are drawn in green and rows equal to 1 in
    red; rows with any other value in column 2 are not drawn. The subplot is
    selected with the module globals ``plot_rows``, ``plot_cols`` and
    ``plot_id``; unlike the other plotting helpers this one does not advance
    ``plot_id``.

    :param data: anything ``pd.DataFrame`` accepts, with at least three columns.
    """
    axes = plt.subplot(plot_rows, plot_cols, plot_id)
    frame = pd.DataFrame(data)
    labels = frame[2]
    # (colour, label value) pairs, drawn in this order.
    for colour, label in (('g', 0), ('r', 1)):
        selected = labels == label
        axes.scatter(frame[0][selected], frame[1][selected], s=1, c=colour)
    plt.xlabel("dim 0")
    plt.ylabel("dim 1")
def plot_data_3d(data, cols, colored=False):
    """Draw a 3-D scatter plot of ``data`` in the next subplot of the grid.

    The subplot is selected with the module globals ``plot_rows``,
    ``plot_cols`` and ``plot_id``, and ``plot_id`` is advanced by one.

    :param data: anything ``pd.DataFrame`` accepts.
    :param cols: column labels; the first three are used as x, y and z when
        ``colored`` is false. Not used when ``colored`` is true.
    :param colored: when true, columns 0, 1 and 2 are plotted and each point
        is coloured by the value in column 3 (0 -> blue, 1 -> red, 2 -> black,
        3 -> green, 4 -> magenta); rows with other values are not drawn.
    """
    global plot_id
    frame = pd.DataFrame(data)
    axes = plt.subplot(plot_rows, plot_cols, plot_id, projection='3d')
    plot_id += 1

    if not colored:
        # Plain black scatter of the three requested columns.
        axes.scatter(frame[cols[0]], frame[cols[1]], frame[cols[2]], c='k', s=1)
    else:
        # The position in the string is the label value found in column 3.
        for label, colour in enumerate('brkgm'):
            selected = frame[3] == label
            axes.scatter(frame[0][selected], frame[1][selected], frame[2][selected],
                         c=colour, s=1)

    axes.set_xlabel('X0')
    axes.set_ylabel('X1')
    axes.set_zlabel('X2')
# Captures the number following ", " in a bin label, i.e. the right edge of an
# interval written like "(a, b]". Raw string so that \d and \. reach the regex
# engine instead of being treated as (invalid) string escapes.
_BIN_RIGHT_EDGE = re.compile(r', (-*\d+\.*\d*e*[+-]*\d*)')


def _right_bounds_of_bins(binning, dist_bins, curr):
    """Return the value of dimension ``curr`` at the right edge of every bin.

    For each label in ``dist_bins`` the right edge is parsed as a rank, the
    first row of ``binning.rank_data`` having that rank in column ``curr`` is
    located, and the value of that row is read from the module-level ``data``.
    """
    ranks = binning.rank_data
    bounds = []
    for label in dist_bins:
        right_rank = math.floor(float(_BIN_RIGHT_EDGE.search(label).group(1)))
        row = ranks[ranks[curr] == right_rank].index.tolist()[0]
        bounds.append(data.loc[row, curr])
    return bounds


def run_for_subspace(binning, curr, curr_dim_maxes, curr_data, footprints_number, new=False):
    """Compute and plot the ID footprints of dimension ``curr`` within a subspace.

    Prints the first element of the subspace interaction measure and plots
    all but the last row of the footprint IDs against the bin right bounds.

    The discretization helpers ``compute_disc`` (on ``IDs``) and
    ``compute_disc_minhash`` (on ``footprints`` / ``footprint_diffs``) can be
    applied to the intermediate results here to obtain cut points, which
    ``plot`` accepts through ``vert_lines``.

    :param binning: binning object providing ``equal_frequency_binning_by_rank()``
        and ``rank_data``.
    :param curr: label of the dimension being analysed.
    :param curr_dim_maxes: per-dimension maxima of ``curr_data``, forwarded to
        the ID / footprint computations.
    :param curr_data: the data restricted to the subspace under test.
    :param footprints_number: forwarded to ``f.compute_ID_footprints``.
    :param new: unused; kept for backward compatibility.

    NOTE(review): the bin bounds are read from the module-level ``data`` frame,
    not from ``curr_data``.
    """
    bin_map = binning.equal_frequency_binning_by_rank()
    dist_bins = bin_map.cat.categories
    bins_right_bounds = _right_bounds_of_bins(binning, dist_bins, curr)

    # NOTE(review): the result is currently unused; the call is kept because it
    # cannot be confirmed from this file that compute_IDs is free of side effects.
    IDs = id.compute_IDs(bin_map, curr, curr_data, curr_dim_maxes)

    footprints, footprint_IDs, footprint_diffs = f.compute_ID_footprints(
        bin_map, curr, curr_data, curr_dim_maxes, footprints_number, bins_right_bounds, True)

    measure = f.compute_subspace_interaction_measure(
        bin_map, curr, curr_data, bins_right_bounds, curr_dim_maxes)
    print(measure[0])

    # plot() drops the last x value itself, hence one row fewer on the y side.
    plot(bins_right_bounds, footprint_IDs[:-1, :], scaled_x=True)
def compute_disc(IDs, binning, dist_bins, bps, threshold):
    """Discretize the current dimension by dynamic merging on interaction distances.

    Derives an ID threshold from ``IDs`` and ``threshold``, runs
    ``m.dynamic_merging`` and, from the last row of the returned cost table,
    picks the discretization with the minimal cost. Relies on the module
    globals ``init_bins_count``, ``curr`` and ``data``.

    :param IDs: interaction distances, forwarded to the threshold and merging steps.
    :param binning: binning object; its ``rank_data`` is used to map the
        discretization back to points.
    :param dist_bins: the distinct bin labels.
    :param bps: break-points strategy forwarded to ``m.dynamic_merging``.
    :param threshold: forwarded to ``id.compute_ID_threshold``.
    :return: the macro intervals returned by ``main.get_discretized_points``.
    """
    cut_off = id.compute_ID_threshold(IDs, threshold)
    costs, candidates = m.dynamic_merging(cut_off, IDs, init_bins_count, bps)
    best = np.argmin(costs[-1])
    chosen = candidates[-1][best]
    macro_intervals, _macro_points = main.get_discretized_points(
        curr, data, chosen, dist_bins, binning.rank_data)
    return macro_intervals
def compute_disc_minhash(footprints, binning, dist_bins, footprint_diffs=None):
    """Discretize the current dimension by dynamic merging on footprints.

    Runs ``m.dynamic_merging_footprints`` and, from the last row of the
    returned cost table, picks the discretization with the minimal cost.
    Relies on the module globals ``init_bins_count``, ``curr`` and ``data``.

    :param footprints: per-bin footprints forwarded to the merging step.
    :param binning: binning object; its ``rank_data`` is used to map the
        discretization back to points.
    :param dist_bins: the distinct bin labels.
    :param footprint_diffs: optional value forwarded unchanged to
        ``m.dynamic_merging_footprints``.
    :return: the macro intervals returned by ``main.get_discretized_points``.
    """
    costs, candidates = m.dynamic_merging_footprints(footprints, init_bins_count, footprint_diffs)
    best = np.argmin(costs[-1])
    chosen = candidates[-1][best]
    macro_intervals, _macro_points = main.get_discretized_points(
        curr, data, chosen, dist_bins, binning.rank_data)
    return macro_intervals
# ---------------------------------------------------------------------------
# Experiment driver: generate a synthetic data set, normalise it, search for
# the best subspace of dimension `curr` and plot its ID footprints.
# ---------------------------------------------------------------------------
cube_rows = 5000
# Alternative generators, kept for quick switching:
# data_gen = dg.produce_xor_generator(3, 4, 'bla', distribution='uniform', rows=cube_rows, offset=(0, 0))
# data_gen = dg.produce_random_generator(20, 'bla', rows=cube_rows)
data_gen = dg.produce_cube_generator(4, 100, 1, 1, 'bla', cube_rows, "uniform")
subspaces = data_gen.subspaces
print('subspaces', subspaces)
build = data_gen.build()
data = pd.DataFrame(build[0])
# Alternative: load a previously generated data set instead.
# data = pd.read_csv("new_cubes/cubes_n1000_r4_i1_c1.csv", delimiter=';', header=None, na_values='?')

# The last column is split off as class labels; the remaining columns are features.
class_labels = data.pop(data.shape[1] - 1)
# Rescale every column to [-NORMALIZATION_RADIUS, NORMALIZATION_RADIUS];
# constant columns (max == min) are replaced by a column of -1.
data = data.apply(
    lambda x: 2 * cst.NORMALIZATION_RADIUS * (x - x.min()) / (x.max() - x.min()) - cst.NORMALIZATION_RADIUS
    if x.max() != x.min() else pd.Series(-np.ones(x.shape)))
dim_maxes = data.max(0)
init_bins_count = int(math.ceil(math.sqrt(data.shape[0])))  # ceil in original ipd...
print('init_bins_count', init_bins_count)

# Subplot grid shared by the plotting helpers through module globals.
plot_rows = 1
plot_cols = 1
plot_id = 1

footprints_number = 2
curr = 0  # dimension under analysis
binning = Binning(data, curr, init_bins_count)

# To run on hand-picked subspaces instead of the mined one:
# for curr_subspace in [[0, 1, 2], [0, 1, 4], [0, 4, 5]]:
#     print('curr_subspace', curr, curr_subspace)
#     curr_data = data.copy().loc[:, curr_subspace]
#     curr_dim_maxes = dim_maxes[curr_subspace]
#     run_for_subspace(binning, curr, curr_dim_maxes, curr_data, footprints_number)

bin_map = binning.equal_frequency_binning_by_rank()
dist_bins = bin_map.cat.categories
# For every bin label, parse the number after ", " as a rank, find the first
# row with that rank in dimension `curr` and take its value as the right bound.
# The pattern is a raw string so \d and \. are not treated as string escapes.
bins_right_bounds = [
    data.loc[
        binning.rank_data[
            binning.rank_data[curr] == math.floor(
                float(re.search(r', (-*\d+\.*\d*e*[+-]*\d*)', bin_label).group(1)))
        ].index.tolist()[0],
        curr]
    for bin_label in dist_bins]

topk = sm.best_first(bin_map, curr, data, bins_right_bounds, dim_maxes, 4, cst.CorrelationMeasure.FID)
print(topk)

# pandas does not accept a set as an indexer, so the subspace (best candidate
# plus the analysed dimension itself) is turned into a sorted list.
curr_subspace = sorted(topk[0].union({curr}))
print('curr_subspace', curr, curr_subspace)
curr_data = data.copy().loc[:, curr_subspace]
curr_dim_maxes = dim_maxes[curr_subspace]
run_for_subspace(binning, curr, curr_dim_maxes, curr_data, footprints_number)

# To inspect each mined dimension pairwise against dimension 0 instead:
# for curr_subspace in [[0, d] for d in topk[0]]:
#     print('curr_subspace', curr, curr_subspace)
#     curr_data = data.copy().loc[:, curr_subspace]
#     curr_dim_maxes = dim_maxes[curr_subspace]
#     run_for_subspace(binning, curr, curr_dim_maxes, curr_data, footprints_number)
plt.show()