diff --git a/.gitignore b/.gitignore
index 4ba56d0..d1a2c17 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,8 @@
 .idea/*
 *.iml
+logs/*
+
 # Mobile Tools for Java (J2ME)
 .mtj.tmp/
diff --git a/constants.py b/constants.py
new file mode 100644
index 0000000..899f778
--- /dev/null
+++ b/constants.py
@@ -0,0 +1,19 @@
+from enum import Enum
+
+class Method(Enum):
+    ORIGINAL = 1
+    EXTENDED = 2
+
+class Correlation_measure(Enum):
+    UDS = 1
+    CMI = 2
+    MAC = 3
+
+ID_THRESHOLD_QUANTILE = 0.8
+
+NORMALIZATION_RADIUS = 1
+
+FILE_DATA_OUTPUT = "out.txt"
+FILE_DATA_CUTS = 'cut.txt'
+
+MAX_SUBSPACE_SIZE = 5
\ No newline at end of file
diff --git a/data_generation.py b/data_generation.py
new file mode 100644
index 0000000..e8417a0
--- /dev/null
+++ b/data_generation.py
@@ -0,0 +1,71 @@
+import numpy as np
+import pandas as pd
+import os.path
+
+
+def correlated_data(m, n, sigma, f):
+    l = int(n / 2)
+    Z = np.random.normal(0, 1, (m, l))
+    A = np.matrix(np.random.uniform(0, 1, (l, l)))
+    X1 = Z * A
+    B = np.matrix(np.random.uniform(0, 0.5, (l, l)))
+    W = X1 * B
+    E = np.random.normal(0, sigma, (m, l))
+    X2 = f(W) + E
+    result = np.append(X1, X2, axis=1)
+    print(result)
+    return result
+
+
+def generate_uncorrelated_data(m, n):
+    return np.random.normal(0, 1, (m, n))
+
+
+def func1(X):
+    return 2 * X + 1
+
+
+def func2(X):
+    return np.log2(np.abs(X) + 1)
+
+
+def synthetic_data_1(m, r, s, sigma=0.1):
+    r_dims = np.random.uniform(-0.5, 0.5, (m, r))
+    parity_dim = -(np.count_nonzero(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(m, 1) * np.random.uniform(0, 0.5,
+                                                                                                       (m, 1))
+    s_dims = np.random.normal(0, 1, (m, s))
+    data = np.concatenate((r_dims, parity_dim, s_dims), axis=1)
+    if sigma:
+        e = np.random.normal(0, sigma, (m, r + s + 1))
+        data = data + e
+
+    return data
+
+
+def synthetic_data_gauss(m, r, s, sigma=0.1):
+    r_dims = np.random.normal(0, 1, (m, r))
+    parity_dim = -(np.count_nonzero(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(m, 1) * np.abs(np.random.normal(0, 1,
+                                                                                                             (m, 1)))
+    s_dims = np.random.normal(0, 1, (m, s))
+    data = np.concatenate((r_dims, parity_dim, s_dims), axis=1)
+    if sigma:
+        e = np.random.normal(0, sigma, (m, r + s + 1))
+        data = data + e
+
+    return data
+
+
+def synthetic_data_0(m):
+    l = int(m / 2)
+    first = np.concatenate((np.random.uniform(-1, 0, (l, 1)), np.random.uniform(0, 1, (l, 1))), axis=1)
+    sec = np.concatenate((np.random.uniform(0, 1, (m - l, 1)), np.random.uniform(-1, 0, (m - l, 1))), axis=1)
+    return np.concatenate((first, sec), axis=0)
+
+
+if __name__ == '__main__':
+    data__ = np.concatenate((synthetic_data_1(20000, 2, 0, 0), np.zeros((20000, 1))), axis=1)
+    file = 'synthetic_data_example_20000.csv'
+
+    if os.path.isfile(file):
+        raise ValueError
+    pd.DataFrame(data__).to_csv(file, sep=';', header=False, index=False, float_format='%.2f')
diff --git a/experiments_logging.py b/experiments_logging.py
new file mode 100644
index 0000000..981f91d
--- /dev/null
+++ b/experiments_logging.py
@@ -0,0 +1,50 @@
+import matplotlib.pyplot as plt
+import numpy as np
+from mpl_toolkits.mplot3d import Axes3D
+
+def plot_data_3d(data):
+    fig = plt.figure()
+    ax = fig.add_subplot(111, projection='3d')
+
+    color_cond = {'b': np.logical_and(data[0] < 0, data[1] > 0),
+                  'g': np.logical_and(data[0] > 0, data[1] < 0),
+                  'r': np.logical_and(data[0] < 0, data[1] < 0),
+                  'c': np.logical_and(data[0] > 0, data[1] > 0),
+                  }
+    for c in color_cond:
+        ax.scatter(data[0][color_cond[c]], data[1][color_cond[c]], data[2][color_cond[c]], c=c)
+
+    ax.set_xlabel('X0')
+    ax.set_ylabel('X1')
+    ax.set_zlabel('X2')
+
+    plt.show()
+
+
+def write_out_file(name, disc_intervals, disc_points, class_labels):
+    with open(name, 'w') as out:
+        out.write('@relation DB\n\n')
+        counter = [1]
+        for i in range(len(disc_intervals)):
+            out.write(
+                '@attribute dim' + str(i) + ' {' + ','.join([str(j + counter[-1]) for j in disc_intervals[i]]) + '}\n')
+            counter.append(counter[-1] + len(disc_intervals[i]))
+        out.write('@attribute class {' + ','.join(['"' + str(i) + '"' for i in class_labels.unique()]) + '}\n\n')
+        out.write('@data\n')
+
+        for i in range(len(disc_points[0])):
+            for j in range(len(disc_points)):
+                out.write(str(disc_points[j][i] + counter[j]))
+                out.write(',')
+            out.write('"' + str(class_labels[i]) + '"\n')
+
+
+def write_cut_file(name, disc_intervals):
+    with open(name, 'w') as out:
+        for i in range(len(disc_intervals)):
+            out.write('dimension ' + str(i) + ' (' + str(len(disc_intervals[i])) + ' bins)\n')
+            for bin in disc_intervals[i]:
+                out.write(str(disc_intervals[i][bin][1]) + '\n')
+            out.write('-------------------------------------\n')
+
+
diff --git a/interaction_distance.py b/interaction_distance.py
index 029c2c0..ec2361b 100644
--- a/interaction_distance.py
+++ b/interaction_distance.py
@@ -2,55 +2,29 @@
 import numpy as np
 
 import uds
+from constants import Correlation_measure, ID_THRESHOLD_QUANTILE
 
 
 def compute_IDs(bin_map, curr, data, dist_bins, dim_maxes):
-    intra_bin_measures = []
-    inter_bin_measures = []
-
     data_wo_curr = data.copy()
     data_wo_curr.pop(curr)  # todo slow?
-    for bin_id, binn in enumerate(dist_bins):
-        bin_data = data_wo_curr.loc[bin_map == binn]
-        points_count = bin_data.shape[0]
-        prev_bin_data = None
-        inter_prod_matrix = None
-        prev_points_count = None
-        if bin_id > 0:
-            prev_bin_data = data_wo_curr.loc[bin_map == dist_bins[bin_id - 1]]
-            prev_points_count = prev_bin_data.shape[0]
-            inter_prod_matrix = np.ones([points_count, prev_points_count])
+    return _compute_IDs(bin_map, data_wo_curr, dim_maxes, dist_bins)
 
-        intra_prod_matrix = np.ones([points_count, points_count])
-        # product elements for each dimension
-        for dim in bin_data:
-            intra_elem = compute_ID_elem(bin_data[dim], bin_data[dim], dim_maxes[dim])
-            intra_prod_matrix = np.multiply(intra_prod_matrix, intra_elem)
-            if bin_id > 0:
-                inter_elem = compute_ID_elem(bin_data[dim], prev_bin_data[dim], dim_maxes[dim])
-                inter_prod_matrix = np.multiply(inter_prod_matrix, inter_elem)
 
+def compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, cor_measure, k):
+    if cor_measure == Correlation_measure.UDS:
+        subspace = uds.find_correlated_subspace(data, curr, k)
+    else:
+        raise ValueError('No implementation!')
-        intra_bin_measures.append(np.sum(intra_prod_matrix) / points_count ** 2)
 
+    data = data.copy().loc[:, subspace]
-        if bin_id > 0:
-            inter_bin_measures.append(2 * np.sum(inter_prod_matrix) / (points_count * prev_points_count))
-    IDs = []
-    for c, inter_measure in enumerate(inter_bin_measures):
-        IDs.append(intra_bin_measures[c] - inter_measure + intra_bin_measures[c + 1])
-    IDs = np.array(IDs)
-    return IDs
+    return _compute_IDs(bin_map, data, dim_maxes, dist_bins)
 
 
-def compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, k):
-    intra_bin_measures = []
+def _compute_IDs(bin_map, data, dim_maxes, dist_bins):
+    inner_bin_measures = []
     inter_bin_measures = []
-
-    data = data.copy()
-
-    data = data.loc[:, uds.find_correlated_subspace(data, curr, k)]
-    # data.pop(curr)  # todo slow?
-
     for bin_id, binn in enumerate(dist_bins):
         bin_data = data.loc[bin_map == binn]
         points_count = bin_data.shape[0]
@@ -62,23 +36,23 @@ def compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, k):
             prev_points_count = prev_bin_data.shape[0]
             inter_prod_matrix = np.ones([points_count, prev_points_count])
 
-        intra_prod_matrix = np.ones([points_count, points_count])
+        inner_prod_matrix = np.ones([points_count, points_count])
         # product elements for each dimension
         for dim in bin_data:
-            intra_elem = compute_ID_elem(bin_data[dim], bin_data[dim], dim_maxes[dim])
-            intra_prod_matrix = np.multiply(intra_prod_matrix, intra_elem)
+            inner_elem = compute_ID_elem(bin_data[dim], bin_data[dim], dim_maxes[dim])
+            inner_prod_matrix = np.multiply(inner_prod_matrix, inner_elem)
             if bin_id > 0:
                 inter_elem = compute_ID_elem(bin_data[dim], prev_bin_data[dim], dim_maxes[dim])
                 inter_prod_matrix = np.multiply(inter_prod_matrix, inter_elem)
 
-        intra_bin_measures.append(np.sum(intra_prod_matrix) / points_count ** 2)
+        inner_bin_measures.append(np.sum(inner_prod_matrix) / points_count ** 2)
 
         if bin_id > 0:
             inter_bin_measures.append(2 * np.sum(inter_prod_matrix) / (points_count * prev_points_count))
     IDs = []
     for c, inter_measure in enumerate(inter_bin_measures):
-        IDs.append(intra_bin_measures[c] - inter_measure + intra_bin_measures[c + 1])
+        IDs.append(inner_bin_measures[c] - inter_measure + inner_bin_measures[c + 1])
     IDs = np.array(IDs)
     return IDs
@@ -98,5 +72,12 @@ def compute_ID_threshold(IDs):
     IDs = IDs.copy()
     IDs.sort()
     # similar to original ipd (but possibly wrong) todo
-    return IDs[math.ceil(int(len(IDs) / 3)) - 1]
-    # return IDs[int(len(IDs) * ID_THRESHOLD_QUANTILE)]
+    # return IDs[math.ceil(int(len(IDs) / 3)) - 1]
+    return IDs[int(len(IDs) * ID_THRESHOLD_QUANTILE)]
+
+
+def compute_max_ID_threshold(IDs):
+    IDs = IDs.copy()
+    IDs.sort()
+
+    return max(IDs)
diff --git a/main.py b/main.py
index 1a00b37..9c6f19f 100644
--- a/main.py
+++ b/main.py
@@ -1,18 +1,18 @@
+import glob
 import math
-import pandas as pd
+import sys
+
+import matplotlib.pyplot as plt
 import numpy as np
-import interaction_distance as id
+import pandas as pd
 
+import constants as cst
+import data_generation as dg
+import interaction_distance as id
 from correlation_measures.binning import Binning
+from experiments_logging import write_out_file, write_cut_file
 from merging import dynamic_merging
 
-# ----------------------CONSTANTS-----------------------
-
-ID_THRESHOLD_QUANTILE = 1.0 / 3
-NORMALIZATION_RADIUS = 1
-FILE_DATA_OUTPUT = "out.txt"
-FILE_DATA_CUTS = 'cut.txt'
-MAX_SUBSPACE_SIZE = 5
 
 # ------------------------------------------------------
@@ -23,34 +23,65 @@ def find_disc_macro_id(disc_macro_intervals, point):
     raise ValueError("Micro bin is not covered by any of the macro intervals!", disc_macro_intervals, point)
 
 
-def writeOutFile(name, disc_intervals, disc_points, class_labels):
-    with open(name, 'w') as out:
-        out.write('@relation DB\n\n')
-        counter = [1]
-        for i in range(len(disc_intervals)):
-            out.write(
-                '@attribute dim' + str(i) + ' {' + ','.join([str(j + counter[-1]) for j in disc_intervals[i]]) + '}\n')
-            counter.append(counter[-1] + len(disc_intervals[i]))
-        out.write('@attribute class {' + ','.join(['"' + str(i) + '"' for i in class_labels.unique()]) + '}\n\n')
-        out.write('@data\n')
+# todo valid only in compute_optimal_discretization method!
+def write(*args):
+    log.write(' '.join([str(a) for a in args]))
+    log.write('\n')
+
+
+def id_exceeds_experiment(data, method=cst.Method.ORIGINAL, cor_measure=None):
+    # class labels are not of much use in original ipd..
+    # class_labels = data.pop(data.shape[1] - 1)
+    dim_count = data.shape[1]
 
+    # dimension maximums
+    dim_maxes = data.max(0)
 
-    for i in range(len(disc_points[0])):
-        for j in range(len(disc_points)):
-            out.write(str(disc_points[j][i] + counter[j]))
-            out.write(',')
-        out.write('"' + str(class_labels[i]) + '"\n')
+    # number of initial dist_bins
+    # todo remove later
+    # init_bins_count = 20  # ceil in original ipd...
+    init_bins_count = int(math.ceil(math.sqrt(data.shape[0])))  # ceil in original ipd...
+    write('init_bins_count:', init_bins_count)
+    write('ID_THRESHOLD_QUANTILE:', cst.ID_THRESHOLD_QUANTILE)
+
+    # normalization step todo(optional)
+
+    # data = data.apply(lambda x: 2 * cst.NORMALIZATION_RADIUS * (x - x.min()) / (
+    #     x.max() - x.min()) - cst.NORMALIZATION_RADIUS if x.max() != x.min() else pd.Series(-np.ones(x.shape)))
+
+    disc_macro_intervals = []
+    disc_points = []
+    orig_binning = Binning(data)
+    rank_data = orig_binning.get_rank_data()
+    # plt.figure(1)
 
-def writeCutFile(name, disc_intervals):
-    with open(name, 'w') as out:
-        for i in range(len(disc_intervals)):
-            out.write('dimension ' + str(i) + ' (' + str(len(disc_intervals[i])) + ' bins)\n')
-            for bin in disc_intervals[i]:
-                out.write(str(disc_intervals[i][bin][1]) + '\n')
-            out.write('-------------------------------------\n')
+    height = int(math.sqrt(dim_count))
+    width = int(math.ceil(dim_count / height))
+    fig, axes = plt.subplots(nrows=height, ncols=width, squeeze=False)
 
-def compute_optimal_discretization(data):
+    # iterate over all the dimensions
+    for curr in range(dim_count):
+        bin_map = orig_binning.equal_frequency_binning(curr, init_bins_count)
+
+        # distinct bins
+        dist_bins = bin_map.unique()
+
+        # -----------------------------INTERACTION DISTANCES----------------------------------
+
+        # for each bin along the current dimension compute inner measure B and inter measure
+        # IDs = id.compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, MAX_SUBSPACE_SIZE)
+        IDs = id.compute_IDs(bin_map, curr, data, dist_bins, dim_maxes) if method == cst.Method.ORIGINAL else \
+            id.compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, cor_measure, cst.MAX_SUBSPACE_SIZE)
+        ID_threshold = id.compute_ID_threshold(IDs)
+        write('-------------------------')
+        write('dimension:', curr)
+        write('ID_threshold:', ID_threshold)
+        write('ID exceeds:', sum([1 for i in IDs[:int(len(IDs)/2)] if i > ID_threshold]),
+              sum([1 for i in IDs[int(len(IDs)/2):] if i > ID_threshold]))
+
+
+def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure=None):
     # class labels are not of much use in original ipd..
     class_labels = data.pop(data.shape[1] - 1)
     dim_count = data.shape[1]
@@ -60,22 +91,29 @@ def compute_optimal_discretization(data):
 
     # number of initial dist_bins
     # todo remove later
     # init_bins_count = 20  # ceil in original ipd...
-    init_bins_count = int(math.ceil(math.sqrt(data.shape[0])))  # ceil in original ipd...
-    print('initBinsCount: ', init_bins_count)
+    init_bins_count = int(math.ceil(math.sqrt(data.shape[0])))  # ceil in original ipd...
+    write('init_bins_count:', init_bins_count)
+    write('ID_THRESHOLD_QUANTILE:', cst.ID_THRESHOLD_QUANTILE)
 
     # normalization step todo(optional)
 
-    # data = data.apply(lambda x: 2 * NORMALIZATION_RADIUS * (x - x.min()) / (
-    #     x.max() - x.min()) - NORMALIZATION_RADIUS if x.max() != x.min() else pd.Series(-np.ones(x.shape)))
+    # data = data.apply(lambda x: 2 * cst.NORMALIZATION_RADIUS * (x - x.min()) / (
+    #     x.max() - x.min()) - cst.NORMALIZATION_RADIUS if x.max() != x.min() else pd.Series(-np.ones(x.shape)))
 
     disc_macro_intervals = []
     disc_points = []
 
     orig_binning = Binning(data)
     rank_data = orig_binning.get_rank_data()
+    # plt.figure(1)
+
+    height = int(math.sqrt(dim_count))
+    width = int(math.ceil(dim_count / height))
+
+    fig, axes = plt.subplots(nrows=height, ncols=width, squeeze=False)
+
     # iterate over all the dimensions
     for curr in range(dim_count):
-
         bin_map = orig_binning.equal_frequency_binning(curr, init_bins_count)
 
         # distinct bins
@@ -84,29 +122,64 @@ def compute_optimal_discretization(data):
 
         # -----------------------------INTERACTION DISTANCES----------------------------------
 
         # for each bin along the current dimension compute inner measure B and inter measure
-        IDs = id.compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, MAX_SUBSPACE_SIZE)
+        # IDs = id.compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, MAX_SUBSPACE_SIZE)
+        IDs = id.compute_IDs(bin_map, curr, data, dist_bins, dim_maxes) if method == cst.Method.ORIGINAL else \
+            id.compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, cor_measure, cst.MAX_SUBSPACE_SIZE)
         ID_threshold = id.compute_ID_threshold(IDs)
-
+        pd.DataFrame(IDs).to_csv(prefix + "_IDs_" + str(curr) + ".csv")
 
         # -----------------------------OPTIMAL MERGE STRATEGY----------------------------------
 
         # table with costs, the value in i-th row and j-th column means cost of (i+1) micro bins merged into (j+1)
         # macro bins
         F, discretizations = dynamic_merging(ID_threshold, IDs, init_bins_count)
+        # pd.DataFrame(F).to_csv(prefix + "_F_" + str(curr) + ".csv")
+        pd.DataFrame([[[b[-1] for b in k[:-1]] for k in c] for c in discretizations]).to_csv(prefix + "_bp_" + str(curr) + ".csv")
+
         min_id = np.argmin(F[-1])
-        print('dimension ' + str(curr))
-        print('ID_threshold', ID_threshold)
-        print('cost ' + str(F[-1, min_id]))
 
         (curr_macro_intervals, curr_macro_points) = get_discretized_points(curr, data, discretizations, dist_bins,
                                                                            min_id, rank_data)
-        print(curr_macro_intervals)
-        print(curr_macro_points)
+        ax1 = axes[int(curr / width), int(curr % width)]
+        # ax1.hist(IDs, bins=100, color='c')
+        ax1.plot([i for i in range(len(IDs))], IDs)
+        ax1.axhline(ID_threshold, color='b', linestyle='dashed', linewidth=1)
+        ax1.set_title('dimension ' + str(curr))
+
+        # ax2 = axes[int((2*curr + 1) / width), int((2*curr + 1) % width)]
+        # ax2.plot(sorted(IDs), color='k')
+        # ax2.set_title('dimension ' + str(curr))
+
+
+        write('-------------------------')
+        write('dimension:', curr)
+        write('ID_threshold:', ID_threshold)
+        write('cost:', F[-1, min_id])
+        write('number of macrobins:', len(curr_macro_intervals))
+
+        # write('IDs', IDs)
+        write('\nIDs between the macrobins:')
+        for macro_id, macro_bin in enumerate(discretizations[-1][min_id][:-1]):
+            write("{0:.2f}".format(curr_macro_intervals[macro_id][1]) + " -", IDs[macro_bin[-1]], '[q=' +
+                  str((sorted(IDs).index(IDs[macro_bin[-1]]) + 1) / len(IDs)) + ']')
+            # ax1.axhline(IDs[macro_bin[-1]], color='r', linewidth=1)
+            ax1.plot([macro_bin[-1]], [IDs[macro_bin[-1]]], marker='o', markersize=3, color="red")
+            # ax2.axvline(IDs[macro_bin[-1]], color='r', linewidth=1)
+
+        write('\nnumber of points per macrobin:')
+        for macro_id in curr_macro_intervals:
+            write("[" + "{0:.2f}".format(curr_macro_intervals[macro_id][0]) + ",",
+                  "{0:.2f}".format(curr_macro_intervals[macro_id][1]) + "]",
+                  sum([1 for p in curr_macro_points if p == macro_id]))
+        write('\n')
 
         disc_macro_intervals.append(curr_macro_intervals)
         disc_points.append(curr_macro_points)
 
+    plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25,
+                        wspace=0.35)
+    plt.savefig(prefix + '.png', format='png')
+
     return disc_macro_intervals, disc_points, class_labels
@@ -133,10 +206,44 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_
     return disc_macro_intervals, macro_points
 
 
-data = pd.read_csv('data/crime.csv', delimiter=',', header=None).loc[:, :14]
-
-disc_intervals, disc_points, class_labels = compute_optimal_discretization(data)
-
-writeOutFile(FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)
-
-writeCutFile(FILE_DATA_CUTS, disc_intervals)
+if __name__ == "__main__":
+    sys.argv = '-f=synthetic_data_example_20000.csv -d=;'.split(' ')
+    # if len(sys.argv) < 2:
+    #     print('Usage: main.py -f= -d= -c= -m=<[original|greedy]> -cor=<[uds]>')
+    file_arg = list(filter(lambda x: x.startswith("-f="), sys.argv))
+    # if not file_arg:
+    #     raise ValueError('No data file provided!')
+    delim_arg = list(filter(lambda x: x.startswith("-d="), sys.argv))
+    columns_arg = list(filter(lambda x: x.startswith("-c="), sys.argv))
+    method_arg = list(filter(lambda x: x.startswith("-m="), sys.argv))
+    corr_measure_arg = list(filter(lambda x: x.startswith("-cor="), sys.argv))
+
+    data_file = file_arg[0].replace('-f=', '') if file_arg else None
+    delimiter = delim_arg[0].replace('-d=', '') if delim_arg else ','
+    columns = columns_arg[0].replace('-c=', '') if columns_arg else None
+    method = cst.Method[method_arg[0].replace('-m=', '').upper()] if method_arg else cst.Method.ORIGINAL
+    cor_measure = cst.Correlation_measure[corr_measure_arg[0].replace('-cor=', '').upper()] if corr_measure_arg \
+        else cst.Correlation_measure.UDS
+
+    data = pd.read_csv(data_file, delimiter=delimiter, header=None) if data_file else pd.DataFrame(
+        dg.synthetic_data_1(20000, 2, 0, 0))
+    if columns:
+        data = data.loc[:, :int(columns)]
+
+    # defining prefix for the output files
+    prefix = 'logs/' + 'id_experiment'\
+             + (data_file if data_file else 'generated') + "_" + \
+             (method.name if method == cst.Method.ORIGINAL else cor_measure.name)
+    log_files = len(glob.glob(prefix + "*.log"))
+    prefix = prefix + (str(log_files) if log_files > 0 else '')
+
+    print('output files are:', prefix + '*')
+    log_file = prefix + ".log"
+
+    with open(log_file, 'w') as log:
+
+        # disc_intervals, disc_points, class_labels = compute_optimal_discretization(data, method, cor_measure)
+        id_exceeds_experiment(data, method, cor_measure)
+        # write_out_file(prefix + cst.FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)
+        #
+        # write_cut_file(prefix + cst.FILE_DATA_CUTS, disc_intervals)
diff --git a/merging.py b/merging.py
index 4a1dd42..e3e85ac 100644
--- a/merging.py
+++ b/merging.py
@@ -15,7 +15,14 @@ def quasi_uniform_code(n):
 
 
 def break_points_number(macro_bin, IDs, ID_threshold):
-    ID_boolean = [1 if ID > ID_threshold else 0 for ID in IDs[macro_bin[:-1]]]
+    '''
+    returns count of the break points in which ID is GREATER THAN OR EQUAL TO ID_threshold
+    :param macro_bin:
+    :param IDs:
+    :param ID_threshold:
+    :return:
+    '''
+    ID_boolean = [1 if ID >= ID_threshold else 0 for ID in IDs[macro_bin[:-1]]]
     return sum(ID_boolean)
 
 
@@ -28,11 +35,11 @@ def compute_bin_cost(c, l, k, macro_bin, IDs, ID_threshold):
     break_points_size = break_points_number(macro_bin, IDs, ID_threshold)
 
     # todo in the original ipd L_disc L_N is computed for (k-1)
-    # L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
-    L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)
+    L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
+    # L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)
     # todo in the original ipd L_disc L_N is computed for (k-1)
-    # L_disc_prev = - (quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
-    L_disc_prev = - (quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
+    L_disc_prev = - (quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
+    # L_disc_prev = - (quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
 
     L_disc_M_ind = macro_bin_size_code - math.log(macro_bin_size / c, 2) * (macro_bin_size + 1)
     L_disc_M_ind_prev = - (math.log(l / c, 2) * (k - 1 + l) if l > 0 else 0)
@@ -46,7 +53,7 @@ def compute_bin_cost(c, l, k, macro_bin, IDs, ID_threshold):
 
 
 def dynamic_merging(ID_threshold, IDs, init_bins_count):
-    F = np.empty([init_bins_count, init_bins_count])
+    F = np.zeros([init_bins_count, init_bins_count])
     discretizations = []
     # compute when we merge first c initial dist_bins into 1 and #macro dist_bins k = 1
     k_ = 0
diff --git a/uds.py b/uds.py
index c3b7da8..0abedc7 100644
--- a/uds.py
+++ b/uds.py
@@ -2,9 +2,11 @@
 import pandas as pd
 import numpy as np
 
 from correlation_measures.binning import Binning
+import data_generation as dg
+from data_generation import correlated_data
 
 # bins count
-BETA = 20
+UDS_BETA = 20
 
 def compute_cond_CE(data, dim, I, point_ids):
@@ -14,7 +16,7 @@ def compute_cond_CE(data, dim, I, point_ids):
 
 # discretization of the next dimension
 def dim_optimal_disc(prev, curr, binning, I, data):
-    binned_points = binning.equal_frequency_binning2(prev, BETA)
+    binned_points = binning.equal_frequency_binning2(prev, UDS_BETA)
 
     # Series with bins support
     support = binned_points.value_counts().sort_index().cumsum()
@@ -27,7 +29,7 @@ def dim_optimal_disc(prev, curr, binning, I, data):
     # todo worth considering implementation of the ufunc in C (slow)
     f = []
     # upper bound
-    for i in range(BETA):
+    for i in range(UDS_BETA):
         f_row = []
         merged_bins_row = []
         # lower bound
@@ -42,8 +44,8 @@ def dim_optimal_disc(prev, curr, binning, I, data):
     b.append([[merged_bins_row[0]]])
     val.append([f_row[0]])
 
-    for l in range(1, BETA):
-        for i in range(l, BETA):
+    for l in range(1, UDS_BETA):
+        for i in range(l, UDS_BETA):
             min_cost = None
             arg_min = None
             for j in range(l - 1, i):
@@ -98,30 +100,6 @@ def extend_I(I, disc):
     return [d for d in disc_ if not d.empty]
 
 
-def generate_correlated_data(m, n, sigma, f):
-    l = int(n / 2)
-    Z = np.random.normal(0, 1, (m, l))
-    A = np.matrix(np.random.uniform(0, 1, (l, l)))
-    X1 = Z * A
-    B = np.matrix(np.random.uniform(0, 0.5, (l, l)))
-    W = X1 * B
-    E = np.random.normal(0, sigma, (m, l))
-    X2 = f(W) + E
-    return np.append(X1, X2, axis=1)
-
-
-def generate_uncorrelated_data(m, n):
-    return np.random.normal(0, 1, (m, n))
-
-
-def func1(X):
-    return 2 * X + 1
-
-
-def func2(X):
-    return np.log2(np.abs(X) + 1)
-
-
 def compute_uds(data):
     data = data.rename(columns={data.columns[i]: i for i in range(len(data.columns))})
     binning = Binning(data)
@@ -145,7 +123,7 @@ def compute_uds(data):
         for l, score in enumerate(scores):
             temp_I = extend_I(I, discs[l])
             temp_cost = score / CEs[dim] + entropy(temp_I, len(data)) / (
-                math.log(BETA, 2) + sum([math.log(e + 1, 2) for e in es]))
+                math.log(UDS_BETA, 2) + sum([math.log(e + 1, 2) for e in es]))
             if not opt_cost or temp_cost < opt_cost:
                 opt_cost = temp_cost
                 opt_score = score
@@ -167,7 +145,7 @@ def compute_uds(data):
 
 if __name__ == "__main__":
     # data = pd.read_csv('data/testdata.csv', delimiter=',', header=None)
-    data = pd.DataFrame(generate_correlated_data(4000, 20, 10, func2))
+    data = pd.DataFrame(correlated_data(4000, 20, 10, dg.func2))
     # data = pd.DataFrame(generate_correlated_data(1000, 10, 2, func1))
     # data = pd.DataFrame(generate_uncorrelated_data(4000, 20))
     # classLabels = data.pop(len(data.columns) - 1)
@@ -192,7 +170,7 @@ def find_correlated_subspace(data, curr, k):
     corr = []
     for dim in dims:
        uds = compute_uds(data.loc[:, [curr, dim]])
-        print(dim, uds)
+        # print(dim, uds)
        corr.append(uds)
     order = np.argsort(corr).tolist()
     order.reverse()
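
A minimal usage sketch (not part of the patch itself) showing how the pieces introduced above fit together, mirroring the calls that main.py makes; the dataset size and the dimension index are illustrative only:

    import math
    import pandas as pd

    import constants as cst
    import data_generation as dg
    import interaction_distance as id
    from correlation_measures.binning import Binning

    # small generated dataset: 2 uniform dims plus a parity dim, no noise (see data_generation.py)
    data = pd.DataFrame(dg.synthetic_data_1(1000, 2, 0, 0))
    dim_maxes = data.max(0)
    init_bins_count = int(math.ceil(math.sqrt(data.shape[0])))

    curr = 0  # dimension being discretized
    binning = Binning(data)
    bin_map = binning.equal_frequency_binning(curr, init_bins_count)
    dist_bins = bin_map.unique()

    # Method.ORIGINAL: interaction distances over all remaining dimensions
    IDs = id.compute_IDs(bin_map, curr, data, dist_bins, dim_maxes)
    # Method.EXTENDED: restrict to a UDS-correlated subspace of at most MAX_SUBSPACE_SIZE dims
    IDs_ext = id.compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes,
                                      cst.Correlation_measure.UDS, cst.MAX_SUBSPACE_SIZE)
    ID_threshold = id.compute_ID_threshold(IDs)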