diff --git a/.gitignore b/.gitignore
index 32858aa..4ba56d0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 *.class
+.idea/*
+*.iml
 
 # Mobile Tools for Java (J2ME)
 .mtj.tmp/
diff --git a/example/simple15.csv b/example/simple15.csv
new file mode 100755
index 0000000..3049ff3
--- /dev/null
+++ b/example/simple15.csv
@@ -0,0 +1,15 @@
+0.660217;0.750373;0
+0.460440;0.793855;0
+0.460440;0.765323;0
+0.460440;0.730397;0
+0.460440;0.692839;0
+0.445508;0.817600;0
+0.428743;0.837934;0
+0.428743;0.878658;0
+0.428743;0.915953;0
+0.428743;0.999258;0
+0.819264;0.516152;0
+0.819264;0.523430;0
+0.559855;0.641614;0
+0.534924;0.699147;0
+0.801006;0.326723;0
\ No newline at end of file
diff --git a/interaction_distance.py b/interaction_distance.py
new file mode 100644
index 0000000..6934bbe
--- /dev/null
+++ b/interaction_distance.py
@@ -0,0 +1,60 @@
+import math
+import numpy as np
+
+def computeIDs(bin_map, curr, data, dist_bins, dim_maxes):
+    intra_bin_measures = []
+    inter_bin_measures = []
+
+    data_wo_curr = data.copy()
+    data_wo_curr.pop(curr) # todo slow?
+    for bin_id, binn in enumerate(dist_bins):
+        bin_data = data_wo_curr.loc[bin_map == binn]
+        # print(bin_data)
+        points_count = bin_data.shape[0]
+        prev_bin_data = None
+        inter_prod_matrix = None
+        prev_points_count = None
+        if bin_id > 0:
+            prev_bin_data = data_wo_curr.loc[bin_map == dist_bins[bin_id - 1]]
+            # print(prev_bin_data)
+            prev_points_count = prev_bin_data.shape[0]
+            inter_prod_matrix = np.ones([points_count, prev_points_count])
+
+        intra_prod_matrix = np.ones([points_count, points_count])
+        # product elements for each dimension
+        for dim in bin_data:
+            intra_elem = compute_ID_elem(bin_data[dim], bin_data[dim], dim_maxes[dim])
+            intra_prod_matrix = np.multiply(intra_prod_matrix, intra_elem)
+
+            if bin_id > 0:
+                inter_elem = compute_ID_elem(bin_data[dim], prev_bin_data[dim], dim_maxes[dim])
+                inter_prod_matrix = np.multiply(inter_prod_matrix, inter_elem)
+
+        intra_bin_measures.append(np.sum(intra_prod_matrix) / points_count ** 2)
+
+        if bin_id > 0:
+            inter_bin_measures.append(2 * np.sum(inter_prod_matrix) / (points_count * prev_points_count))
+    IDs = []
+    for c, inter_measure in enumerate(inter_bin_measures):
+        IDs.append(intra_bin_measures[c] - inter_measure + intra_bin_measures[c + 1])
+    IDs = np.array(IDs)
+    return IDs
+
+
+def compute_ID_elem(bin1, bin2, dim_max):
+    points_count1 = bin1.shape[0]
+    points_count2 = bin2.shape[0]
+    # max_i array
+    max_array = np.ones([points_count1, points_count2])
+    max_array.fill(dim_max)
+    # max_i - max(R^i_{j_1}, R^i_{j_2})
+    outer_max = np.maximum.outer(bin1, np.transpose(bin2))
+    return max_array - outer_max
+
+
+def compute_ID_threshold(IDs):
+    IDs = IDs.copy()
+    IDs.sort()
+    # similar to original ipd (but possibly wrong) todo
+    return IDs[math.ceil(int(len(IDs) / 3)) - 1]
+    # return IDs[int(len(IDs) * ID_THRESHOLD_QUANTILE)]
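Reviewer sketch (not part of the patch): compute_ID_elem is the building block of computeIDs; for two micro bins it returns the matrix whose (i, j) entry is dim_max - max(x_i, y_j), and the per-dimension products of these matrices become the intra- and inter-bin measures that are combined into the interaction distances. A minimal hand-made illustration, passing plain NumPy arrays instead of the pandas Series used above; all values are invented:

    import numpy as np

    from interaction_distance import compute_ID_elem

    # Two hypothetical micro bins of a single dimension, already scaled to [0, 1].
    bin1 = np.array([0.10, 0.40])
    bin2 = np.array([0.20, 0.80, 0.90])

    elem = compute_ID_elem(bin1, bin2, 1.0)
    # elem[i, j] == 1.0 - max(bin1[i], bin2[j]), e.g. elem[0, 0] == 1.0 - 0.20 == 0.80
    print(elem.shape)  # (2, 3)
    print(elem)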
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..20d41b1
--- /dev/null
+++ b/main.py
@@ -0,0 +1,143 @@
+import math
+import pandas as pd
+import numpy as np
+
+from interaction_distance import computeIDs, compute_ID_threshold
+from merging import dynamic_merging
+
+# ----------------------CONSTANTS-----------------------
+
+ID_THRESHOLD_QUANTILE = 1.0 / 3
+NORMALIZATION_RADIUS = 1
+FILE_DATA_OUTPUT = "out.txt"
+FILE_DATA_CUTS = 'cut.txt'
+
+
+# ------------------------------------------------------
+
+def find_disc_macro_id(disc_macro_intervals, point):
+    for macro in disc_macro_intervals.items():
+        if macro[1][0] <= point <= macro[1][1]:
+            return macro[0]
+    raise ValueError("Micro bin is not covered by any of the macro intervals!", disc_macro_intervals, point)
+
+
+def writeOutFile(name, disc_intervals, disc_points, class_labels):
+    with open(name, 'w') as out:
+        out.write('@relation DB\n\n')
+        counter = [1]
+        for i in range(len(disc_intervals)):
+            out.write(
+                '@attribute dim' + str(i) + ' {' + ','.join([str(j + counter[-1]) for j in disc_intervals[i]]) + '}\n')
+            counter.append(counter[-1] + len(disc_intervals[i]))
+        out.write('@attribute class {' + ','.join(['"' + str(i) + '"' for i in class_labels.unique()]) + '}\n\n')
+        out.write('@data\n')
+
+        for i in range(len(disc_points[0])):
+            for j in range(len(disc_points)):
+                out.write(str(disc_points[j][i] + counter[j]))
+                out.write(',')
+            out.write('"' + str(class_labels[i]) + '"\n')
+
+
+def writeCutFile(name, disc_intervals):
+    with open(name, 'w') as out:
+        for i in range(len(disc_intervals)):
+            out.write('dimension ' + str(i) + ' (' + str(len(disc_intervals[i])) + ' bins)\n')
+            for bin in disc_intervals[i]:
+                out.write(str(disc_intervals[i][bin][1]) + '\n')
+            out.write('-------------------------------------\n')
+
+
+def compute_optimal_discretization(data):
+    # class labels are not of much use in original ipd..
+    class_labels = data.pop(data.shape[1] - 1)
+    dim_count = data.shape[1]
+    # dimension maximums
+    dim_maxes = data.max(0)
+
+    # number of initial dist_bins
+    # initBinsCount = int(math.ceil(math.sqrt(row_count))) # ceil in original ipd...
+    # todo remove later
+    initBinsCount = 20 # ceil in original ipd...
+    print('initBinsCount: ', initBinsCount)
+
+    # normalization step todo(optional)
+
+    # data = data.apply(lambda x: 2 * NORMALIZATION_RADIUS * (x - x.min()) / (
+    #     x.max() - x.min()) - NORMALIZATION_RADIUS if x.max() != x.min() else pd.Series(-np.ones(x.shape)))
+
+    disc_macro_intervals = []
+    disc_points = []
+    rank_data = data.rank(method='first')
+    # iterate over all the dimensions
+    for curr in range(dim_count):
+        # original ids -> ranked_ids in the current dimension
+        rank_data = rank_data.sort_values(by=curr)
+
+        # todo (small reminder) in the original ipd it is NOT equal binning
+        # Series of binned points
+        bin_map = pd.qcut(rank_data[curr], initBinsCount)
+
+        # distinct bins
+        dist_bins = bin_map.drop_duplicates().values
+
+        # -----------------------------INTERACTION DISTANCES----------------------------------
+
+        # for each bin along the current dimension compute inner measure B and inter measure
+        IDs = computeIDs(bin_map, curr, data, dist_bins, dim_maxes)
+        ID_threshold = compute_ID_threshold(IDs)
+        print('ID_threshold', ID_threshold)
+
+        # -----------------------------OPTIMAL MERGE STRATEGY----------------------------------
+
+        # todo replace by empty method later
+        # table with costs, the value in i-th row and j-th column means cost of (i+1) micro bins merged into (j+1) macro bins
+        F, discretizations = dynamic_merging(ID_threshold, IDs, initBinsCount)
+
+        print('dimension ' + str(curr))
+        min_id = np.argmin(F[-1])
+        print('cost ' + str(F[-1, min_id]))
+
+        (curr_macro_intervals, curr_macro_points) = get_discretized_points(curr, data, discretizations, dist_bins,
+                                                                           min_id, rank_data)
+
+        print(curr_macro_intervals)
+        print(curr_macro_points)
+
+        disc_macro_intervals.append(curr_macro_intervals)
+        disc_points.append(curr_macro_points)
+
+    return disc_macro_intervals, disc_points, class_labels
+
+
+def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_data):
+    disc_macro_intervals = dict()
+    for i, macro_bin in enumerate(discretizations[-1][min_id]):
+        macro_interval = []
+        for micro_bin_id in macro_bin:
+            right = \
+                data.loc[rank_data[rank_data[curr] == math.floor(dist_bins[micro_bin_id].right)][curr].index[0]][curr]
+            if not len(macro_interval):
+                macro_interval.append(
+                    data.loc[rank_data[rank_data[curr] == math.ceil(dist_bins[micro_bin_id].left)][curr].index[0]][
+                        curr])
+                macro_interval.append(right)
+            else:
+                macro_interval[1] = right
+        disc_macro_intervals[i] = macro_interval
+
+    macro_points = []
+    for point in data.iterrows():
+        macro_points.append(find_disc_macro_id(disc_macro_intervals, point[1][curr]))
+
+    return (disc_macro_intervals, macro_points)
+
+
+data = pd.read_csv('example/simple.csv', delimiter=';', header=None)
+
+disc_intervals, disc_points, class_labels = compute_optimal_discretization(data)
+
+writeOutFile(FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)
+
+writeCutFile(FILE_DATA_CUTS, disc_intervals)
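Reviewer note (not part of the patch): for orientation, the ARFF-style file written by writeOutFile has the shape sketched below, constructed by hand for a hypothetical result with two macro bins in dimension 0 and three in dimension 1 (the counter offsets make the bin ids global across dimensions). This is an illustration of the format, not actual program output:

    @relation DB

    @attribute dim0 {1,2}
    @attribute dim1 {3,4,5}
    @attribute class {"0","1"}

    @data
    1,4,"0"
    2,5,"1"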
diff --git a/merging.py b/merging.py
new file mode 100644
index 0000000..1310c42
--- /dev/null
+++ b/merging.py
@@ -0,0 +1,78 @@
+import math
+import pandas as pd
+import numpy as np
+from scipy.special import comb
+
+QUASI_UNIFORM_CODE_INITIAL_NUMBER = 2.865064
+
+def quasi_uniform_code(n):
+    l = 0
+    while n > 1:
+        n = math.log(n, 2)
+        l += n
+    return l + math.log(QUASI_UNIFORM_CODE_INITIAL_NUMBER, 2)
+
+
+def break_points_number(macro_bin, IDs, ID_threshold):
+    ID_boolean = [1 if ID > ID_threshold else 0 for ID in IDs[macro_bin[:-1]]]
+    return sum(ID_boolean)
+
+def compute_bin_cost(c, l, k, macro_bin, IDs, ID_threshold):
+    macro_bin_size = len(macro_bin)
+    if macro_bin_size != c - l:
+        raise ValueError(str(c) + " != " + str(l))
+
+    macro_bin_size_code = quasi_uniform_code(macro_bin_size)
+    break_points_size = break_points_number(macro_bin, IDs, ID_threshold)
+
+    # todo in the original ipd L_disc L_N is computed for (k-1)
+    # L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
+    L_disc = quasi_uniform_code(k-1) + math.log(comb(c - 1, k - 1), 2)
+    # todo in the original ipd L_disc L_N is computed for (k-1)
+    # L_disc_prev = - (quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
+    L_disc_prev = - (quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
+
+    L_disc_M_ind = macro_bin_size_code - math.log(macro_bin_size / c, 2) * (macro_bin_size + 1)
+    L_disc_M_ind_prev = - (math.log(l / c, 2) * (k - 1 + l) if l > 0 else 0)
+
+    L_disc_M_mh = quasi_uniform_code(break_points_size) + math.log(macro_bin_size - 1, 2) * break_points_size \
+        if break_points_size > 0 else 0
+
+    L_errors = math.log(macro_bin_size, 2) * macro_bin_size
+
+    return L_disc + L_disc_M_ind + L_disc_M_mh + L_errors + L_disc_prev + L_disc_M_ind_prev
+
+
+def dynamic_merging(ID_threshold, IDs, initBinsCount):
+    F = np.zeros([initBinsCount, initBinsCount])
+    discretizations = []
+    # compute when we merge first c initial dist_bins into 1 and #macro dist_bins k = 1
+    k_ = 0
+    k = k_ + 1
+    for c_ in range(initBinsCount):
+        c = c_ + 1
+        micro_bins = [i for i in range(c)]
+        F[c_, k_] = compute_bin_cost(c, 0, k, micro_bins, IDs, ID_threshold)
+        c_disc = [[micro_bins]]
+        discretizations.append(c_disc)
+    for k_ in range(1, initBinsCount):
+        k = k_ + 1
+        for c_ in range(k_, initBinsCount):
+            c = c_ + 1
+            min_F = None
+            first_l_micro_bins = None
+            last_micro_bins = None
+            # search for the best # of microbins in the first (k - 1) macrobins: l
+            for l_ in range(k_ - 1, c_):
+                l = l_ + 1
+                micro_bins = [i for i in range(l, c)]
+                temp_F = F[l_, k_ - 1] + compute_bin_cost(c, l, k, micro_bins, IDs, ID_threshold)
+                if min_F is None or temp_F < min_F:
+                    min_F = temp_F
+                    first_l_micro_bins = discretizations[l_][k_ - 1]
+                    last_micro_bins = micro_bins
+            F[c_, k_] = min_F
+            disc = first_l_micro_bins.copy()
+            disc.append(last_micro_bins)
+            discretizations[c_].append(disc)
+    return F, discretizations
\ No newline at end of file
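Reviewer sketch (not part of the patch): a small sanity check of the merging step in isolation, with an invented interaction-distance vector for 7 micro bins (hence 6 neighbouring-pair distances). The selection of the cheapest entry mirrors what main.py does with np.argmin(F[-1]); all numbers are made up for illustration:

    import numpy as np

    from interaction_distance import compute_ID_threshold
    from merging import dynamic_merging

    # One invented ID per neighbouring micro-bin pair: 7 micro bins -> 6 distances.
    IDs = np.array([0.05, 0.04, 0.90, 0.03, 0.85, 0.02])

    ID_threshold = compute_ID_threshold(IDs)
    F, discretizations = dynamic_merging(ID_threshold, IDs, len(IDs) + 1)

    # Cheapest cost in the last row of F picks the number of macro bins.
    best_k = int(np.argmin(F[-1]))
    print('ID_threshold:', ID_threshold)
    print('macro bins:', best_k + 1)
    print('micro bins per macro bin:', discretizations[-1][best_k])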