diff --git a/cjs.py b/cjs.py
index a8e1ce3..5abe3b4 100644
--- a/cjs.py
+++ b/cjs.py
@@ -110,7 +110,7 @@ def dim_optimal_disc(prev, curr, I1, I2, binA, binB, maxes):
     # equal frequency binning
     global_min = min(binA[prev].min(), binB[prev].min())
     binning = Binning(binA, prev, DEFAULT_BINS_COUNT, global_min)
-    points2binA_map = binning.equal_frequency_binning4()
+    points2binA_map = binning.equal_frequency_binning_duplicate_drop()
     points2binB_map = binning.interpolate(binB)

     # cjs discretizations and values
@@ -239,16 +239,18 @@ def _compute_CJS(binA, binB, maxes):
     print(str(compute_CJS(binA, binB, pd.DataFrame(np.max(data[attrs]).transpose().reset_index(drop=True)))))


-def compute_CJSs(bin_map, curr, data, dist_bins, dim_maxes):
+def compute_CJSs(bin_map, curr, data, dim_maxes):
     data_wo_curr = data.copy()
     data_wo_curr.pop(curr)
     maxes_ = dim_maxes.drop(curr).to_frame().reset_index(drop=True)
-    return compute_CJSs1(bin_map, data_wo_curr, dist_bins, maxes_)
+    return compute_CJSs1(bin_map, data_wo_curr, maxes_)


-def compute_CJSs1(bin_map, data_wo_curr, dist_bins, maxes_):
+def compute_CJSs1(bin_map, data_wo_curr, maxes_):
     cjs = []
+    # distinct bins
+    dist_bins = bin_map.cat.categories
     for bin_id, binn in enumerate(dist_bins[1:], start=1):
         bin_data = data_wo_curr.loc[bin_map == binn]
         prev_bin_data = data_wo_curr.loc[bin_map == dist_bins[bin_id - 1]]
diff --git a/correlation_measures/binning.py b/correlation_measures/binning.py
index 3eaa56e..ad41bb0 100644
--- a/correlation_measures/binning.py
+++ b/correlation_measures/binning.py
@@ -1,31 +1,39 @@
 import pandas as pd
+import numpy as np
+import pandas.core.algorithms as algos
+import re


 class Binning:
-    def __init__(self, data, dim=None, desired_bins_count=None, global_min=None):
-        self.desired_bins_count = desired_bins_count if desired_bins_count is None or data.shape[0] > desired_bins_count\
+    def __init__(self, data, dim, desired_bins_count, global_min=None):
+        self.desired_bins_count = desired_bins_count if desired_bins_count is None or data.shape[0] > desired_bins_count \
            else data.shape[0]
         self.dim = dim
         self.data = data
-        self.rank_data = data.rank(method='first')
         self.global_min = global_min

-    # todo old (small reminder) in the original ipd it is NOT equal binning
     # Series of binned points (with dropDuplicates produces not equally frequent bins)
-    def equal_frequency_binning(self, dim, bins_count):
-        return pd.qcut(self.rank_data.sort_values(by=dim)[dim], bins_count)
-
-    def equal_frequency_binning2(self):
-        qcut = pd.qcut(self.rank_data.sort_values(by=self.dim)[self.dim], self.desired_bins_count)
-        self.qcut = qcut.cat.rename_categories([i for i in range(self.desired_bins_count)]).reindex(qcut.index)
+    def equal_frequency_binning_by_rank(self):
+        self.rank_data = self.data.rank(method='first')
         self.bins_count = self.desired_bins_count
+        self.qcut = pd.qcut(self.rank_data[self.dim], self.bins_count)
+        return self.qcut
+
+    def equal_frequency_binning_by_rank_int_categories(self):
+        self.equal_frequency_binning_by_rank()
+        self.qcut = self.qcut.cat.rename_categories([i for i in range(self.desired_bins_count)]).reindex(
+            self.qcut.index)
         return self.qcut

-    def equal_frequency_binning4(self):
-        qcut = pd.qcut(self.data[self.dim], self.desired_bins_count, duplicates='drop')
-        qcut = qcut.cat.remove_unused_categories()
-        bounds = [c.right for c in qcut.cat.categories]
+    def equal_frequency_binning_duplicate_drop(self):
+        # todo python361
+        # qcut = pd.qcut(self.data[self.dim], self.desired_bins_count, duplicates='drop')
+        # todo python 342
+        qcut = self._compute_qcut()
+
+        # qcut = qcut.cat.remove_unused_categories()
+        bounds = [float(re.search(', (-*\d+\.*\d*)', c).group(1)) for c in qcut.cat.categories]
         # including global_min with a margin of 1
         bounds.insert(0, self.global_min - 1)
         self.bounds = pd.Series(bounds)
@@ -34,22 +42,12 @@ def equal_frequency_binning4(self):
         self.qcut = qcut.cat.rename_categories([i for i in range(self.bins_count)]).reindex(qcut.index)
         return self.qcut

-    def equal_frequency_binning3(self, dim, desired_bins_count):
-        qcut = pd.qcut(self.data[dim], desired_bins_count, duplicates='drop')
-        self.qcut = qcut.cat.rename_categories([i for i in range(desired_bins_count)]).reindex(qcut.index)
-        return self.qcut
-
-    def get_bounds(self, global_min):
-        groupby = self.qcut.reset_index().groupby(0)
-
-        self.bounds = pd.Series(pd.unique(pd.concat([pd.Series(global_min - 1),
-                                                     self.data.loc[groupby.last()['index'], self.dim]], axis=0))) \
-            .reset_index(drop=True)
-        return self.bounds
-
-
-    def get_rank_data(self):
-        return self.rank_data
+    def _compute_qcut(self):
+        quantiles = np.linspace(0, 1, self.desired_bins_count + 1)
+        bins = algos.quantile(self.data[self.dim], quantiles)
+        bins = pd.unique(bins)
+        qcut = pd.cut(self.data[self.dim], bins, include_lowest=True)
+        return qcut

     def interpolate(self, other_bin):
         if self.bounds is None:
@@ -62,4 +60,3 @@ def interpolate(self, other_bin):
         data_ = pd.cut(other_col, self.bounds)

         return data_.cat.rename_categories([i for i in range(self.bins_count)]).reindex(data_.index)
-
diff --git a/interaction_distance.py b/interaction_distance.py
index d2ced05..2489cf3 100644
--- a/interaction_distance.py
+++ b/interaction_distance.py
@@ -5,10 +5,10 @@ import constants as cst


-def compute_IDs(bin_map, curr, data, dist_bins, dim_maxes):
+def compute_IDs(bin_map, curr, data, dim_maxes):
     data_wo_curr = data.copy()
     data_wo_curr.pop(curr)
     # todo slow?
-    return compute_IDs1(bin_map, data_wo_curr, dim_maxes, dist_bins)
+    return compute_IDs1(bin_map, data_wo_curr, dim_maxes)


 # # deprecated
 # def compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, cor_measure,
@@ -26,7 +26,8 @@ def compute_IDs(bin_map, curr, data, dist_bins, dim_maxes):
 #     return compute_IDs1(bin_map, data, dim_maxes, dist_bins)


-def compute_IDs1(bin_map, data, dim_maxes, dist_bins):
+def compute_IDs1(bin_map, data, dim_maxes):
+    dist_bins = bin_map.cat.categories
     inner_bin_measures = []
     inter_bin_measures = []
     for bin_id, binn in enumerate(dist_bins):
diff --git a/main.py b/main.py
index eaee02d..178b130 100644
--- a/main.py
+++ b/main.py
@@ -61,14 +61,14 @@ def plot_distances(dir, distances):
     plt.savefig(dir + 'distances.png', format='png')


-def compute_distances(bin_map, curr, data, dist_bins, dim_maxes,
+def compute_distances(bin_map, curr, data, dim_maxes,
                       cor_measure, method, distance_measure,
                       k=cst.MAX_SUBSPACE_SIZE,
                       delta=cst.HETEROGENEOUS_THRESHOLD,
                       beam_width=cst.BEAM_WIDTH):
     if method == cst.Method.ORIGINAL:
-        return id.compute_IDs(bin_map, curr, data, dist_bins, dim_maxes) if distance_measure == cst.DistanceMeasure.ID \
-            else cjs.compute_CJSs(bin_map, curr, data, dist_bins, dim_maxes)
+        return id.compute_IDs(bin_map, curr, data, dim_maxes) if distance_measure == cst.DistanceMeasure.ID \
+            else cjs.compute_CJSs(bin_map, curr, data, dim_maxes)

     if method == cst.Method.GREEDY_TOPK:
         subspace = sm.greedy_topk(data, curr, k, cor_measure)
@@ -77,8 +77,8 @@ def compute_distances(bin_map, curr, data, dist_bins, dim_maxes,

     # todo the rest of the methods
     data = data.copy().loc[:, subspace]
-    return id.compute_IDs1(bin_map, data, dim_maxes, dist_bins) if distance_measure == cst.DistanceMeasure.ID \
-        else cjs.compute_CJSs1(bin_map, data, dist_bins, dim_maxes)
+    return id.compute_IDs1(bin_map, data, dim_maxes) if distance_measure == cst.DistanceMeasure.ID \
+        else cjs.compute_CJSs1(bin_map, data, dim_maxes)


 def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure=None,
@@ -107,28 +107,27 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure
     dim_maxes = norm_data.max(0)
     disc_macro_intervals = []
     disc_points = []
-    orig_binning = Binning(norm_data)
-    rank_data = orig_binning.get_rank_data()

     distancez = []
     # iterate over all the dimensions
     for curr in range(dim_count):
-        bin_map = orig_binning.equal_frequency_binning(curr, init_bins_count)
-
-        # distinct bins
-        dist_bins = bin_map.unique()
+        binning = Binning(norm_data, curr, init_bins_count)
+        bin_map = binning.equal_frequency_binning_by_rank()
+        dist_bins = bin_map.cat.categories

         # -----------------------------INTERACTION DISTANCES----------------------------------

-        distances = compute_distances(bin_map, curr, norm_data, dist_bins, dim_maxes, cor_measure, method,
+        distances = compute_distances(bin_map, curr, norm_data, dim_maxes, cor_measure, method,
                                       distance_measure)
         # todo python361
         # distancez.append([[data.loc[rank_data[rank_data[curr] == math.floor(dist_bins[i].right)].index.tolist()[0], curr] for i in
         #                    range(len(distances))], distances])
         # todo python342
-        distancez.append(
-            [[data.loc[rank_data[rank_data[curr] == math.floor(float(re.search(', (\d+\.*\d*?)', dist_bins[i]).group(1)))].index.tolist()[0], curr] for i in
-              range(len(distances))], distances])
+        distancez.append([[data.loc[binning.rank_data[binning.rank_data[curr]
+                                                      == math.floor(float(re.search(', (-*\d+\.*\d*)',
+                                                                                    dist_bins[i]).group(1)))]
+                          .index.tolist()[0], curr] for i in
+                           range(len(distances))], distances])
+
         ID_threshold = id.compute_ID_threshold(distances)
         # todo ext compute sliding average and count ID peaks above the avg (in a sliding window)
         # ID_peaks = id.compute_sliding_count(distances, ID_threshold)
@@ -145,8 +144,8 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure

         min_id = np.argmin(F[-1])

-        (curr_macro_intervals, curr_macro_points) = get_discretized_points(curr, data, discretizations, dist_bins,
-                                                                           min_id, rank_data)
+        (curr_macro_intervals, curr_macro_points) = get_discretized_points(curr, data, discretizations,
+                                                                           dist_bins, min_id, binning.rank_data)

         write(log, '-------------------------')
         write(log, 'dimension:', curr)
@@ -188,10 +187,10 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_

         # todo python342
         right = \
-            data.loc[rank_data[rank_data[curr] == math.floor(float(re.search(', (\d+\.*\d*?)', dist_bins[micro_bin_id]).group(1)))][curr].index[0]][curr]
+            data.loc[rank_data[rank_data[curr] == math.floor(float(re.search(', (-*\d+\.*\d*)', dist_bins[micro_bin_id]).group(1)))][curr].index[0]][curr]
         if not len(macro_interval):
             macro_interval.append(
-                data.loc[rank_data[rank_data[curr] == math.ceil(float(re.search('(\d+\.*\d*?),', dist_bins[micro_bin_id]).group(1)))][curr].index[0]][
+                data.loc[rank_data[rank_data[curr] == math.ceil(float(re.search('(-*\d+\.*\d*),', dist_bins[micro_bin_id]).group(1)))][curr].index[0]][
                     curr])
             macro_interval.append(right)
         else:
@@ -210,7 +209,7 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_
         print(
             'Usage: main.py -f= -d= -c= -m=<[original|greedy_topk]> -cor=<[uds]> '
             '-dist=<[id, cjs]>')
-        command = '-f=synthetic_cases/synthetic_2d_parity_problem.csv -d=; -dist=ID'
+        command = '-f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID'
         print('Running default: ', command)
         command_list = command.split(' ')
     else:
@@ -263,8 +262,8 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_

     with open(log_file, 'w') as log:
         disc_intervals, disc_points, class_labels, distances = compute_optimal_discretization(data, method,
-                                                                                               distance_measure,
-                                                                                               cor_measure, log)
+                                                                                               cor_measure,
+                                                                                               distance_measure, log)

         plot_distances(dir, distances)
diff --git a/3d_parity_statistics.py b/old/3d_parity_statistics.py
similarity index 93%
rename from 3d_parity_statistics.py
rename to old/3d_parity_statistics.py
index f781e67..b0fd2f8 100644
--- a/3d_parity_statistics.py
+++ b/old/3d_parity_statistics.py
@@ -15,10 +15,11 @@ def average_id(bin1, bin2, dim_maxes):
     data0 = data[bin1]
     data1 = data[bin2]

+    # todo fix
     orig_binning0 = Binning(data0)
     orig_binning1 = Binning(data1)
-    bin0_map = orig_binning0.equal_frequency_binning(0, int(data0.shape[0] / 141))
-    bin1_map = orig_binning1.equal_frequency_binning(0, int(data1.shape[0] / 141))
+    bin0_map = orig_binning0.equal_frequency_binning_by_rank(0, int(data0.shape[0] / 141))
+    bin1_map = orig_binning1.equal_frequency_binning_by_rank(0, int(data1.shape[0] / 141))
     # distinct bins
     dist_bins0 = bin0_map.unique()
     dist_bins1 = bin1_map.unique()
diff --git a/4d_parity_statistics.py b/old/4d_parity_statistics.py
similarity index 96%
rename from 4d_parity_statistics.py
rename to old/4d_parity_statistics.py
index faa5ce4..f305d6b 100644
--- a/4d_parity_statistics.py
+++ b/old/4d_parity_statistics.py
@@ -15,10 +15,11 @@ def average_id(bin1, bin2, dim_maxes):
     data0 = data[bin1]
     data1 = data[bin2]

+    # todo fix
     orig_binning0 = Binning(data0)
     orig_binning1 = Binning(data1)
-    bin0_map = orig_binning0.equal_frequency_binning(0, int(data0.shape[0] / 141))
-    bin1_map = orig_binning1.equal_frequency_binning(0, int(data1.shape[0] / 141))
+    bin0_map = orig_binning0.equal_frequency_binning_by_rank(0, int(data0.shape[0] / 141))
+    bin1_map = orig_binning1.equal_frequency_binning_by_rank(0, int(data1.shape[0] / 141))
     # distinct bins
     dist_bins0 = bin0_map.unique()
     dist_bins1 = bin1_map.unique()
diff --git a/old/__init__.py b/old/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/temp_exp.py b/old/temp_exp.py
similarity index 97%
rename from temp_exp.py
rename to old/temp_exp.py
index c685548..7b15f62 100644
--- a/temp_exp.py
+++ b/old/temp_exp.py
@@ -52,8 +52,9 @@ def id_exceeds_experiment(data, method=cst.Method.ORIGINAL, cor_measure=None):
     disc_macro_intervals = []
     disc_points = []

+    # todo fix
     orig_binning = Binning(data)
-    rank_data = orig_binning.get_rank_data()
+    rank_data = orig_binning.rank_data

     # plt.figure(1)

@@ -64,7 +65,7 @@ def id_exceeds_experiment(data, method=cst.Method.ORIGINAL, cor_measure=None):
     second = []
     # iterate over all the dimensions
     for curr in range(dim_count):
-        bin_map = orig_binning.equal_frequency_binning(curr, init_bins_count)
+        bin_map = orig_binning.equal_frequency_binning_by_rank(curr, init_bins_count)

         # distinct bins
         dist_bins = bin_map.unique()
diff --git a/uds.py b/uds.py
index a37519c..bdda152 100644
--- a/uds.py
+++ b/uds.py
@@ -17,8 +17,9 @@ def compute_cond_CE(data, dim, I, point_ids):


 # discretization of the next dimension
-def dim_optimal_disc(curr, binning, I, data):
-    binned_points = binning.equal_frequency_binning2()
+def dim_optimal_disc(curr, prev, I, data):
+    binning = Binning(data, prev, UDS_BETA)
+    binned_points = binning.equal_frequency_binning_by_rank_int_categories()

     # Series with bins support
     support = binned_points.value_counts().sort_index().cumsum()
@@ -114,10 +115,9 @@ def compute_uds(data):
     es = []
     uds = 0
     prev = perm[0]
-    binning = Binning(data, prev, UDS_BETA)
     for dim in perm[1:]:
         # todo should I pass binning?
-        scores, discs = dim_optimal_disc(dim, binning, I, data)
+        scores, discs = dim_optimal_disc(dim, prev, I, data)

         # regularization step
         opt_cost = None
@@ -148,8 +148,9 @@ def compute_uds(data):


 if __name__ == "__main__":
-    # data = pd.read_csv('data/testdata.csv', delimiter=',', header=None)
-    data = pd.DataFrame(correlated_data(4000, 20, 10, dg.func2))
+
+    data = pd.read_csv('synthetic_cases/uds_test.csv', delimiter=';', header=None)
+    # data = pd.DataFrame(correlated_data(4000, 20, 10, dg.func2))
     # data = pd.DataFrame(generate_correlated_data(1000, 10, 2, func1))
     # data = pd.DataFrame(generate_uncorrelated_data(4000, 20))
     # classLabels = data.pop(len(data.columns) - 1)