Commit
uds - init commit (always returns zero for now... :( )
Tatiana Dembelova committed May 25, 2017
1 parent eebe8d8 commit 92e72c3
Showing 7 changed files with 2,247 additions and 10 deletions.
Empty file.
18 changes: 18 additions & 0 deletions correlation_measures/binning.py
@@ -0,0 +1,18 @@
import pandas as pd


class Binning:
    def __init__(self, data):
        self.rank_data = data.rank(method='first')

    # todo (small reminder) in the original ipd it is NOT equal binning
    # returns a Series of binned points (drop_duplicates on it yields bins that are not equally frequent)
    def equal_frequency_binning(self, dim, bins_count):
        return pd.qcut(self.rank_data.sort_values(by=dim)[dim], bins_count)

    def equal_frequency_binning2(self, dim, bins_count):
        qcut = pd.qcut(self.rank_data.sort_values(by=dim)[dim], bins_count)
        return qcut.cat.rename_categories([i for i in range(bins_count)]).reindex(qcut.index)

    def get_rank_data(self):
        return self.rank_data
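
For orientation, a minimal usage sketch of the class above, assuming a toy one-column DataFrame (the data and names here are illustrative, not part of the commit's files):

import pandas as pd
from correlation_measures.binning import Binning

toy = pd.DataFrame({0: [0.1, 0.4, 0.2, 0.9, 0.5, 0.7]})  # illustrative data; integer column label as in main.py
binning = Binning(toy)
# interval-labelled equal-frequency bins over the ranks of column 0
print(binning.equal_frequency_binning(0, 3))
# the same bins relabelled with integer codes 0..bins_count-1
print(binning.equal_frequency_binning2(0, 3))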
1,994 changes: 1,994 additions & 0 deletions data/crime_small.csv

Large diffs are not rendered by default.

72 changes: 72 additions & 0 deletions data/testdata.csv
@@ -0,0 +1,72 @@
0.1,0.4,0.9,0.5,0
0.2,0.4,0.8,0.5,0
0.3,0.4,0.7,0.5,0
0.4,0.8,0.1,0.5,0
0.5,0.8,0.1,0.5,0
0.6,0.8,0.1,0.6,0
0.1,0.4,0.9,0.5,0
0.2,0.4,0.8,0.5,0
0.3,0.4,0.7,0.5,0
0.4,0.8,0.1,0.5,0
0.5,0.8,0.1,0.5,0
0.6,0.8,0.1,0.6,0
0.1,0.4,0.9,0.5,0
0.2,0.4,0.8,0.5,0
0.3,0.4,0.7,0.5,0
0.4,0.8,0.1,0.5,0
0.5,0.8,0.1,0.5,0
0.6,0.8,0.1,0.6,0
0.1,0.4,0.9,0.5,0
0.2,0.4,0.8,0.5,0
0.3,0.4,0.7,0.5,0
0.4,0.8,0.1,0.5,0
0.5,0.8,0.1,0.5,0
0.6,0.8,0.1,0.6,0
0.1,0.4,0.9,0.5,0
0.2,0.4,0.8,0.5,0
0.3,0.4,0.7,0.5,0
0.4,0.8,0.1,0.5,0
0.5,0.8,0.1,0.5,0
0.6,0.8,0.1,0.6,0
0.1,0.4,0.9,0.5,0
0.2,0.4,0.8,0.5,0
0.3,0.4,0.7,0.5,0
0.4,0.8,0.1,0.5,0
0.5,0.8,0.1,0.5,0
0.6,0.8,0.1,0.6,0
0.1,0.4,0.9,0.5,0
0.2,0.4,0.8,0.5,0
0.3,0.4,0.7,0.5,0
0.4,0.8,0.1,0.5,0
0.5,0.8,0.1,0.5,0
0.6,0.8,0.1,0.6,0
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
0.2,0.2,0.2,0.2,1
18 changes: 8 additions & 10 deletions main.py
@@ -2,6 +2,7 @@
import pandas as pd
import numpy as np

from correlation_measures.binning import Binning
from interaction_distance import computeIDs, compute_ID_threshold
from merging import dynamic_merging

@@ -59,8 +60,8 @@ def compute_optimal_discretization(data):
    # number of initial dist_bins
    # initBinsCount = int(math.ceil(math.sqrt(row_count))) # ceil in original ipd...
    # todo remove later
    initBinsCount = 20 # ceil in original ipd...
    print('initBinsCount: ', initBinsCount)
    init_bins_count = 20 # ceil in original ipd...
    print('initBinsCount: ', init_bins_count)

    # normalization step todo(optional)

@@ -69,15 +70,12 @@ def compute_optimal_discretization(data):

    disc_macro_intervals = []
    disc_points = []
    rank_data = data.rank(method='first')
    orig_binning = Binning(data)
    rank_data = orig_binning.get_rank_data()
    # iterate over all the dimensions
    for curr in range(dim_count):
        # original ids -> ranked_ids in the current dimension
        rank_data = rank_data.sort_values(by=curr)

        # todo (small reminder) in the original ipd it is NOT equal binning
        # Series of binned points
        bin_map = pd.qcut(rank_data[curr], initBinsCount)
        bin_map = orig_binning.equal_frequency_binning(curr, init_bins_count)

        # distinct bins
        dist_bins = bin_map.drop_duplicates().values
@@ -93,7 +91,7 @@ def compute_optimal_discretization(data):

        # todo replace by empty method later
        # table with costs, the value in i-th row and j-th column means cost of (i+1) micro bins merged into (j+1) macro bins
        F, discretizations = dynamic_merging(ID_threshold, IDs, initBinsCount)
        F, discretizations = dynamic_merging(ID_threshold, IDs, init_bins_count)

        print('dimension ' + str(curr))
        min_id = np.argmin(F[-1])
@@ -131,7 +129,7 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_
    for point in data.iterrows():
        macro_points.append(find_disc_macro_id(disc_macro_intervals, point[1][curr]))

    return (disc_macro_intervals, macro_points)
    return disc_macro_intervals, macro_points


data = pd.read_csv('example/simple.csv', delimiter=';', header=None)
13 changes: 13 additions & 0 deletions test_uds.py
@@ -0,0 +1,13 @@
from unittest import TestCase
import uds as u
import pandas as pd


class TestUds(TestCase):

    def setUp(self):
        self.data = pd.read_csv('data/testdata.csv', delimiter=',', header=None)

    def test_compute_CEs(self):
        c_es = u.compute_CEs(self.data)
        # sanity check: compute_CEs returns one cumulative entropy value per column
        self.assertEqual(len(c_es), self.data.shape[1])
        print(c_es)
142 changes: 142 additions & 0 deletions uds.py
@@ -0,0 +1,142 @@
import math
import pandas as pd
import numpy as np
from correlation_measures.binning import Binning

# bins count
BETA = 20


def compute_cond_CE(data, dim, I, point_ids):
    # conditional CE of dimension `dim`: every cell c of I contributes the CE of the points
    # it shares with point_ids, weighted by the cell's share len(c) / n of all points
    total_points_count = data.shape[0]
    return sum([len(c) / total_points_count * compute_CE(data.loc[point_ids.intersection(c), dim]) for c in I])


# optimal merging of dimension `prev`'s micro bins into macro bins, scored by the conditional CE of dimension `curr`
def dim_optimal_disc(prev, curr, binning, I, data):
    binned_points = binning.equal_frequency_binning2(prev, BETA)

    # Series with the cumulative support (running point count) of the micro bins
    support = binned_points.value_counts().sort_index().cumsum()

    b = []
    val = []

    merged_bins = []
    # cost of every single merged bin (transposed relative to the paper): i - row, upper bound; j - column, lower bound
    # todo worth considering a ufunc implementation in C (slow)
    f = []
    # upper bound
    for i in range(BETA):
        f_row = []
        merged_bins_row = []
        # lower bound
        for j in range(i + 1):
            merged_bin = binned_points[np.logical_and(binned_points <= i, binned_points >= j)].index
            merged_bins_row.append(merged_bin)

            f_row.append(compute_cond_CE(data, curr, I, merged_bin))
        f.append(f_row)
        merged_bins.append(merged_bins_row)

        b.append([[merged_bins_row[0]]])
        val.append([f_row[0]])

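    # Dynamic program over the number of macro bins:
    # val[i][l] holds the cheapest conditional CE of merging micro bins 0..i into l + 1 macro bins,
    # b[i][l] the corresponding discretization. The recurrence below splits off the last macro bin
    # (j + 1 .. i) and weights both parts by their share of the cumulative support:
    #   val[i][l] = min_j ((support[i] - support[j]) / support[i] * f[i][j + 1]
    #                      + support[j] / support[i] * val[j][l - 1])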
    for l in range(1, BETA):
        for i in range(l, BETA):
            min_cost = None
            arg_min = None
            for j in range(l - 1, i):
                temp_cost = ((support[i] - support[j]) / support[i] * f[i][j + 1]
                             + support[j] / support[i] * val[j][l - 1])
                if min_cost is None or temp_cost < min_cost:
                    min_cost = temp_cost
                    arg_min = j

            # val[i][l]
            val[i].append(min_cost)
            disc = b[arg_min][l - 1].copy()
            disc.append(merged_bins[i][arg_min + 1])
            b[i].append(disc)

    return val[-1], b[-1]


def compute_CEs(data):
    dim_count = data.shape[1]
    CEs = []
    for curr in range(dim_count):
        CE = compute_CE(data[curr])
        CEs.append(CE)
    return CEs


def compute_CE(data):
    m = data.shape[0]
    if m <= 1:
        return 0
    curr_data = data.sort_values()
    data_diff = (curr_data[1:] - curr_data.shift(1)[1:]).reset_index(drop=True)
    CE = -math.log(pd.Series([((i + 1) / m) ** ((i + 1) * data_diff[i] / m)
                              for i in range(len(data_diff))]).prod(), 2)
    return CE
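# Note: compute_CE above evaluates the cumulative entropy of a sorted sample x_1 <= ... <= x_m,
#   CE(X) = -sum_{k=1}^{m-1} (x_{k+1} - x_k) * (k/m) * log2(k/m),
# written as -log2 of a product of powers. For example, for the sample [0.0, 0.5, 1.0]
# this gives -(0.5 * (1/3) * log2(1/3) + 0.5 * (2/3) * log2(2/3)) ≈ 0.459.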


def entropy(I, N):
    return -sum([len(i) / N * math.log(len(i) / N, 2) for i in I])


# compute permutation: process dimensions in order of decreasing cumulative entropy
def compute_permutation(CEs):
    argsort = np.argsort(CEs).tolist()
    argsort.reverse()
    return argsort


def extend_I(I, disc):
    # refine the current cell structure: intersect every cell with every new macro bin
    disc_ = [i.intersection(j) for i in I for j in disc]

    return [d for d in disc_ if not d.empty]


if __name__ == "__main__":
    data = pd.read_csv('data/testdata.csv', delimiter=',', header=None)
    class_labels = data.pop(len(data.columns) - 1)
    dim_count = data.shape[1]

    binning = Binning(data)

    # compute CE for all the dimensions
    CEs = compute_CEs(data)

    perm = compute_permutation(CEs)

    # current cell structure: list of point-id Index objects, one per discretized cell
    I = [data.index]
    es = []

    uds = 0
    prev = perm[0]
    for dim in perm[1:]:
        # todo should I pass binning?
        costs, discs = dim_optimal_disc(prev, dim, binning, I, data)

        # regularization step: pick the number of macro bins by the regularized score,
        # but keep the raw conditional CE for the UDS update
        opt_cost = None
        opt_reg_cost = None
        opt_l = None
        opt_I = None
        for l, cost in enumerate(costs):
            temp_I = extend_I(I, discs[l])
            temp_cost = (cost / CEs[dim]
                         + entropy(temp_I, len(data)) / (math.log(BETA, 2) + sum([math.log(e + 1, 2) for e in es])))
            if opt_reg_cost is None or temp_cost < opt_reg_cost:
                opt_reg_cost = temp_cost
                opt_cost = cost
                opt_l = l
                opt_I = temp_I

        I = opt_I
        es.append(opt_l)
        uds += CEs[dim] - opt_cost
        prev = dim
    uds /= sum(CEs[1:])

    print(uds)
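
For reference, a minimal sketch of how the module is exercised, mirroring test_uds.py (names here are illustrative):

import pandas as pd
import uds as u

data = pd.read_csv('data/testdata.csv', delimiter=',', header=None)
data.pop(len(data.columns) - 1)  # drop the class-label column, as uds.py's __main__ does
print(u.compute_CEs(data))       # one cumulative entropy value per remaining dimension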
