
Commit

added 'todo ext' and 'todo old' comments to mark what has changed
added interaction_distance.py#compute_sliding_count() for counting ID peaks within a sliding window
updated data_generation.py#synthetic_with_nearcopies() to sample from the range [-0.5, 0.5]

the functionality is otherwise unchanged
Tatiana Dembelova committed Jun 22, 2017
1 parent 57d60b4 commit 14ad1aa
Showing 26 changed files with 180,099 additions and 469,799 deletions.
3 changes: 2 additions & 1 deletion constants.py
@@ -11,7 +11,8 @@ class CorrelationMeasure(Enum):
MAC = 3


-ID_THRESHOLD_QUANTILE = 1.0 / 3
+ID_THRESHOLD_QUANTILE = 1.0/3
+ID_SLIDING_WINDOW = 40

NORMALIZATION_RADIUS = 1

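A rough sketch of the roles these constants play (hypothetical stand-in code, not part of the commit; the real logic lives in interaction_distance.py): ID_THRESHOLD_QUANTILE sets the quantile used to derive the ID threshold, and ID_SLIDING_WINDOW is the window length used by the new compute_sliding_count.

import numpy as np

ID_THRESHOLD_QUANTILE = 1.0 / 3
ID_SLIDING_WINDOW = 40

IDs = np.random.rand(200)  # stand-in for per-cut-point interaction distances
# threshold at the 1/3 quantile of the observed IDs (assumed reading of compute_ID_threshold)
ID_threshold = np.percentile(IDs, 100 * ID_THRESHOLD_QUANTILE)
# peaks above the threshold within one window of 40 consecutive cut points
window_count = int(np.sum(IDs[:ID_SLIDING_WINDOW] > ID_threshold))
print(ID_threshold, window_count)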
49 changes: 35 additions & 14 deletions data_generation.py
@@ -2,7 +2,7 @@
import pandas as pd
import os.path


+# synthetic case from uds
def correlated_data(m, n, sigma, f):
l = int(n / 2)
Z = np.random.normal(0, 1, (m, l))
@@ -29,11 +29,24 @@ def func2(X):
return np.log2(np.abs(X) + 1)


-def synthetic_data_1(m, r, s, sigma=0.1):
-r_dims = np.random.uniform(-0.5, 0.5, (m, r))
+def synthetic_data_uni(m, r, s, sigma=0.1):
+r_dims = np.random.uniform(-0.5, 0.5, (m, r)) if r > 0 else np.empty((m, r))
parity_dim = -(np.count_nonzero(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(m, 1) * np.random.uniform(0, 0.5,
-(m, 1))
-s_dims = np.random.normal(0, 1, (m, s))
+(m, 1)) if r > 0 else np.empty((m, r))
+s_dims = np.random.uniform(-0.5, 0.5, (m, s))
data = np.concatenate((r_dims, parity_dim, s_dims), axis=1)
if sigma:
e = np.random.normal(0, sigma, (m, r + s + 1))
+data = data + e
+
+return data
+
+
+def synthetic_data_uni_negative(m, r, s, sigma=0.1):
+r_dims = np.random.uniform(-0.5, 0.5, (m, r)) if r > 0 else np.empty((m, r))
+parity_dim = (np.count_nonzero(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(m, 1) * np.random.uniform(0, 0.5,
+(m, 1)) if r > 0 else np.empty((m, r))
+s_dims = np.random.uniform(-0.5, 0.5, (m, s))
+data = np.concatenate((r_dims, parity_dim, s_dims), axis=1)
+if sigma:
+e = np.random.normal(0, sigma, (m, r + s + 1))
@@ -43,9 +56,10 @@ def synthetic_data_1(m, r, s, sigma=0.1):


def synthetic_data_gauss(m, r, s, sigma=0.1):
-r_dims = np.random.normal(0, 1, (m, r))
+
+r_dims = np.random.normal(0, 1, (m, r)) if r > 0 else np.empty((m, r))
parity_dim = -(np.count_nonzero(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(m, 1) * np.abs(np.random.normal(0, 1,
-(m, 1)))
+(m, 1))) if r > 0 else np.empty((m, r))
s_dims = np.random.normal(0, 1, (m, s))
data = np.concatenate((r_dims, parity_dim, s_dims), axis=1)
if sigma:
@@ -55,16 +69,23 @@ def synthetic_data_gauss(m, r, s, sigma=0.1):
return data


-def synthetic_data_0(m):
-l = int(m / 2)
-first = np.concatenate((np.random.uniform(-1, 0, (l, 1)), np.random.uniform(0, 1, (l, 1))), axis=1)
-sec = np.concatenate((np.random.uniform(0, 1, (m - l, 1)), np.random.uniform(-1, 0, (m - l, 1))), axis=1)
-return np.concatenate((first, sec), axis=0)
+def synthetic_with_nearcopies(m, k, l, sigma=0.1):
+k_dims = np.repeat(np.random.uniform(-0.5, 0, (m, 1)), k, axis=1) if k > 0 else np.empty((m, k))
+l_dims = np.repeat(np.random.uniform(0, 0.5, (m, 1)), l, axis=1) if l > 0 else np.empty((m, l))
+
+data = np.concatenate((k_dims, l_dims), axis=1)
+if sigma:
+e = np.random.normal(0, sigma, (m, k + l))
+data = data + e
+
+return data


if __name__ == '__main__':
-data__ = np.concatenate((synthetic_data_1(20000, 9, 0, 0), np.zeros((20000, 1))), axis=1)
-file = 'synthetic_10d_parity_problem.csv'
+rows = 20000
+data__ = np.concatenate((synthetic_with_nearcopies(rows, 2, 0, 0), np.zeros((rows, 1))), axis=1)
+# file = 'synthetic_cases/synthetic_3d_gauss2.csv'
+file = 'synthetic_cases/synthetic_exact_copies2_2.csv'

if os.path.isfile(file):
raise ValueError
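An illustrative call of the generators above (not part of the diff; argument values are arbitrary). synthetic_with_nearcopies(m, k, l, sigma) yields k noisy copies of a uniform draw in [-0.5, 0] next to l noisy copies of a draw in [0, 0.5]; synthetic_data_uni(m, r, s, sigma) yields r relevant dimensions, one parity dimension and s independent uniform dimensions.

import data_generation as dg

near_copies = dg.synthetic_with_nearcopies(1000, 2, 1, sigma=0.05)
print(near_copies.shape)  # (1000, 3): two noisy copies of one draw in [-0.5, 0] plus one draw in [0, 0.5]

parity = dg.synthetic_data_uni(1000, 2, 1, sigma=0)
print(parity.shape)       # (1000, 4): 2 relevant dims, 1 parity dim, 1 independent uniform dim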
12 changes: 7 additions & 5 deletions experiments_logging.py
@@ -6,14 +6,16 @@
def plot_data_3d(data):
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
+# data = data[np.logical_and(data[0] < 0, data[1] > 0)]
+
-color_cond = {'b': np.logical_and(data[0] < 0, data[1] > 0),
+color_cond = {'b': np.logical_and(data[0] < 0, np.logical_and(data[1] > 0, data[2] < 0)),
'k': np.logical_and(data[0] < 0, np.logical_and(data[1] > 0, data[2] > 0)),
'g': np.logical_and(data[0] > 0, data[1] < 0),
'r': np.logical_and(data[0] < 0, data[1] < 0),
'c': np.logical_and(data[0] > 0, data[1] > 0),
}
for c in color_cond:
-ax.scatter(data[0][color_cond[c]], data[1][color_cond[c]], data[2][color_cond[c]], c=c)
+ax.scatter(data[0][color_cond[c]], data[1][color_cond[c]], data[2][color_cond[c]], c=c, s=1)

ax.set_xlabel('X0')
ax.set_ylabel('X1')
@@ -49,6 +51,6 @@ def write_cut_file(name, disc_intervals):
out.write('-------------------------------------\n')


-# if __name__ == '__main__':
-# data = pd.read_csv("synthetic_3d_parity_problem.csv", delimiter=";", header=None, na_values='?')
-# plot_data_3d(data)
+if __name__ == '__main__':
+data = pd.read_csv("synthetic_cases/synthetic_3d_gauss.csv", delimiter=";", header=None, na_values='?')
+plot_data_3d(data)
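For orientation (not part of the diff): plot_data_3d indexes data[0], data[1], data[2], i.e. it expects a pandas DataFrame whose integer column labels 0, 1, 2 hold the three coordinates, which is what pd.read_csv(..., header=None) produces. A minimal synthetic call, assuming experiments_logging.py and its matplotlib dependencies import cleanly:

import numpy as np
import pandas as pd
from experiments_logging import plot_data_3d

df = pd.DataFrame(np.random.uniform(-1, 1, (500, 3)))  # columns are labeled 0, 1, 2
plot_data_3d(df)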
15 changes: 14 additions & 1 deletion interaction_distance.py
@@ -2,7 +2,7 @@
import numpy as np

import uds
-from constants import CorrelationMeasure, ID_THRESHOLD_QUANTILE
+from constants import CorrelationMeasure, ID_THRESHOLD_QUANTILE, ID_SLIDING_WINDOW


def compute_IDs(bin_map, curr, data, dist_bins, dim_maxes):
@@ -81,3 +81,16 @@ def compute_max_ID_threshold(IDs):
IDs.sort()

return max(IDs)


+def compute_sliding_count(IDs, ID_threshold):
+count = []
+avg = sum(IDs) / len(IDs)
+for i in range(ID_SLIDING_WINDOW, len(IDs)):
+start = i - ID_SLIDING_WINDOW if i > ID_SLIDING_WINDOW else 0
+count.append(sum(1 for id in IDs[start: i] if id > ID_threshold))
+# i = 0
+# while i < len(IDs):
+# count.append(sum(1 for id in IDs[i: i + ID_SLIDING_WINDOW] if id > ID_threshold))
+# i += ID_SLIDING_WINDOW
+return count
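A usage sketch for the new helper (illustrative values, not part of the commit; assumes the repository modules and their dependencies import cleanly). For every trailing window of ID_SLIDING_WINDOW consecutive cut points it counts how many interaction distances exceed the threshold, so the result has len(IDs) - ID_SLIDING_WINDOW entries.

import numpy as np
import interaction_distance as idist

IDs = list(np.random.rand(200))                 # stand-in for per-cut-point IDs
ID_threshold = idist.compute_ID_threshold(IDs)  # threshold used the same way as in main.py
peak_counts = idist.compute_sliding_count(IDs, ID_threshold)
print(len(peak_counts))                         # 160 = len(IDs) - ID_SLIDING_WINDOW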
37 changes: 20 additions & 17 deletions main.py
@@ -38,14 +38,15 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure
dim_count = data.shape[1]

# number of initial dist_bins
-# todo remove later
+# todo old remove later
# init_bins_count = 20 # ceil in original ipd...
init_bins_count = int(math.ceil(math.sqrt(data.shape[0]))) # ceil in original ipd...
write('row count:', data.shape[0])
write('init_bins_count:', init_bins_count)
write('ID_THRESHOLD_QUANTILE:', cst.ID_THRESHOLD_QUANTILE)

-# normalization step todo(optional)
-# todo by default the normalization is optional as it does not influence on the results
+# normalization step todo old (optional)
+# todo old by default the normalization is optional as it does not influence on the results

# norm_data = data.apply(lambda x: 2 * cst.NORMALIZATION_RADIUS * (x - x.min()) / (
# x.max() - x.min()) - cst.NORMALIZATION_RADIUS if x.max() != x.min() else pd.Series(-np.ones(x.shape)))
@@ -59,12 +60,12 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure
orig_binning = Binning(norm_data)
rank_data = orig_binning.get_rank_data()

-# plt.figure(1)
+plt.figure(1)

height = int(math.sqrt(dim_count ))
width = int(math.ceil((dim_count ) / height))

-# fig, axes = plt.subplots(nrows=height, ncols=width, squeeze=False)
+fig, axes = plt.subplots(nrows=height, ncols=width, squeeze=False)

# iterate over all the dimensions
for curr in range(dim_count):
@@ -80,6 +81,9 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure
IDs = id.compute_IDs(bin_map, curr, norm_data, dist_bins, dim_maxes) if method == cst.Method.ORIGINAL else \
id.compute_IDs_extended(bin_map, curr, norm_data, dist_bins, dim_maxes, cor_measure, cst.MAX_SUBSPACE_SIZE)
ID_threshold = id.compute_ID_threshold(IDs)
+# todo ext compute sliding average and count ID peaks above the avg (in a sliding window)
+# ID_peaks = id.compute_sliding_count(IDs, ID_threshold)
+
# pd.DataFrame(IDs).to_csv(prefix + "_IDs_" + str(curr) + ".csv")
# -----------------------------OPTIMAL MERGE STRATEGY----------------------------------

@@ -95,11 +99,11 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure
(curr_macro_intervals, curr_macro_points) = get_discretized_points(curr, data, discretizations, dist_bins,
min_id, rank_data)

-# ax1 = axes[int(curr / width), int(curr % width)]
+ax1 = axes[int(curr / width), int(curr % width)]
# # ax1.hist(IDs, bins=100, color='c')
-# ax1.plot([i for i in range(len(IDs))], IDs)
-# ax1.axhline(ID_threshold, color='b', linestyle='dashed', linewidth=1)
-# ax1.set_title('dimension ' + str(curr))
+ax1.plot([i for i in range(len(IDs))], IDs)
+ax1.axhline(ID_threshold, color='b', linestyle='dashed', linewidth=1)
+ax1.set_title('dimension ' + str(curr))

# ax2 = axes[int((2*curr + 1) / width), int((2*curr + 1) % width)]
# ax2.plot(sorted(IDs), color='k')
@@ -118,7 +122,7 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure
write("{0:.2f}".format(curr_macro_intervals[macro_id][1]) + " -", IDs[macro_bin[-1]], '[q=' +
str((sorted(IDs).index(IDs[macro_bin[-1]]) + 1) / len(IDs)) + ']')
# ax1.axhline(IDs[macro_bin[-1]], color='r', linewidth=1)
-# ax1.plot([macro_bin[-1]], [IDs[macro_bin[-1]]], marker='o', markersize=3, color="red")
+ax1.plot([macro_bin[-1]], [IDs[macro_bin[-1]]], marker='o', markersize=3, color="red")
# ax2.axvline(IDs[macro_bin[-1]], color='r', linewidth=1)

write('\nnumber of points per macrobin:')
@@ -131,9 +135,8 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure
disc_macro_intervals.append(curr_macro_intervals)
disc_points.append(curr_macro_points)

-# plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25,
-# wspace=0.35)
-# plt.savefig(prefix + '.png', format='png')
+plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25, wspace=0.35)
+plt.savefig(dir + 'IDs.png', format='png')
return disc_macro_intervals, disc_points, class_labels


@@ -161,7 +164,7 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_


if __name__ == "__main__":
-sys.argv = '-f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -r=19881'.split(' ')
+sys.argv = '-f=synthetic_cases/synthetic_3d_parity_problem.csv -d=;'.split(' ')
# if len(sys.argv) < 2:
# print('Usage: main.py -f=<data_file> -d=<delimiter> -c=<number of columns> -m=<[original|greedy]> -cor=<[uds]>')
file_arg = list(filter(lambda x: x.startswith("-f="), sys.argv))
@@ -204,6 +207,6 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_
with open(log_file, 'w') as log:
disc_intervals, disc_points, class_labels = compute_optimal_discretization(data, method, cor_measure)

-write_out_file(dir + cst.FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)
-
-write_cut_file(dir + cst.FILE_DATA_CUTS, disc_intervals)
+# write_out_file(dir + cst.FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)
+#
+# write_cut_file(dir + cst.FILE_DATA_CUTS, disc_intervals)
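The __main__ block above pre-populates sys.argv and then extracts each '-key=value' option by prefix. A minimal self-contained sketch of that parsing idiom (illustrative only; the handling of flags other than -f is not visible in this hunk, so the -d branch and the fallback delimiter are assumptions):

argv = 'main.py -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=;'.split(' ')
file_arg = list(filter(lambda x: x.startswith("-f="), argv))
data_file = file_arg[0].replace("-f=", "") if file_arg else None
delim_arg = list(filter(lambda x: x.startswith("-d="), argv))
delimiter = delim_arg[0].replace("-d=", "") if delim_arg else ";"
print(data_file, delimiter)  # synthetic_cases/synthetic_3d_parity_problem.csv ;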
6 changes: 3 additions & 3 deletions merging.py
@@ -22,7 +22,7 @@ def break_points_number(macro_bin, IDs, ID_threshold):
:param ID_threshold:
:return:
'''
-# todo condition as in original IPD: ID > ID_threshold
+# todo old condition as in original IPD: ID > ID_threshold
ID_boolean = [1 if ID > ID_threshold else 0 for ID in IDs[macro_bin[:-1]]]
return sum(ID_boolean)

@@ -35,10 +35,10 @@ def compute_bin_cost(c, l, k, macro_bin, IDs, ID_threshold):
macro_bin_size_code = quasi_uniform_code(macro_bin_size)
break_points_size = break_points_number(macro_bin, IDs, ID_threshold)

-# todo in the original ipd L_disc L_N is computed for (k-1)
+# todo old in the original ipd L_disc L_N is computed for (k-1)
# L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)
-# todo in the original ipd L_disc L_N is computed for (k-1)
+# todo old in the original ipd L_disc L_N is computed for (k-1)
# L_disc_prev = - (quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
L_disc_prev = - (quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)

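The L_disc term above prices the discretization in MDL fashion: quasi_uniform_code(k - 1) is presumably a universal integer code for the number of macro bins (its definition lives elsewhere in the repository and is not reproduced here), and log2 C(c-1, k-1) encodes which placement of k-1 cut points among c-1 candidate positions was chosen. A small worked example of the combinatorial part (illustrative values; comb is assumed to come from scipy.special, older SciPy exposed it as scipy.misc.comb):

import math
from scipy.special import comb

c, k = 10, 3                          # 10 micro bins merged into 3 macro bins
cut_choices = comb(c - 1, k - 1)      # C(9, 2) = 36 ways to place the 2 cut points
bits = math.log(cut_choices, 2)
print(cut_choices, round(bits, 2))    # 36.0 5.17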
