adjusting to python 3.4.2
Tatiana Dembelova committed Aug 11, 2017
1 parent 49a3b2f commit f58cab5
Showing 10 changed files with 210 additions and 198 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@
 *.iml
 
 logs/*
+logs2/*
 *.png
 # Mobile Tools for Java (J2ME)
 .mtj.tmp/
4 changes: 4 additions & 0 deletions cjs.py
@@ -244,6 +244,10 @@ def compute_CJSs(bin_map, curr, data, dist_bins, dim_maxes):
     data_wo_curr.pop(curr)
     maxes_ = dim_maxes.drop(curr).to_frame().reset_index(drop=True)
 
+    return compute_CJSs1(bin_map, data_wo_curr, dist_bins, maxes_)
+
+
+def compute_CJSs1(bin_map, data_wo_curr, dist_bins, maxes_):
     cjs = []
     for bin_id, binn in enumerate(dist_bins[1:], start=1):
         bin_data = data_wo_curr.loc[bin_map == binn]
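Why the split: compute_CJSs now just drops the current dimension and delegates, so a caller that already holds the reduced DataFrame can go straight to compute_CJSs1. Below is a minimal preparation sketch; the toy values, bin labels, and bin ids are invented for illustration, and the final call is left commented because compute_CJSs1's loop body lies outside this diff.

    import pandas as pd

    # toy stand-ins (assumed shapes): 4 points, 2 dimensions
    data = pd.DataFrame({0: [0.1, 0.4, 0.8, 0.9], 1: [0.2, 0.3, 0.7, 0.5]})
    curr = 0                            # dimension currently under consideration
    bin_map = pd.Series([1, 1, 2, 2])   # assumed per-point bin labels
    dist_bins = [0, 1, 2]               # assumed bin ids; the loop skips index 0

    data_wo_curr = data.copy()
    data_wo_curr.pop(curr)              # drop the current dimension once, up front
    maxes_ = data.max().drop(curr).to_frame().reset_index(drop=True)

    # cjs = compute_CJSs1(bin_map, data_wo_curr, dist_bins, maxes_)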
9 changes: 7 additions & 2 deletions constants.py
@@ -15,7 +15,12 @@ class CorrelationMeasure(Enum):
     MAC = 3
 
 
-ID_THRESHOLD_QUANTILE = 0.8
+class DistanceMeasure(Enum):
+    ID = 1
+    CJS = 2
+
+
+ID_THRESHOLD_QUANTILE = 0.90
 ID_SLIDING_WINDOW = 40
 
 NORMALIZATION_RADIUS = 1
@@ -27,7 +32,7 @@ class CorrelationMeasure(Enum):
 # subspace mining parameters
 MAX_SUBSPACE_SIZE = 5
 HETEROGENEOUS_THRESHOLD=0.8
-BEAM_WIDTH=3
+BEAM_WIDTH=4
 
 # cjs
 CLUMP = 2
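The new DistanceMeasure enum sits alongside CorrelationMeasure and names the two distance notions used in this repo (interaction distance and CJS). A small dispatch sketch follows; the wiring is an assumption, not code from this commit — only the enum members and the two compute functions come from the diffs here.

    import constants as cst

    def distance_function(measure):
        # map a DistanceMeasure member to the matching compute function;
        # both target functions appear elsewhere in this commit's diffs
        if measure == cst.DistanceMeasure.ID:
            from interaction_distance import compute_IDs
            return compute_IDs
        if measure == cst.DistanceMeasure.CJS:
            from cjs import compute_CJSs
            return compute_CJSs
        raise ValueError("unknown distance measure: {}".format(measure))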
43 changes: 35 additions & 8 deletions data_generation.py
@@ -108,14 +108,41 @@ def synthetic_cjs():
                           axis=1)), axis=0)
 
 
+def append_irrelevant_features(file, n):
+    if n == 0:
+        raise ValueError("# of irrelevant features is 0")
+    data = pd.read_csv(file, delimiter=";", header=None, na_values='?')
+    rows = data.shape[0]
+    last_dim = data.shape[1] - 1
+    irrel_data = np.random.uniform(-0.5, 0.5, (rows, n))
+    return np.concatenate([data.loc[:, :last_dim - 1], irrel_data, data.loc[:, last_dim].to_frame()], axis=1) if (data[last_dim] == 0).all() \
+        else np.concatenate([data, irrel_data], axis=1)
+
+
+def generate():
+    # -------generating dataset
+    # data = synthetic_cube_in_cube(rows, rel_features, irrel_features, 'r')
+    # data__ = synthetic_cjs()
+    #
+    # # add zeroes as default class
+    # data = np.concatenate((data, np.zeros((rows, 1))), axis=1)
+    # -------appending irrelevant features to existing dataset
+    data = append_irrelevant_features(source, irrel_features)
+    # storing to disk
+    pd.DataFrame(data).to_csv(file, sep=';', header=False, index=False, float_format='%.2f')
+
+
 if __name__ == '__main__':
-    file = 'synthetic_cases/synthetic_cjs_nonsimilar.csv'
-    if os.path.isfile(file):
-        raise ValueError
+    for i in [1,2,3,4,5,10]:
+        file = 'synthetic_cases/synthetic_10d_parity_problem_' + str(i) + '.csv'
+        source = 'synthetic_cases/synthetic_10d_parity_problem.csv'
 
-    rows = 20000
-    # data__ = np.concatenate((synthetic_cube_in_cube(rows, 2, 0, 'r'), np.zeros((rows, 1))), axis=1)
-    # file = 'synthetic_cases/synthetic_3d_gauss2.csv'
-    data__ = synthetic_cjs()
+        if os.path.isfile(file):
+            raise ValueError(file + " already exists!")
+
+        # parameters
+        rows = 20000
+        rel_features = 2
+        irrel_features = 10
 
-    pd.DataFrame(data__).to_csv(file, sep=';', header=False, index=False, float_format='%.2f')
+        generate()
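The new append_irrelevant_features pads an existing dataset with uniform noise columns drawn from [-0.5, 0.5); if the last column is an all-zero class label, the noise is inserted before it rather than after. A toy run, with a hypothetical file name and made-up values:

    import pandas as pd

    from data_generation import append_irrelevant_features

    # two rows, two relevant features, trailing all-zero class column
    pd.DataFrame([[0.1, 0.2, 0.0],
                  [0.3, 0.4, 0.0]]).to_csv('toy.csv', sep=';', header=False, index=False)

    out = append_irrelevant_features('toy.csv', 2)
    # the last column was all zeros, so the two noise columns land before it:
    # [feature0, feature1, noise0, noise1, class]
    print(out.shape)  # (2, 5)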
38 changes: 19 additions & 19 deletions interaction_distance.py
@@ -8,25 +8,25 @@
 def compute_IDs(bin_map, curr, data, dist_bins, dim_maxes):
     data_wo_curr = data.copy()
     data_wo_curr.pop(curr)  # todo slow?
-    return _compute_IDs(bin_map, data_wo_curr, dim_maxes, dist_bins)
-
-
-def compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, cor_measure,
-                         method,
-                         k=cst.MAX_SUBSPACE_SIZE,
-                         delta=cst.HETEROGENEOUS_THRESHOLD,
-                         beam_width=cst.BEAM_WIDTH):
-    if method == cst.Method.GREEDY_TOPK:
-        subspace = sm.greedy_topk(data, curr, k, cor_measure)
-    elif method == cst.Method.HET_GREEDY_TOPK:
-        subspace = sm.het_greedy_topk(data, curr, k, delta, cor_measure)
-    # todo the rest of the methods
-    data = data.copy().loc[:, subspace]
-
-    return _compute_IDs(bin_map, data, dim_maxes, dist_bins)
-
-
-def _compute_IDs(bin_map, data, dim_maxes, dist_bins):
+    return compute_IDs1(bin_map, data_wo_curr, dim_maxes, dist_bins)
+#
+# deprecated
+# def compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, cor_measure,
+#                          method,
+#                          k=cst.MAX_SUBSPACE_SIZE,
+#                          delta=cst.HETEROGENEOUS_THRESHOLD,
+#                          beam_width=cst.BEAM_WIDTH):
+#     if method == cst.Method.GREEDY_TOPK:
+#         subspace = sm.greedy_topk(data, curr, k, cor_measure)
+#     elif method == cst.Method.HET_GREEDY_TOPK:
+#         subspace = sm.het_greedy_topk(data, curr, k, delta, cor_measure)
+#     # todo the rest of the methods
+#     data = data.copy().loc[:, subspace]
+#
+#     return compute_IDs1(bin_map, data, dim_maxes, dist_bins)
 
 
+def compute_IDs1(bin_map, data, dim_maxes, dist_bins):
     inner_bin_measures = []
     inter_bin_measures = []
     for bin_id, binn in enumerate(dist_bins):
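compute_IDs_extended is retired here, but its subspace trick survives: project the data onto a chosen column subset, then hand the projection to compute_IDs1 (the renamed _compute_IDs). A sketch under assumed inputs; the data and column subset are hypothetical, and the call is left commented because compute_IDs1's loop body lies outside this diff.

    import pandas as pd

    data = pd.DataFrame({0: [0.1, 0.9], 1: [0.2, 0.8],
                         2: [0.3, 0.7], 3: [0.4, 0.6]})  # toy 2x4 data
    subspace = [1, 3]                          # hypothetical column subset
    data_proj = data.copy().loc[:, subspace]   # same projection the deprecated code used
    # ids = compute_IDs1(bin_map, data_proj, dim_maxes, dist_bins)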
[Diffs for the remaining five changed files are not shown.]
