Added InteractionType, which can be CUBES or XOR

Added XorGenerator
tdembelo · Oct 19, 2017 · 8b0e2f8 · 8b0e2f8
1 parent f9ee841
commit 8b0e2f8
Show file tree

Hide file tree

Showing 8 changed files with 196 additions and 82 deletions.
diff --git a/ID_correlation_measure.py → ID_sm.py b/ID_correlation_measure.py → ID_sm.py
@@ -3,6 +3,7 @@
 import pandas as pd
 import interaction_distance as id
 import data_generator as dg
+import matplotlib.pyplot as plt
 
 def evidence_ID():
     # no interaction
@@ -35,29 +36,35 @@ def evidence_ID():
     # log.plot_data_2d(df)
     # log.plot_data_2d(pd.concat([df.loc[:100], df.loc[7900:8000]], axis=0))
 
-cg = dg.produce_cube_generator(7, 0, 2, "i", ".csv")
+cg = dg.produce_cube_generator(7, 0, 2, "i", 1, ".csv")  # args: rf=7, irf=0, interactions=2, type="i", cubes=1, file_name=".csv"
 data, filname = cg.build()  # build() yields the data plus a file name; `filname` is not used in the visible code
 print(cg.subspaces)
 print(cg.perf_disc)
 
 data = pd.DataFrame(data)
 dim_count = data.shape[1]  # number of columns in the generated data
-for curr in data[:-1]:
+for curr in range(dim_count - 1):  # every column except the last one
     dims = data.columns.tolist()
     dims.remove(curr)
     dims.remove(dim_count - 1)  # also drop the last column -- presumably the class label, TODO confirm
-    curr_data = data.sort_values(by=curr).reset_index(drop=True).loc[:, dims]
-    rows = curr_data.shape[0]
+    projected_data = data.sort_values(by=curr).reset_index()  # sort rows by `curr`; the old row labels land in column 'index'
+    curr_index = projected_data['index']  # sorted position -> original row label
+    projected_data = projected_data.loc[:, dims]  # keep only the remaining columns
+    rows = projected_data.shape[0]
     print('curr dimension', curr)
     for dim in dims:
         counter = 0
         ids = []
+        dim_x = []  # x-coordinates for the plot: value of `curr` where the two windows meet
         while(True):
-            if counter + 280 > rows:
+            if counter + 140 > rows:  # stop once fewer than 140 sorted rows remain
                 break
-            ids.append(id.compute_ID(curr_data.loc[counter:counter + 140, dim].to_frame(),
-                                     curr_data.loc[counter + 140: counter + 280, dim].to_frame(), [2] * dim_count))
-            counter += 1
+            ids.append(id.compute_ID(projected_data.loc[counter:counter + 70, dim].to_frame(),
+                                     projected_data.loc[counter + 70: counter + 140, dim].to_frame(), [2] * dim_count))
+            dim_x.append(data.loc[curr_index.loc[counter + 70], curr])  # value of `curr` at the window boundary
+            counter += 140  # NOTE(review): .loc slices are end-inclusive, so row counter+70 falls into both windows
         # TODO: needs data normalization
-        print('interaction with', dim, np.average(ids))
+        print('interaction with', dim, np.average(ids), sum([1 if ID > np.average(ids) else 0 for ID in ids]))
+        plt.plot(dim_x, ids)  # one point per window pair: ID value against the value of `curr`
+        plt.show()
     # break
diff --git a/constants.py b/constants.py
@@ -33,6 +33,11 @@ class DistanceMeasure(Enum):
     CJS = 2
 
 
+class InteractionType(Enum):  # selects which data generator data_generator.produce_dg creates
+    CUBES = 1  # handled by produce_cube_generator
+    XOR = 2  # handled by produce_xor_generator
+
+
 ID_THRESHOLD_QUANTILE = 0.3
 ID_SLIDING_WINDOW = 40
 
@@ -72,13 +77,15 @@ class DistanceMeasure(Enum):
 INTERACTIONS_LOWER_BOUND=3 if socket.gethostname() != 'push' else 1
 
 # new settings (more constrained)
+INTERACTION_TYPE_RANGE_LIST=[InteractionType.CUBES, InteractionType.XOR]
 IRRELEVANT_FEATURES_RANGE_LIST = [0, 1, 2, 4, 8, 16, 32, 64, 99, 3, 6, 12, 24, 48, 82] if socket.gethostname() != 'push' else [0, 1, 2, 3]
 RELEVANT_FEATURES_RANGE_LIST = [2, 3, 4, 6, 8, 12, 16, 23, 30] if socket.gethostname() == 'push' else [7]
 INTERACTION_NUMBER_RANGE_LIST = [1, 2, 4, 8, 10] if socket.gethostname() == 'push' else [3]
-INTERACTION_TYPES_RANGE_LIST=["c", 'i', "io"] if socket.gethostname() == 'push' else ['i']
+PARTITION_TYPES_RANGE_LIST=["c", 'i', "io"] if socket.gethostname() == 'push' else ['i']
 CUBES_LOWER_BOUND=1
 CUBES_UPPER_BOUND=3
 NAIVE_CHUNKS_NUMBER_RANGE_LIST = [2, 3, 4, 5, 10, 20, 30]
+XOR_SIGMA=0.1
 
 
 BASE = '/local/tmp/ipd_extended_experiments2/' if socket.gethostname() == 'push' \

diff --git a/data_generator.py b/data_generator.py
@@ -1,3 +1,5 @@
+from abc import abstractmethod
+
 import numpy as np
 import pandas as pd
 import random
@@ -22,20 +24,90 @@ def __init__(self, rows, loc=None):
         self.subspaces = []
 
 
-class CubesGenerator:
-    def __init__(self, rel_feature_count, irr_feature_count, radius, file_name):
-        self.rel_feature_count = rel_feature_count
-        self.file_name = file_name
-        self.cube_parameters = []
+class DataGenerator:  # common base class of XorGenerator and CubesGenerator
+    def __init__(self, file_name, rel_feature_count, irr_feature_count, radius):
+        self.radius = radius
         self.feature_count = rel_feature_count + irr_feature_count  # relevant + irrelevant features
+        self.irf = irr_feature_count
+        self.file_name = file_name
         self.dim_borders = [[-radius, radius] for d in range(self.feature_count)]  # one [-radius, radius] pair per feature
         self.subspaces = []
         self.perf_disc = [{d[1]} for d in self.dim_borders[:rel_feature_count]]  # one single-element set {radius} per relevant feature
 
     def __repr__(self):
-        return 'CubesGenerator(file_name=' + str(self.file_name) \
-               + ', rel_feature_count=' + str(self.rel_feature_count) \
-               + ', feature_count=' + str(self.feature_count) + ")"
+        return '(file_name=' + str(self.file_name) + ")"  # NOTE(review): the repr no longer names the class
+    # NOTE(review): @abstractmethod is only enforced when the metaclass is ABCMeta; this class has no such base, so it stays instantiable
+    @abstractmethod
+    def build(self):  # XorGenerator.build returns (data, file_name)
+        ...
+
+    @abstractmethod
+    def get_discs(self):  # XorGenerator returns self.perf_disc here
+        ...
+
+    @abstractmethod
+    def get_subspaces(self):  # XorGenerator returns self.subspaces here
+        ...
+
+
+class XorGenerator(DataGenerator):  # generates parity (XOR-style) data: rf relevant plus irf irrelevant features
+    def __init__(self, rf, irf, radius, rows, sigma, file_name):
+        super().__init__(file_name, rf, irf, radius)
+        self.rows = rows  # number of rows build() generates
+        self.slave_features = rf - 1  # relevant features drawn freely; the remaining one is the parity feature
+        self.sigma = sigma  # standard deviation of the Gaussian noise added in build(); falsy disables the noise
+        self.perf_disc = [[0, radius] for d in self.dim_borders[:rf]]  # overrides the base value: [0, radius] per relevant feature
+        self.subspaces = [[f for f in range(rf)]]  # a single subspace holding all relevant features
+
+    def get_discs(self):
+        return self.perf_disc
+
+    def get_subspaces(self):
+        return self.subspaces
+
+    # build(): returns (data, file_name); for rf > 1 data has rf + irf + 1 columns with the class label last
+    def build(self):
+        r_dims = np.random.uniform(-self.radius, self.radius, (self.rows, self.slave_features)) if self.slave_features > 0 else np.empty((self.rows, self.slave_features))
+        parity_dim = -(np.sum(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(self.rows, 1) \
+                     * np.random.uniform(0, self.radius, (self.rows, 1)) if self.slave_features > 0 else np.empty((self.rows, self.slave_features))
+        # parity_dim: magnitude uniform in [0, radius); sign is + for an even and - for an odd count of positive r_dims
+        irr_dims = np.random.uniform(-self.radius, self.radius, (self.rows, self.irf))
+
+        # xor_dict maps "<0/1 sign bits of r_dims><parity bit>" to a class label in 1..2**slave_features
+        xor_dict = dict()
+        counter = [0]  # one-element list so the nested function can mutate it
+        curr = []  # bit prefix built up by the recursion
+
+        def add_value(r):  # depth-first enumeration of all 0/1 patterns with r bits still to choose
+            if r == 0:
+                counter[0] += 1
+                xor_dict["".join([str(i) for i in curr]) + str(sum(curr) % 2)] = counter[0]
+                return
+
+            for i in [0, 1]:
+                curr.append(i)
+                add_value(r - 1)
+                curr.pop()
+        add_value(self.slave_features)
+
+        # look up every row's label from the signs of its r_dims (1 = positive) followed by the parity bit
+        class_labels = np.apply_along_axis(lambda a: xor_dict["".join([str(d) for d in a])], 1,
+                                           np.concatenate([np.array(r_dims > 0, dtype='int'),
+                                                           (np.sum(r_dims > 0, axis=1) % 2).reshape(self.rows, 1)],
+                                                          axis=1))
+        class_labels = class_labels.reshape([class_labels.shape[0], 1])  # turn the labels into a column
+        data = np.concatenate((r_dims, parity_dim, irr_dims, class_labels), axis=1)  # column order: r_dims, parity, irrelevant, label
+        if self.sigma:  # Gaussian noise on the feature columns; the label column gets zeros
+            e = np.concatenate((np.random.normal(0, self.sigma, (self.rows, self.slave_features + self.irf + 1)), np.zeros((self.rows, 1))), axis=1)
+            data = data + e  # NOTE(review): for rf == 1 parity_dim is (rows, 0), so e has one column more than data
+        return data, self.file_name
+
+class CubesGenerator(DataGenerator):
+    def __init__(self, rel_feature_count, irr_feature_count, radius, file_name):
+        super().__init__(file_name, rel_feature_count, irr_feature_count, radius)
+        self.rel_feature_count = rel_feature_count
+        self.cube_parameters = []
+
 
     def add_cube_parameter(self, cube_param):
         if cube_param.loc is None:
@@ -155,6 +227,10 @@ def produce_cube_generator(rf, irf, interactions, type, cubes, file_name):
     return dg
 
 
+def produce_xor_generator(rf, irf, file_name):  # factory: XorGenerator built with the module-level RADIUS and ROWS
+    return XorGenerator(rf, irf, RADIUS, ROWS, 0.1, file_name)  # NOTE(review): sigma is hard-coded to 0.1; constants.py defines XOR_SIGMA=0.1 -- presumably meant to be used here
+
+
 def produce_all_data_generators():
     data_generators = []
     global basedir
@@ -168,13 +244,17 @@ def produce_all_data_generators():
     perf_subspaces = dict()
     perf_discs = dict()
 
-    def produce_dg(name, rf, i, type, cubes):
+    def produce_dg(name, interaction_type, rf, i, type, cubes):
 
-        if os.path.exists(basedir + name) and os.path.exists(
-                                perf_disc_dir + 'cut_' + name.replace('csv', 'txt')):
+        if os.path.exists(basedir + name + ".csv") and os.path.exists(perf_disc_dir + 'cut_' + name + ".txt"):
             return
 
-        dg = produce_cube_generator(rf, cst.IRRELEVANT_FEATURES, i, type, cubes, name + ".csv")
+        if interaction_type == cst.InteractionType.CUBES:
+            dg = produce_cube_generator(rf, cst.IRRELEVANT_FEATURES, i, type, cubes, name + ".csv")
+        elif interaction_type == cst.InteractionType.XOR:
+            dg = produce_xor_generator(rf, cst.IRRELEVANT_FEATURES, name + ".csv")
+        else:
+            raise ValueError("no implementation of data generator for", interaction_type.name)
         perf_discs[name] = dg.get_discs()
         perf_subspaces[name] = dg.get_subspaces()
         data_generators.append(dg)
@@ -214,13 +294,10 @@ def store(data):
 
 
 if __name__ == '__main__':  # script entry point
-    cg = produce_cube_generator(7, 2, 3, 'c', 'bla')
-    print(cg.subspaces)
-    cg = produce_cube_generator(7, 2, 3, 'i', 'bla')
-    print(cg.subspaces)
+    # l.plot_data_3d(produce_xor_generator(3, 0, 'bla').build()[0])
     # print(generate_overlap_partition(7, 3))
 
-    # generators = produce_all_data_generators()
-    # for g in generators:
-    #
-    #     store(g.build())
+    generators = produce_all_data_generators()  # collect the data generators to run
+    for g in generators:
+
+        store(g.build())  # build each data set and hand the result to store()
diff --git a/discretization_quality_measure.py b/discretization_quality_measure.py
@@ -166,7 +166,7 @@ def prepare_compression1(experiment_name):
         return False
     return True
 
-def run_compression1(name, rf=None, i=None, type=None, c=None):
+def run_compression1(name, it=None, rf=None, i=None, type=None, c=None):
     # 1. check slim db
     # convert dat-file to db-file if it does not exist
     if not os.path.exists(cst.SLIM_DATA_DIR + name + "/" + name + ".db"):

diff --git a/experiments_logging.py b/experiments_logging.py
@@ -5,6 +5,7 @@
 from mpl_toolkits.mplot3d import Axes3D
 import data_generation as dg_old
 import util
+import numpy as np
 
 
 def plot_disc(problem, method):
@@ -41,18 +42,17 @@ def plot_data_3d(data):
     ax = fig.add_subplot(111, projection='3d')
     # data = data[np.logical_and(data[0] < 0, data[1] > 0)]
 
-    ## 3d parity problem
-    # color_cond = {'b': np.logical_and(data[0] < 0, np.logical_and(data[1] > 0, data[2] < 0)),
-    #               'k': np.logical_and(data[0] < 0, np.logical_and(data[1] > 0, data[2] > 0)),
-    #               'g': np.logical_and(data[0] > 0, data[1] < 0),
-    #               'r': np.logical_and(data[0] < 0, data[1] < 0),
-    #               'c': np.logical_and(data[0] > 0, data[1] > 0),
-    #               }
-    # for c in color_cond:
-    #     ax.scatter(data[0][color_cond[c]], data[1][color_cond[c]], data[2][color_cond[c]], c=c, s=1)
+    # 3d parity problem
+    color_cond = {'b': data[3] == 1,
+                  'k': data[3] == 2,
+                  'r': data[3] == 3,
+                  'g': data[3] == 4,
+                  }
+    for c in color_cond:
+        ax.scatter(data[0][color_cond[c]], data[1][color_cond[c]], data[2][color_cond[c]], c=c, s=1)
 
     ## without coloring
-    ax.scatter(data[0], data[1], data[2], c='k', s=1)
+    # ax.scatter(data[0], data[1], data[2], c='k', s=1)
 
     ax.set_xlabel('X0')
     ax.set_ylabel('X1')

diff --git a/main.py b/main.py
@@ -727,7 +727,7 @@ def prepare(base_dir, data_file, method, time_mark=False, delim=";", columns=Non
 
 def collect_experiment_params(base_dir):
 
-    def collect(name, rf, i, type, c):
+    def collect(name, it, rf, i, type, c):
         params = []
         file_path = cst.DATA_DIR + name + ".csv"
 
@@ -766,10 +766,13 @@ def collect(name, rf, i, type, c):
 
 
 if __name__ == "__main__":
+
+    # print(compute_predefined_subspace_sets_naive(5))
+    # exit(1)
     # cubes_03_10_c
     # print(compute_predefined_subspace_sets(3, [[0,1,2]]))
     # exit(1)
-    params = collect_experiment_params("logs_test3")
+    params = collect_experiment_params("logs_test")
     # print(params)
     # print(compute_subspace_sets("cubes_10_03_i.csv", cst.Method.PREDEFINED_SUBSPACESETS))
     # exit(1)

diff --git a/runExperiment.py b/runExperiment.py
@@ -94,18 +94,18 @@ def register_ideal_disc(self, name):
     loader = Loader()
 
     # todo items.put(WHATEVER PARAMETERS OF TASK)
-    # params = dg.produce_all_data_generators()
-    # for data_generator in params:
-    #     items.put(data_generator)
+    params = dg.produce_all_data_generators()
+    for data_generator in params:
+        items.put(data_generator)
 
-    params = main.collect_experiment_params("logs_test")
+    # params = main.collect_experiment_params("logs_test")
     if len(params) == 0:
         print("no parameters collected!")
         exit(0)
-    for param in params:
-        loader.register_dataset(param.data_file)
-        loader.register_ideal_disc(param.experiment_name)
-        items.put(param)
+    # for param in params:
+    #     loader.register_dataset(param.data_file)
+    #     loader.register_ideal_disc(param.experiment_name)
+    #     items.put(param)
 
     if onlyListTasks:
         while not items.empty():
@@ -143,8 +143,8 @@ def worker(worker_id):
             print('Worker ID ', worker_id, 'is executing', para)
             # todo generate data sets
 
-            # datasets.put(para.build())
-            datasets.put(main.execute(para, loader))
+            datasets.put(para.build())
+            # datasets.put(main.execute(para, loader))
             print('Worker ID ', worker_id, ' execution finished')
             with counterLock:
                 if runningMain:
@@ -161,8 +161,9 @@ def datasetWriter():
             while True:
                 try:
                     result = datasets.get(block=True, timeout=10)
-                    # dg.store(result)
-                    main.store(result, loader)
+                    # todo store
+                    dg.store(result)
+                    # main.store(result, loader)
                 except queue.Empty:
                     break