From f9ee841a5c79938049a2dbc3885e16b89ed0de9c Mon Sep 17 00:00:00 2001
From: Tatiana Dembelova
Date: Thu, 19 Oct 2017 13:25:22 +0200
Subject: [PATCH] added new baseline method, PREDEFINED_SUBSPACESETS_NAIVE

---
 ID_correlation_measure.py         | 63 ++++++++++++++++++++++++++++++
 constants.py                      | 12 ++++--
 data_generation.py                | 15 +++++++
 data_generator.py                 | 42 +++++++++++---------
 discretization_quality_measure.py |  2 +-
 experiments_logging.py            |  6 ++-
 main.py                           | 24 ++++++++++--
 runExperiment.py                  | 15 +++----
 run_classification.py             | 39 +++++++++++++++++++
 uds.py                            |  8 +++-
 util.py                           | 65 +++++++++++++++++++++++--------
 11 files changed, 238 insertions(+), 53 deletions(-)
 create mode 100644 ID_correlation_measure.py
 create mode 100644 run_classification.py

diff --git a/ID_correlation_measure.py b/ID_correlation_measure.py
new file mode 100644
index 0000000..f18d209
--- /dev/null
+++ b/ID_correlation_measure.py
@@ -0,0 +1,63 @@
+import numpy as np
+import experiments_logging as log
+import pandas as pd
+import interaction_distance as id
+import data_generator as dg
+
+def evidence_ID():
+    # no interaction
+    b = np.matrix(np.random.uniform(1, 2, (4000, 1)))
+    back = np.matrix(np.random.uniform(0, 2, (4000, 1)))
+    res = np.append(b, back, axis=0)
+    b1 = np.matrix(np.random.uniform(0, 2, (8000, 1)))
+
+    # either horizontal or vertical tube
+    # all = np.append(b1, res, axis=1)
+    all = np.append(res, b1, axis=1)
+    df = pd.DataFrame(all)
+    df = df.sort_values(by=0).reset_index(drop=True)
+    print(id.compute_ID(df.loc[:100, 1].to_frame(), df.loc[7900:8000, 1].to_frame(), [0, 2]))
+    # log.plot_data_2d(df)
+    # log.plot_data_2d(pd.concat([df.loc[:100], df.loc[7900:8000]], axis=0))
+
+    # cube interaction
+    b = np.matrix(np.random.uniform(1, 2, (4000, 1)))
+    back = np.matrix(np.random.uniform(0, 2, (4000, 1)))
+    res = np.append(b, back, axis=0)
+    b1 = np.matrix(np.random.uniform(1, 2, (4000, 1)))
+    back1 = np.matrix(np.random.uniform(0, 2, (4000, 1)))
+    res1 = np.append(b1, back1, axis=0)
+
+    all = np.append(res, res1, axis=1)
+    df = pd.DataFrame(all)
+    df = df.sort_values(by=0).reset_index(drop=True)
+    print(id.compute_ID(df.loc[:100, 1].to_frame(), df.loc[7900:8000, 1].to_frame(), [0, 2]))
+    # log.plot_data_2d(df)
+    # log.plot_data_2d(pd.concat([df.loc[:100], df.loc[7900:8000]], axis=0))
+
+cg = dg.produce_cube_generator(7, 0, 2, "i", 1, ".csv")  # cubes=1: one cube per interaction, matching the new signature
+data, filename = cg.build()
+print(cg.subspaces)
+print(cg.perf_disc)
+
+data = pd.DataFrame(data)
+dim_count = data.shape[1]
+for curr in data.columns[:-1]:  # every dimension except the last (label) column
+    dims = data.columns.tolist()
+    dims.remove(curr)
+    dims.remove(dim_count - 1)
+    curr_data = data.sort_values(by=curr).reset_index(drop=True).loc[:, dims]
+    rows = curr_data.shape[0]
+    print('curr dimension', curr)
+    for dim in dims:
+        counter = 0
+        ids = []
+        while True:
+            if counter + 280 > rows:
+                break
+            ids.append(id.compute_ID(curr_data.loc[counter:counter + 140, dim].to_frame(),
+                                     curr_data.loc[counter + 140: counter + 280, dim].to_frame(), [2] * dim_count))
+            counter += 1
+        # needs data normalization todo
+        print('interaction with', dim, np.average(ids))
+        # break
\ No newline at end of file
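The sliding-window loop above averages interaction distances over consecutive 140-row windows; the trailing todo notes that the data should be normalized first. A minimal sketch of the kind of min-max scaling the todo seems to call for (the helper name is hypothetical, not part of this patch):

    import pandas as pd

    def min_max_normalize(df: pd.DataFrame) -> pd.DataFrame:
        # scale every column to [0, 1]; guard against constant columns
        span = (df.max() - df.min()).replace(0, 1)
        return (df - df.min()) / span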
diff --git a/constants.py b/constants.py
index a8f79f8..e70c158 100644
--- a/constants.py
+++ b/constants.py
@@ -15,6 +15,7 @@ class Method(Enum):
     PREDEFINED_SUBSPACESETS = 9 # the subspace sets gradually increase the number of dimensions in one of the subspaces chosen randomly; subspace sets are chosen with a step of 2
     PREDEFINED_SUBSPACESETS_SYNCHRONOUS_GREEDY = 13 # the subspace sets gradually increase the number of dimensions in all the subspaces; if a subspace has been used up, it extends to irrelevant dimensions
     PREDEFINED_SUBSPACESETS_SYNCHRONOUS_OPTIMAL = 14 # the subspace sets gradually increase the number of dimensions in all the subspaces; if a subspace has been used up, it stays the same
+    PREDEFINED_SUBSPACESETS_NAIVE = 15 # the dimensions are shuffled and split into equal-sized chunks, one subspace set per chunk size
     PREDEFINED_OPTIMAL_SUBSPACESET = 10
     PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT = 11
@@ -72,9 +73,12 @@ class DistanceMeasure(Enum):
 
 # new settings (more constrained)
 IRRELEVANT_FEATURES_RANGE_LIST = [0, 1, 2, 4, 8, 16, 32, 64, 99, 3, 6, 12, 24, 48, 82] if socket.gethostname() != 'push' else [0, 1, 2, 3]
-RELEVANT_FEATURES_RANGE_LIST = [2, 3, 4, 6, 8, 12, 16, 23, 30] if socket.gethostname() == 'push' else [2]
+RELEVANT_FEATURES_RANGE_LIST = [2, 3, 4, 6, 8, 12, 16, 23, 30] if socket.gethostname() == 'push' else [7]
 INTERACTION_NUMBER_RANGE_LIST = [1, 2, 4, 8, 10] if socket.gethostname() == 'push' else [3]
-INTERACTION_TYPES_RANGE_LIST=["c", 'i', "io"] if socket.gethostname() == 'push' else ['c']
+INTERACTION_TYPES_RANGE_LIST=["c", 'i', "io"] if socket.gethostname() == 'push' else ['i']
+CUBES_LOWER_BOUND=1
+CUBES_UPPER_BOUND=3
+NAIVE_CHUNKS_NUMBER_RANGE_LIST = [2, 3, 4, 5, 10, 20, 30]  # chunk sizes for the naive baseline
 
 
 BASE = '/local/tmp/ipd_extended_experiments2/' if socket.gethostname() == 'push' \
@@ -91,4 +95,6 @@ class DistanceMeasure(Enum):
 SLIM_CONVERT_CONF = SLIM_BASE + "branches/slim/trunk/convertdb.conf"
 
 PRECISION_RECALL_FILENAME = "Precision_recall_runtime.csv"
-COMPRESSION_FILENAME = "Compression.csv"
\ No newline at end of file
+COMPRESSION_FILENAME = "Compression.csv"
+
+WEKA_BIN = "/local/tmp/ipd_extended_experiments2/weka/weka-3-9-1/weka.jar" if socket.gethostname() == 'push' else "/Users/tatyanadembelova/Downloads/weka-3-9-1/weka.jar"
\ No newline at end of file
diff --git a/data_generation.py b/data_generation.py
index cf61838..fe6c696 100644
--- a/data_generation.py
+++ b/data_generation.py
@@ -8,17 +8,32 @@
 
 # synthetic case from uds
 def correlated_data(m, n, sigma, f):
+    # l1 = int(n / 2)
+    # l2 = n - l1
+    # Z = np.random.normal(0, 1, (m, l1))
+    # A = np.matrix(np.random.uniform(1, 2, (l1, l1)))
+    # X1 = Z * A
+    # B = np.matrix(np.random.uniform(1, 2, (l1, l2)))
+    # W = X1 * B
+    # E = np.random.normal(0, sigma, (m, l2))
+    # X2 = f(W) + E
+    # result = np.append(X1, X2, axis=1)
+    # print(result)
+
     l1 = int(n / 2)
     l2 = n - l1
     Z = np.random.normal(0, 1, (m, l1))
     A = np.matrix(np.random.uniform(1, 2, (l1, l1)))
     X1 = Z * A
+    # A = np.matrix(np.random.uniform(1, 2, (m, l1)))
+    # X1 = A
     B = np.matrix(np.random.uniform(1, 2, (l1, l2)))
     W = X1 * B
     E = np.random.normal(0, sigma, (m, l2))
     X2 = f(W) + E
     result = np.append(X1, X2, axis=1)
     print(result)
+    return result
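With the added `return result`, `correlated_data` can now be used directly instead of only printing, as experiments_logging.py and uds.py do further down. A minimal usage sketch (assuming `func3` is one of the link functions already defined in data_generation.py, as its use below suggests):

    import numpy as np
    import data_generation as dg

    # 4000 rows, 2 columns: X1 is a linear mix of Gaussians, X2 = func3(X1 * B) + noise, sigma = 0.1
    data = np.asarray(dg.correlated_data(4000, 2, 0.1, dg.func3))
    print(data.shape)  # (4000, 2)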
generate_overlap_partition(rf, c) + partition = generate_overlap_partition(rf, interactions) else: raise ValueError("no such type!") for p in partition: - location = dict() - for j in p: - location[j] = (random.uniform(0, 1) * (RADIUS * 2 - 1) - RADIUS, CUBE_WIDTH) - dg.add_cube_parameter(CubeParameters(cube_rows, location)) + for cube in range(cubes): + location = dict() + for j in p: + location[j] = (random.uniform(0, 1) * (RADIUS * 2 - 1) - RADIUS, CUBE_WIDTH) + dg.add_cube_parameter(CubeParameters(cube_rows, location)) dg.add_cube_parameter(CubeParameters(cube_rows)) return dg @@ -167,13 +168,13 @@ def produce_all_data_generators(): perf_subspaces = dict() perf_discs = dict() - def produce_dg(name, rf, c, type): + def produce_dg(name, rf, i, type, cubes): - # if os.path.exists(basedir + name) and os.path.exists( - # perf_disc_dir + 'cut_' + name.replace('csv', 'txt')): - # continue + if os.path.exists(basedir + name) and os.path.exists( + perf_disc_dir + 'cut_' + name.replace('csv', 'txt')): + return - dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name + ".csv") + dg = produce_cube_generator(rf, cst.IRRELEVANT_FEATURES, i, type, cubes, name + ".csv") perf_discs[name] = dg.get_discs() perf_subspaces[name] = dg.get_subspaces() data_generators.append(dg) @@ -213,8 +214,13 @@ def store(data): if __name__ == '__main__': + cg = produce_cube_generator(7, 2, 3, 'c', 'bla') + print(cg.subspaces) + cg = produce_cube_generator(7, 2, 3, 'i', 'bla') + print(cg.subspaces) # print(generate_overlap_partition(7, 3)) - generators = produce_all_data_generators() - for g in generators: - store(g.build()) + # generators = produce_all_data_generators() + # for g in generators: + # + # store(g.build()) diff --git a/discretization_quality_measure.py b/discretization_quality_measure.py index 49fe9d7..7ac065c 100644 --- a/discretization_quality_measure.py +++ b/discretization_quality_measure.py @@ -166,7 +166,7 @@ def prepare_compression1(experiment_name): return False return True -def run_compression1(name, rf=None, c=None, type=None): +def run_compression1(name, rf=None, i=None, type=None, c=None): # 1. 
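The new `cubes` parameter of `produce_cube_generator` multiplies the number of generated hypercubes: each of the `interactions` subspace interactions now receives `cubes` cubes, and the record budget is split evenly across all cubes plus the background noise. A quick worked example of the row arithmetic (ROWS = 7000 is an assumed value, not the constant from the repository):

    rows = 7000                   # assumed total record budget
    interactions, cubes = 3, 2    # 3 interactions, 2 cubes each
    cube_rows = rows // (interactions * cubes + 1)
    print(cube_rows)              # 1000 records per cube, and 1000 for the background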
diff --git a/experiments_logging.py b/experiments_logging.py
index 3272d2b..6e5a877 100644
--- a/experiments_logging.py
+++ b/experiments_logging.py
@@ -1,6 +1,7 @@
 import matplotlib.pyplot as plt
 import pandas as pd
 import discretization_quality_measure as dq
+import data_generation as dg
 from mpl_toolkits.mplot3d import Axes3D
 import data_generation as dg_old
 import util
@@ -86,7 +87,7 @@ def save_plot_data_3d(f, data):
 
 
 def plot_data_2d(data):
-    plt.scatter(data[1], data[2], s=1, c='k')
+    plt.scatter(data[0], data[1], s=1, c='k')
     plt.xlabel("dim 0")
     plt.ylabel("dim 1")
     plt.show()
@@ -158,7 +159,8 @@ def get_cuts(disc_intervals):
     # data = pd.read_csv("synthetic_cases/blobs/3d_3_blobs_aligned.csv", delimiter=";", header=None, na_values='?')
     # data = pd.read_csv("new_cubes/cubes_10_100_03_i.csv", delimiter=";", header=None, na_values='?')
     # data = pd.read_csv("new_cubes/cubes_02_03_c.csv", delimiter=";", na_values='?', header=None)
-    data = pd.read_csv('synthetic_cases/uds_new.csv', delimiter=',', header=None)
+    # data = pd.read_csv('synthetic_cases/uds_new.csv', delimiter=',', header=None)
+    data = pd.DataFrame(dg.correlated_data(4000, 2, 0.1, dg.func3))
     # data = pd.DataFrame(dg_old.correlated_data(4000, 3, 0.5, dg_old.func3))
     # data = pd.DataFrame(dg.cubes(4000))
     plot_data_2d(data)
diff --git a/main.py b/main.py
index 14bd5c0..dd4a9e5 100644
--- a/main.py
+++ b/main.py
@@ -447,6 +447,20 @@ def compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subs
     return subspace_sets
 
 
+def compute_predefined_subspace_sets_naive(rel_features):
+    dims = [i for i in range(rel_features + cst.IRRELEVANT_FEATURES)]
+    random.shuffle(dims)
+    subspace_sets = []
+    for chunk_size in cst.NAIVE_CHUNKS_NUMBER_RANGE_LIST:
+        ss = list(util.chunks(dims, chunk_size))
+        # merge the last chunk into the previous subspace if it consists of only 1 dimension
+        if len(ss[-1]) == 1:
+            ss[-2].extend(ss[-1])
+            del ss[-1]
+        subspace_sets.append(ss)
+    return subspace_sets
+
+
 def compute_subspace_sets(data_file_name, method):
     rel_features = util.parse_relevant_features(data_file_name)
     ideal_subspace_set = get_ideal_subspace_set(data_file_name)
@@ -476,9 +490,11 @@ def compute_subspace_sets(data_file_name, method):
         return compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subspace_set, True)
     elif method is cst.Method.PREDEFINED_SUBSPACESETS_SYNCHRONOUS_OPTIMAL:
         return compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subspace_set, False)
+    elif method is cst.Method.PREDEFINED_SUBSPACESETS_NAIVE:
+        return compute_predefined_subspace_sets_naive(rel_features)
     else:
-        raise ValueError("wrong method!")
+        raise ValueError("the method has not been implemented yet! " + str(method))
 
 
 def execute(param, loader=None):
@@ -709,9 +725,9 @@ def prepare(base_dir, data_file, method, time_mark=False, delim=";", columns=Non
     return params
 
 
-def collect_dataset_params(base_dir):
+def collect_experiment_params(base_dir):
 
-    def collect(name, rf, c, type):
+    def collect(name, rf, i, type, c):
         params = []
 
         file_path = cst.DATA_DIR + name + ".csv"
@@ -753,7 +769,7 @@ def collect(name, rf, c, type):
     # cubes_03_10_c
     # print(compute_predefined_subspace_sets(3, [[0,1,2]]))
     # exit(1)
-    params = collect_dataset_params("logs_test3")
+    params = collect_experiment_params("logs_test3")
     # print(params)
     # print(compute_subspace_sets("cubes_10_03_i.csv", cst.Method.PREDEFINED_SUBSPACESETS))
     # exit(1)
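Put together, the naive baseline shuffles all relevant and irrelevant dimensions once and cuts them into fixed-size chunks, producing one subspace set per chunk size. A self-contained sketch of the same logic, re-implemented standalone for illustration (without the cst/util dependencies of the patched function):

    import random

    def naive_subspace_sets(rel_features, irrelevant_features, chunk_sizes):
        dims = list(range(rel_features + irrelevant_features))
        random.shuffle(dims)
        subspace_sets = []
        for n in chunk_sizes:
            ss = [dims[j:j + n] for j in range(0, len(dims), n)]
            if len(ss[-1]) == 1:          # merge a singleton tail into its neighbour
                ss[-2].extend(ss.pop())
            subspace_sets.append(ss)
        return subspace_sets

    # e.g. rel_features=7, IRRELEVANT_FEATURES=2, chunk sizes 2 and 3:
    print(naive_subspace_sets(7, 2, [2, 3]))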
" + method) def execute(param, loader=None): @@ -709,9 +725,9 @@ def prepare(base_dir, data_file, method, time_mark=False, delim=";", columns=Non return params -def collect_dataset_params(base_dir): +def collect_experiment_params(base_dir): - def collect(name, rf, c, type): + def collect(name, rf, i, type, c): params = [] file_path = cst.DATA_DIR + name + ".csv" @@ -753,7 +769,7 @@ def collect(name, rf, c, type): # cubes_03_10_c # print(compute_predefined_subspace_sets(3, [[0,1,2]])) # exit(1) - params = collect_dataset_params("logs_test3") + params = collect_experiment_params("logs_test3") # print(params) # print(compute_subspace_sets("cubes_10_03_i.csv", cst.Method.PREDEFINED_SUBSPACESETS)) # exit(1) diff --git a/runExperiment.py b/runExperiment.py index 25f5109..304630d 100755 --- a/runExperiment.py +++ b/runExperiment.py @@ -18,6 +18,7 @@ import main import pandas as pd import discretization_quality_measure as dqm +import data_generator as dg newRun = None nbThreads = int(multiprocessing.cpu_count() / 2) @@ -48,11 +49,6 @@ items = multiprocessing.Queue() -# todo items.put(WHATEVER PARAMETERS OF TASK) -# data_generators = dg.produce_all_data_generators() -# for data_generator in data_generators: -# items.put(data_generator) - class UnregisteredItem(Exception): pass @@ -97,7 +93,12 @@ def register_ideal_disc(self, name): loader = Loader() - params = main.collect_dataset_params("logs_test") + # todo items.put(WHATEVER PARAMETERS OF TASK) + # params = dg.produce_all_data_generators() + # for data_generator in params: + # items.put(data_generator) + + params = main.collect_experiment_params("logs_test") if len(params) == 0: print("no parameters collected!") exit(0) @@ -160,7 +161,7 @@ def datasetWriter(): while True: try: result = datasets.get(block=True, timeout=10) - # dg.store(dataset) + # dg.store(result) main.store(result, loader) except queue.Empty: break diff --git a/run_classification.py b/run_classification.py new file mode 100644 index 0000000..7c8f9ec --- /dev/null +++ b/run_classification.py @@ -0,0 +1,39 @@ +import constants as cst +import subprocess as sp +import re +import os + + +def run_random_forest1(base_dir_name, experiment_name): + file_path = cst.BASE + base_dir_name + "/" + experiment_name + "/out.arff" + if not os.path.exists(file_path): + return None + try: + output = str(sp.check_output(["java", "-cp", cst.WEKA_BIN, + "weka.classifiers.trees.RandomForest", '-P', '100', '-I', + '100', '-num-slots', '1', '-K', '0', '-M', '1.0', '-V', '0.001', '-S', '1', + "-t", file_path], timeout=30)) + match = re.search('Correctly Classified Instances\s+\d+\s+(\d+\.\d+)\s+%', output) + if match: + return experiment_name + "," + match.group(1) + return experiment_name + ",?" + except sp.TimeoutExpired: + print("timeout exceeded", experiment_name) + return experiment_name + ",?" 
diff --git a/uds.py b/uds.py
index cf79686..f2ddf43 100644
--- a/uds.py
+++ b/uds.py
@@ -6,6 +6,7 @@
 import data_generation as dg
 from correlation_measures.binning import Binning
 from data_generation import correlated_data
+import experiments_logging as log
 
 # bins count
 UDS_BETA = 20
@@ -151,8 +152,10 @@ def compute_uds(data):
 
 
 if __name__ == "__main__":
-    # data = pd.read_csv('synthetic_cases/uds_new.csv', delimiter=',')  # uds_new.csv 0.361766479055
-    data = pd.read_csv('new_cubes/cubes_02_03_c.csv', delimiter=';', header=None)  # uds_new.csv 0.361766479055
+    # data = pd.read_csv('synthetic_cases/uds_new.csv', delimiter=',', header=None)  # uds_new.csv 0.361766479055
+    data = pd.DataFrame(dg.correlated_data(4000, 2, 0.1, dg.func2))
+
+    # data = pd.read_csv('new_cubes/cubes_02_03_c.csv', delimiter=';', header=None)
     data = data.loc[:, :3]
     # data = pd.DataFrame(correlated_data(4000, 20, 10, dg.func2))
     # data = pd.DataFrame(generate_correlated_data(1000, 10, 2, func1))
@@ -161,6 +164,7 @@ def compute_uds(data):
 
     uds = compute_uds(data)
     print(uds)
+    log.plot_data_2d(data)
 
     # print(es)
 
     # compute
diff --git a/util.py b/util.py
index a2b6a3f..e60abd1 100644
--- a/util.py
+++ b/util.py
@@ -26,20 +26,53 @@ def collect_params(f):
     # relevant features 2 - 30
     # for rf in range(cst.RELEVANT_FEATURES_LOWER_BOUND, cst.RELEVANT_FEATURES_UPPER_BOUND):
     for rf in cst.RELEVANT_FEATURES_RANGE_LIST:
-        # cubes 1 - 10
-        # for c in range(cst.CUBES_LOWER_BOUND, cst.CUBES_UPPER_BOUND):
-        for c in cst.INTERACTION_NUMBER_RANGE_LIST:
-            # cube types complete, incomplete, incomplete overlapping
+        # interactions 1 - 10
+        for i in cst.INTERACTION_NUMBER_RANGE_LIST:
+
+            # interaction types:
+            # c - one interaction over all relevant features
+            # i - partition of the relevant features into i non-overlapping interactions
+            # io - partition of the relevant features into i overlapping interactions
             for t in cst.INTERACTION_TYPES_RANGE_LIST:
-                if (c == 1 or rf / c < 2) and t != 'c':
-                    continue
-                dataset_name = 'cubes_' + '{0:02d}'.format(rf) + '_' \
-                               + '{0:02d}'.format(c) + '_' \
-                               + t
-                param = f(dataset_name, rf, c, t)
-                print('collected param:', param)
-                if type(param) == list:
-                    params.extend(param)
-                else:
-                    params.append(param)
-    return params
\ No newline at end of file
+                # cube number in each of the interactions
+                # todo random cube number in the constraints
+                for c in range(cst.CUBES_LOWER_BOUND, cst.CUBES_UPPER_BOUND):
+                    # only the full set of relevant features is possible
+                    if t == 'c' and i > 1:
+                        continue
+                    if (i == 1 or rf / i < 2) and t != 'c':
+                        continue
+                    dataset_name = construct_dataset_name(i, rf, t, c)
+                    param = f(dataset_name, rf, i, t, c)
+                    print('collected param:', param)
+                    if not param:
+                        continue
+                    if type(param) == list:
+                        params.extend(param)
+                    else:
+                        params.append(param)
+    return params
+
+
+def construct_dataset_name(i, rf, t, c):
+    if t == 'c':
+        # e.g. returns cubes_07_03_c.csv, where 3 is the number of cubes
+        return 'cubes_' + '{0:02d}'.format(rf) + '_' \
+               + '{0:02d}'.format(c) + '_' \
+               + t
+    if c == 1:
+        # e.g. returns cubes_07_03_i.csv, where 3 is the number of interactions with 1 cube each
+        return 'cubes_' + '{0:02d}'.format(rf) + '_' \
+               + '{0:02d}'.format(i) + '_' \
+               + t
+
+    # e.g. returns cubes_07_03_02_i.csv, where 3 is the number of interactions with 2 cubes each
+    return 'cubes_' + '{0:02d}'.format(rf) + '_' \
+           + '{0:02d}'.format(i) + '_' \
+           + '{0:02d}'.format(c) + '_' \
+           + t
+
+
+def chunks(l, n):
+    """Yield successive n-sized chunks from l."""
+    for i in range(0, len(l), n):
+        yield l[i:i + n]
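For illustration, the new naming scheme and the chunking helper behave as follows (a usage sketch, assuming util.py is on the import path):

    from util import construct_dataset_name, chunks

    print(construct_dataset_name(3, 7, 'i', 1))   # cubes_07_03_i
    print(construct_dataset_name(3, 7, 'i', 2))   # cubes_07_03_02_i
    print(list(chunks(list(range(7)), 3)))        # [[0, 1, 2], [3, 4, 5], [6]]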