Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
all the discretization quality measures are calculated and stored in the storing phase
  • Loading branch information
Tatiana Dembelova committed Oct 14, 2017
1 parent 2585898 commit b5d861f
Show file tree
Hide file tree
Showing 9 changed files with 456 additions and 299 deletions.
8 changes: 6 additions & 2 deletions commands.txt
Expand Up @@ -2,7 +2,7 @@ ssh tdembelo@contact.mmci.uni-saarland.de

ssh tdembelo@push.mmci.uni-saarland.de

rsync -av --exclude '.idea/' --exclude '.git' --exclude='logs*' --exclude "*.png" --exclude "data/*" --exclude "tableau/" /Users/tatyanadembelova/Documents/study/thesis/ipd_extended/ tdembelo@badr.mpi-inf.mpg.de:/home/tdembelo/ipd_extended/
rsync -av --exclude 'ideal_disc/' --exclude 'synthetic_cases/' --exclude '.idea/' --exclude '.git' --exclude='logs*' --exclude "*.png" --exclude "data/*" --exclude "tableau/" --exclude "new_cubes/" /Users/tatyanadembelova/Documents/study/thesis/ipd_extended/ tdembelo@contact.mmci.uni-saarland.de:/home/tdembelo/ipd_extended/

rsync -av tdembelo@badr.mpi-inf.mpg.de:/home/tdembelo/ipd_extended/logs_quality/ /Users/tatyanadembelova/Documents/study/thesis/ipd_extended/logs_quality/

Expand Down Expand Up @@ -44,4 +44,8 @@ for f in *_*_T*.csv; do mv $f "${f/*_*_T/T}" ;done
for f in *_*_CJS*.csv; do mv $f "${f/*_*_C/C}" ;done
for f in *_*_CJS*.csv; do echo mv $f "${f/*_*_C/C}" ;done

rsync -av --exclude 'data*' /Users/tatyanadembelova/Documents/study/thesis/code-fic/ tdembelo@push.mmci.uni-saarland.de:/home/tdembelo/code-fic/
rsync -av --exclude 'data*' /Users/tatyanadembelova/Documents/study/thesis/code-fic/ tdembelo@push.mmci.uni-saarland.de:/home/tdembelo/code-fic/

# slim
./bootstrap.sh
make -Cbuild install
9 changes: 6 additions & 3 deletions constants.py
Expand Up @@ -15,7 +15,7 @@ class Method(Enum):
PREDEFINED_SUBSPACESETS = 9
PREDEFINED_OPTIMAL_SUBSPACESET = 10
PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT = 11
FULL = 11
FULL = 12


class CorrelationMeasure(Enum):
Expand Down Expand Up @@ -50,7 +50,7 @@ class DistanceMeasure(Enum):
SUBSPACE_SET_STEP = 2

# todo change later
IRRELEVANT_FEATURES = 3
IRRELEVANT_FEATURES = 4

BASE = '/local/tmp/ipd_extended_experiments2/' if socket.gethostname() == 'push' \
else '/Users/tatyanadembelova/Documents/study/thesis/ipd_extended/'
Expand All @@ -62,4 +62,7 @@ class DistanceMeasure(Enum):
SLIM_DATA_DIR = SLIM_BASE + "data/"
SLIM_BIN = SLIM_BASE + "branches/slim/trunk/fic"
SLIM_COMPRESS_CONF = SLIM_BASE + "branches/slim/trunk/compress.conf"
SLIM_CONVERT_CONF = SLIM_BASE + "branches/slim/trunk/convertdb.conf"
SLIM_CONVERT_CONF = SLIM_BASE + "branches/slim/trunk/convertdb.conf"

PRECISION_RECALL_FILENAME = "Precision_recall_runtime.csv"
COMPRESSION_FILENAME = "Compression.csv"
68 changes: 32 additions & 36 deletions data_generator.py
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd
import random
import util
import time
import os
import json
Expand All @@ -13,6 +14,7 @@
ROWS = 6000
OVERLAP_PROBABILITY = 0.6


class CubeParameters:
def __init__(self, rows, loc=None):
self.rows = rows
Expand All @@ -21,13 +23,19 @@ def __init__(self, rows, loc=None):


class CubesGenerator:
def __init__(self, feature_count, radius, file_name):
def __init__(self, rel_feature_count, irr_feature_count, radius, file_name):
self.rel_feature_count = rel_feature_count
self.file_name = file_name
self.cube_parameters = []
self.feature_count = feature_count
self.dim_borders = [[-radius, radius] for d in range(feature_count)]
self.feature_count = rel_feature_count + irr_feature_count
self.dim_borders = [[-radius, radius] for d in range(self.feature_count)]
self.subspaces = []
self.perf_disc = [{d[1]} for d in self.dim_borders]
self.perf_disc = [{d[1]} for d in self.dim_borders[:rel_feature_count]]

def __repr__(self):
    """Return a debug string naming this generator's key parameters."""
    fields = (
        'file_name=' + str(self.file_name),
        'rel_feature_count=' + str(self.rel_feature_count),
        'feature_count=' + str(self.feature_count),
    )
    return 'CubesGenerator(' + ', '.join(fields) + ')'

def add_cube_parameter(self, cube_param):
if cube_param.loc is None:
Expand All @@ -37,10 +45,11 @@ def add_cube_parameter(self, cube_param):
s = list(location_params.keys())
if s and not s in self.subspaces:
self.subspaces.append(s)
for feat in range(self.feature_count):

# perfect discretization
for feat in range(self.rel_feature_count):
if feat in cube_param.loc.keys():
dim_params = location_params[feat]
# perfect discretization
if dim_params[0] != -RADIUS:
self.perf_disc[feat].add(dim_params[0])
self.perf_disc[feat].add(dim_params[0] + dim_params[1])
Expand All @@ -61,6 +70,7 @@ def build(self):
cube = []
for feat in range(self.feature_count):
if feat in location_params.keys():
assert feat < self.rel_feature_count
dim_params = location_params[feat]
if dim_params[0] < self.dim_borders[feat][0] \
or dim_params[0] + dim_params[1] > self.dim_borders[feat][1]:
Expand Down Expand Up @@ -122,9 +132,8 @@ def generate_overlap_partition(rf, c):
return partition


def produce_data_generator(rf, irf, c, type, name):
total_f = rf + irf
dg = CubesGenerator(total_f, RADIUS, name)
def produce_data_generator(rf, irf, c, type, file_name):
dg = CubesGenerator(rf, irf, RADIUS, file_name)
# same number of records for each of the cubes + background
cube_rows = int(ROWS / (c + 1))
if type == 'c':
Expand Down Expand Up @@ -158,34 +167,21 @@ def produce_all_data_generators():

perf_subspaces = dict()
perf_discs = dict()
# relevant features 2 - 30
for rf in range(2, 3):
# cubes 1 - 10
for c in range(3, 4):
# cube types complete, incomplete, incomplete overlapping
for type in ['c']:

# relevant features 2 - 30
# for rf in range(2, 31):
# # cubes 1 - 10
# for c in range(1, 11):
# # cube types complete, incomplete, incomplete overlapping
# for type in ['c', 'i', 'io']:
if (c == 1 or rf / c < 2) and type != 'c':
continue
name = 'cubes_' + '{0:02d}'.format(rf) + '_' \
+ '{0:02d}'.format(c) + '_' \
+ type + '.csv'
# if os.path.exists(basedir + name) and os.path.exists(
# perf_disc_dir + 'cut_' + name.replace('csv', 'txt')):
# continue

dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name)
perf_discs[name] = dg.get_discs()
perf_subspaces[name] = dg.get_subspaces()
data_generators.append(dg)

def produce_dg(name, rf, c, type):

# if os.path.exists(basedir + name) and os.path.exists(
# perf_disc_dir + 'cut_' + name.replace('csv', 'txt')):
# continue

dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name + ".csv")
perf_discs[name] = dg.get_discs()
perf_subspaces[name] = dg.get_subspaces()
data_generators.append(dg)

util.collect_params(produce_dg)
for name in perf_discs:
write_cut_file(perf_disc_dir + 'cut_' + name.replace('csv', 'txt'), perf_discs[name])
write_cut_file(perf_disc_dir + 'cut_' + name + ".txt", perf_discs[name])
with open(perf_subspaces_file, 'w') as psf:
json.dump(perf_subspaces, psf)
return data_generators
Expand Down

0 comments on commit b5d861f

Please sign in to comment.