From b5d861f40499c1fde5a7962339d5754dd4fc618a Mon Sep 17 00:00:00 2001 From: Tatiana Dembelova Date: Sat, 14 Oct 2017 17:52:25 +0200 Subject: [PATCH] all the discretization quality measures are calculated and stored in the storing phase --- commands.txt | 8 +- constants.py | 9 +- data_generator.py | 68 ++++--- discretization_quality_measure.py | 276 ++++++++++++++++------------ experiments_logging.py | 29 ++- main.py | 292 +++++++++++++++++------------- old/temp_exp.py | 2 - runExperiment.py | 41 ++++- util.py | 30 ++- 9 files changed, 456 insertions(+), 299 deletions(-) diff --git a/commands.txt b/commands.txt index 9ee0fef..a7b1b0d 100644 --- a/commands.txt +++ b/commands.txt @@ -2,7 +2,7 @@ ssh tdembelo@contact.mmci.uni-saarland.de ssh tdembelo@push.mmci.uni-saarland.de -rsync -av --exclude '.idea/' --exclude '.git' --exclude='logs*' --exclude "*.png" --exclude "data/*" --exclude "tableau/" /Users/tatyanadembelova/Documents/study/thesis/ipd_extended/ tdembelo@badr.mpi-inf.mpg.de:/home/tdembelo/ipd_extended/ +rsync -av --exclude 'ideal_disc/' --exclude 'synthetic_cases/' --exclude '.idea/' --exclude '.git' --exclude='logs*' --exclude "*.png" --exclude "data/*" --exclude "tableau/" --exclude "new_cubes/" /Users/tatyanadembelova/Documents/study/thesis/ipd_extended/ tdembelo@contact.mmci.uni-saarland.de:/home/tdembelo/ipd_extended/ rsync -av tdembelo@badr.mpi-inf.mpg.de:/home/tdembelo/ipd_extended/logs_quality/ /Users/tatyanadembelova/Documents/study/thesis/ipd_extended/logs_quality/ @@ -44,4 +44,8 @@ for f in *_*_T*.csv; do mv $f "${f/*_*_T/T}" ;done for f in *_*_CJS*.csv; do mv $f "${f/*_*_C/C}" ;done for f in *_*_CJS*.csv; do echo mv $f "${f/*_*_C/C}" ;done -rsync -av --exclude 'data*' /Users/tatyanadembelova/Documents/study/thesis/code-fic/ tdembelo@push.mmci.uni-saarland.de:/home/tdembelo/code-fic/ \ No newline at end of file +rsync -av --exclude 'data*' /Users/tatyanadembelova/Documents/study/thesis/code-fic/ tdembelo@push.mmci.uni-saarland.de:/home/tdembelo/code-fic/ + +# slim +./bootstrap.sh +make -Cbuild install \ No newline at end of file diff --git a/constants.py b/constants.py index 341bc46..cc3ebc5 100644 --- a/constants.py +++ b/constants.py @@ -15,7 +15,7 @@ class Method(Enum): PREDEFINED_SUBSPACESETS = 9 PREDEFINED_OPTIMAL_SUBSPACESET = 10 PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT = 11 - FULL = 11 + FULL = 12 class CorrelationMeasure(Enum): @@ -50,7 +50,7 @@ class DistanceMeasure(Enum): SUBSPACE_SET_STEP = 2 # todo change later -IRRELEVANT_FEATURES = 3 +IRRELEVANT_FEATURES = 4 BASE = '/local/tmp/ipd_extended_experiments2/' if socket.gethostname() == 'push' \ else '/Users/tatyanadembelova/Documents/study/thesis/ipd_extended/' @@ -62,4 +62,7 @@ class DistanceMeasure(Enum): SLIM_DATA_DIR = SLIM_BASE + "data/" SLIM_BIN = SLIM_BASE + "branches/slim/trunk/fic" SLIM_COMPRESS_CONF = SLIM_BASE + "branches/slim/trunk/compress.conf" -SLIM_CONVERT_CONF = SLIM_BASE + "branches/slim/trunk/convertdb.conf" \ No newline at end of file +SLIM_CONVERT_CONF = SLIM_BASE + "branches/slim/trunk/convertdb.conf" + +PRECISION_RECALL_FILENAME = "Precision_recall_runtime.csv" +COMPRESSION_FILENAME = "Compression.csv" \ No newline at end of file diff --git a/data_generator.py b/data_generator.py index 7d06e9e..09d60cf 100644 --- a/data_generator.py +++ b/data_generator.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import random +import util import time import os import json @@ -13,6 +14,7 @@ ROWS = 6000 OVERLAP_PROBABILITY = 0.6 + class CubeParameters: def __init__(self, rows, loc=None): 
self.rows = rows @@ -21,13 +23,19 @@ def __init__(self, rows, loc=None): class CubesGenerator: - def __init__(self, feature_count, radius, file_name): + def __init__(self, rel_feature_count, irr_feature_count, radius, file_name): + self.rel_feature_count = rel_feature_count self.file_name = file_name self.cube_parameters = [] - self.feature_count = feature_count - self.dim_borders = [[-radius, radius] for d in range(feature_count)] + self.feature_count = rel_feature_count + irr_feature_count + self.dim_borders = [[-radius, radius] for d in range(self.feature_count)] self.subspaces = [] - self.perf_disc = [{d[1]} for d in self.dim_borders] + self.perf_disc = [{d[1]} for d in self.dim_borders[:rel_feature_count]] + + def __repr__(self): + return 'CubesGenerator(file_name=' + str(self.file_name) \ + + ', rel_feature_count=' + str(self.rel_feature_count) \ + + ', feature_count=' + str(self.feature_count) + ")" def add_cube_parameter(self, cube_param): if cube_param.loc is None: @@ -37,10 +45,11 @@ def add_cube_parameter(self, cube_param): s = list(location_params.keys()) if s and not s in self.subspaces: self.subspaces.append(s) - for feat in range(self.feature_count): + + # perfect discretization + for feat in range(self.rel_feature_count): if feat in cube_param.loc.keys(): dim_params = location_params[feat] - # perfect discretization if dim_params[0] != -RADIUS: self.perf_disc[feat].add(dim_params[0]) self.perf_disc[feat].add(dim_params[0] + dim_params[1]) @@ -61,6 +70,7 @@ def build(self): cube = [] for feat in range(self.feature_count): if feat in location_params.keys(): + assert feat < self.rel_feature_count dim_params = location_params[feat] if dim_params[0] < self.dim_borders[feat][0] \ or dim_params[0] + dim_params[1] > self.dim_borders[feat][1]: @@ -122,9 +132,8 @@ def generate_overlap_partition(rf, c): return partition -def produce_data_generator(rf, irf, c, type, name): - total_f = rf + irf - dg = CubesGenerator(total_f, RADIUS, name) +def produce_data_generator(rf, irf, c, type, file_name): + dg = CubesGenerator(rf, irf, RADIUS, file_name) # same number of records for each of the cubes + background cube_rows = int(ROWS / (c + 1)) if type == 'c': @@ -158,34 +167,21 @@ def produce_all_data_generators(): perf_subspaces = dict() perf_discs = dict() - # relevant features 2 - 30 - for rf in range(2, 3): - # cubes 1 - 10 - for c in range(3, 4): - # cube types complete, incomplete, incomplete overlapping - for type in ['c']: - - # relevant features 2 - 30 - # for rf in range(2, 31): - # # cubes 1 - 10 - # for c in range(1, 11): - # # cube types complete, incomplete, incomplete overlapping - # for type in ['c', 'i', 'io']: - if (c == 1 or rf / c < 2) and type != 'c': - continue - name = 'cubes_' + '{0:02d}'.format(rf) + '_' \ - + '{0:02d}'.format(c) + '_' \ - + type + '.csv' - # if os.path.exists(basedir + name) and os.path.exists( - # perf_disc_dir + 'cut_' + name.replace('csv', 'txt')): - # continue - - dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name) - perf_discs[name] = dg.get_discs() - perf_subspaces[name] = dg.get_subspaces() - data_generators.append(dg) + + def produce_dg(name, rf, c, type): + + # if os.path.exists(basedir + name) and os.path.exists( + # perf_disc_dir + 'cut_' + name.replace('csv', 'txt')): + # continue + + dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name + ".csv") + perf_discs[name] = dg.get_discs() + perf_subspaces[name] = dg.get_subspaces() + data_generators.append(dg) + + util.collect_params(produce_dg) for name in 
perf_discs: - write_cut_file(perf_disc_dir + 'cut_' + name.replace('csv', 'txt'), perf_discs[name]) + write_cut_file(perf_disc_dir + 'cut_' + name + ".txt", perf_discs[name]) with open(perf_subspaces_file, 'w') as psf: json.dump(perf_subspaces, psf) return data_generators diff --git a/discretization_quality_measure.py b/discretization_quality_measure.py index d684b71..7f745b0 100644 --- a/discretization_quality_measure.py +++ b/discretization_quality_measure.py @@ -10,10 +10,11 @@ MAX_DIM_COUNT = 4 -def parse_cuts(name): +def parse_cuts(experiment_name): + name = re.search("(.+?_.+?_.+?_.+?)_", experiment_name).group(1) try: cuts = [] - with open(name, "r") as f: + with open(cst.PERFECT_DISCRETIZATIONS_DIR + "cut_" + name + ".txt", "r") as f: cut = [] for line in f: if line.startswith("dimension"): @@ -138,32 +139,18 @@ def disc_distance(expected_cuts, cuts): # prepare slim db -def prepare_compression1(directory, name): +def prepare_compression1(experiment_name): try: - dims_count = util.parse_relevant_features(directory + "/" + name) - underline_name = name.replace("-", "_") - if not os.path.exists(directory + "/" + underline_name + ".csv/out.txt"): - return - escaped_name = util.get_escaped_name(underline_name) - if not os.path.exists(cst.SLIM_DATA_DIR + escaped_name): - os.makedirs(cst.SLIM_DATA_DIR + escaped_name) - # create a formatted data file - with open(directory + "/" + underline_name + ".csv/" + cst.FILE_DATA_OUTPUT, "r") as init_file: - with open(cst.SLIM_DATA_DIR + escaped_name + "/" + escaped_name + ".dat", "w") as new_file: - row = 0 - for line in init_file: - if line.startswith("@") or line.strip() == "": - continue - split = line.split(",") - new_file.write(" ".join(split[:dims_count]) + " " + split[-1]) - row += 1 - # modify convert.conf + dat_file = cst.SLIM_DATA_DIR + experiment_name + "/" + experiment_name + ".dat" + if not os.path.exists(dat_file): + print("no initial dat-file for experiment", experiment_name) + return False with open(cst.SLIM_CONVERT_CONF, "r+") as conf_file: new_lines = [] for line in conf_file: if line.startswith("dbName"): - line = "dbName = [" + escaped_name + "]\n" + line = "dbName = [" + experiment_name + "]\n" new_lines.append(line) conf_file.seek(0) conf_file.writelines(new_lines) @@ -171,10 +158,69 @@ def prepare_compression1(directory, name): output = sp.check_output([cst.SLIM_BIN, cst.SLIM_CONVERT_CONF]) if "exception" in str(output): - print('exception during preparation for ' + name) + print('exception during preparation for', experiment_name) + return False + + except sp.CalledProcessError: + print('Prepare compression: conversion failed for', experiment_name) + return False + return True + +def run_compression1(name, rf=None, c=None, type=None): + # 1. check slim db + # convert dat-file to db-file if it does not exist + if not os.path.exists(cst.SLIM_DATA_DIR + name + "/" + name + ".db"): + if not prepare_compression1(name): + print("run_compression failed for", name) + return [name, "", ""] + + # 2. modify compress.conf + with open(cst.SLIM_COMPRESS_CONF, "r+") as conf_file: + new_lines = [] + for line in conf_file: + if line.startswith("iscName"): + line = "iscName = " + name + "-all-1d\n" + new_lines.append(line) + conf_file.seek(0) + conf_file.writelines(new_lines) + conf_file.truncate() + # 3. 
compress it + output = None + try: + output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=60)) + except sp.TimeoutExpired: + timeout_counter = 0 + while timeout_counter < 5: + try: + output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=60)) + break + except sp.TimeoutExpired: + timeout_counter += 1 + if not output: + print("timeout exceeded " + str(timeout_counter) + " times for " + name) + return [name, "", ""] except sp.CalledProcessError: - print('Prepare compression: conversion failed for ' + name) + return [name, "", ""] + + search_start = re.search('Start:\\\\t\\\\t.+?,(\d+)\)', output) + if search_start: + start_comp = search_start.group(1) + else: + print("compression start is not found", name) + start_comp = "" + search_end = re.search('Result:\\\\t\\\\t.+?,(\d+)\)', output) + if search_end: + result_comp = search_end.group(1) + else: + print("compression end is not found", name) + result_comp = "" + return [name, start_comp, result_comp] + + +def run_compression(): + results = util.collect_params(run_compression1) + return results # returns runtime in seconds and mdl of compression @@ -365,93 +411,95 @@ def disc_f1(expected, current): if __name__ == '__main__': - if len(sys.argv) == 1: - print( - 'Usage: discretization_quality_measure.py ' - '-p= ' - '-m=<[original|greedy_topk|trivial|...]> ' - '-cor=<[uds]> ' - '-dist=<[id, cjs]> ' - '-t= ' - '-r= ') - command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID' - print('Running default: ', command) - command_list = command.split(' ') - else: - command_list = sys.argv[1:] - - problem_arg = list(filter(lambda x: x.startswith("-p="), command_list)) - # if not problem_arg: - # raise ValueError('No problem provided!') - base_dir_arg = list(filter(lambda x: x.startswith("-b="), command_list)) - if not base_dir_arg: - raise ValueError('No logs base dir provided!') - method_arg = list(filter(lambda x: x.startswith("-m="), command_list)) - # if not method_arg: - # raise ValueError('No method provided!') - distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list)) - # if not distance_measure_arg: - # raise ValueError('No distance measure provided!') - threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list)) - # if not threshold_arg: - # raise ValueError('No threshold provided!') - # irr_feat_start_arg = list(filter(lambda x: x.startswith("-is="), command_list)) - # irr_feat_end_arg = list(filter(lambda x: x.startswith("-ie="), command_list)) - - base_dir = base_dir_arg[0].replace('-b=', '') - if not os.path.exists(base_dir): - os.makedirs(base_dir) - if problem_arg: - problem = problem_arg[0].replace('-p=', '') - if method_arg: - method = cst.Method[method_arg[0].replace('-m=', '').upper()] - if distance_measure_arg: - distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()] - if threshold_arg: - threshold = float(threshold_arg[0].replace('-t=', '')) - - problems = [ - # "2d_3_cubes_aligned_xor", - # "2d_2_cubes_aligned", - # "2d_2_cubes_xor", - # "3d_2_cubes_aligned", - # "3d_2_cubes_xor", - # "3d_3_cubes_aligned", - # "3d_3_cubes_aligned_xor", - # "3d_3_cubes_xor", - # "3d_4_cubes_1_aligned_xor", - # "3d_4_cubes_2_aligned", - # "3d_4_cubes_xor", - # "4d_2_cubes_aligned", - # "4d_3_cubes_aligned_xor", - # "4d_3_cubes_xor", - # "4d_4_cubes_aligned_xor", - # "4d_4_cubes_2_aligned", - "4d_4_cubes_xor", - ] - - runtime = [] - perf = [] - compression = [] - - cols = ['run-dim', 'precision', 
'recall'] - runtime_cols = ['run', 'subspace mining runtime', 'full runtime'] - compression_cols = ['run', 'start compression', 'result compression'] - - disc_distances = [] - for problem in problems: - print('problem:', problem) - - for method in [cst.Method.TRIVIAL, cst.Method.ORIGINAL, cst.Method.PREDEFINED]: - # for method in [cst.Method.PERFECT]: - print('method:', method) - data = compute_problem_quality_measure(base_dir, problem, method=method) - if not data: - continue - runtime.extend(data[0]) - perf.extend(data[1]) - compression.extend(data[2]) - time = util.now() - pd.DataFrame(perf, columns=cols).to_csv(base_dir + "/Precision_recall_" + time + ".csv") - pd.DataFrame(runtime, columns=runtime_cols).to_csv(base_dir + "/Discretization_runtimes_" + time + ".csv") - pd.DataFrame(compression, columns=compression_cols).to_csv(base_dir + "/Compression_" + time + ".csv") + # compression and classification quality measures + run_compression() + # if len(sys.argv) == 1: + # print( + # 'Usage: discretization_quality_measure.py ' + # '-p= ' + # '-m=<[original|greedy_topk|trivial|...]> ' + # '-cor=<[uds]> ' + # '-dist=<[id, cjs]> ' + # '-t= ' + # '-r= ') + # command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID' + # print('Running default: ', command) + # command_list = command.split(' ') + # else: + # command_list = sys.argv[1:] + # + # problem_arg = list(filter(lambda x: x.startswith("-p="), command_list)) + # # if not problem_arg: + # # raise ValueError('No problem provided!') + # base_dir_arg = list(filter(lambda x: x.startswith("-b="), command_list)) + # if not base_dir_arg: + # raise ValueError('No logs base dir provided!') + # method_arg = list(filter(lambda x: x.startswith("-m="), command_list)) + # # if not method_arg: + # # raise ValueError('No method provided!') + # distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list)) + # # if not distance_measure_arg: + # # raise ValueError('No distance measure provided!') + # threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list)) + # # if not threshold_arg: + # # raise ValueError('No threshold provided!') + # # irr_feat_start_arg = list(filter(lambda x: x.startswith("-is="), command_list)) + # # irr_feat_end_arg = list(filter(lambda x: x.startswith("-ie="), command_list)) + # + # base_dir = base_dir_arg[0].replace('-b=', '') + # if not os.path.exists(base_dir): + # os.makedirs(base_dir) + # if problem_arg: + # problem = problem_arg[0].replace('-p=', '') + # if method_arg: + # method = cst.Method[method_arg[0].replace('-m=', '').upper()] + # if distance_measure_arg: + # distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()] + # if threshold_arg: + # threshold = float(threshold_arg[0].replace('-t=', '')) + # + # problems = [ + # # "2d_3_cubes_aligned_xor", + # # "2d_2_cubes_aligned", + # # "2d_2_cubes_xor", + # # "3d_2_cubes_aligned", + # # "3d_2_cubes_xor", + # # "3d_3_cubes_aligned", + # # "3d_3_cubes_aligned_xor", + # # "3d_3_cubes_xor", + # # "3d_4_cubes_1_aligned_xor", + # # "3d_4_cubes_2_aligned", + # # "3d_4_cubes_xor", + # # "4d_2_cubes_aligned", + # # "4d_3_cubes_aligned_xor", + # # "4d_3_cubes_xor", + # # "4d_4_cubes_aligned_xor", + # # "4d_4_cubes_2_aligned", + # "4d_4_cubes_xor", + # ] + # + # runtime = [] + # perf = [] + # compression = [] + # + # cols = ['run-dim', 'precision', 'recall'] + # runtime_cols = ['run', 'subspace mining runtime', 'full runtime'] + # compression_cols = ['run', 'start compression', 'result 
compression'] + # + # disc_distances = [] + # for problem in problems: + # print('problem:', problem) + # + # for method in [cst.Method.TRIVIAL, cst.Method.ORIGINAL, cst.Method.PREDEFINED]: + # # for method in [cst.Method.PERFECT]: + # print('method:', method) + # data = compute_problem_quality_measure(base_dir, problem, method=method) + # if not data: + # continue + # runtime.extend(data[0]) + # perf.extend(data[1]) + # compression.extend(data[2]) + # time = util.now() + # pd.DataFrame(perf, columns=cols).to_csv(base_dir + "/Precision_recall_" + time + ".csv") + # pd.DataFrame(runtime, columns=runtime_cols).to_csv(base_dir + "/Discretization_runtimes_" + time + ".csv") + # pd.DataFrame(compression, columns=compression_cols).to_csv(base_dir + "/Compression_" + time + ".csv") diff --git a/experiments_logging.py b/experiments_logging.py index b3762fd..d9b904f 100644 --- a/experiments_logging.py +++ b/experiments_logging.py @@ -115,18 +115,29 @@ def write_out_file(problem, disc_intervals, disc_points, class_labels): return lines -def write_outdat_file(disc_intervals, disc_points, class_labels, relevant_features): - lines = [] +def get_out_files(experiment_name, disc_intervals, disc_points, class_labels, relevant_features): + dat_lines = [] + arff_lines = ['@relation ' + experiment_name + "\n\n"] + counter = [1] - for i in range(len(disc_intervals)): + for i in range(relevant_features): + arff_lines.append( + '@attribute dim' + str(i) + ' {' + ','.join([str(j + counter[-1]) for j in disc_intervals[i]]) + '}\n') counter.append(counter[-1] + len(disc_intervals[i])) + arff_lines.append('@attribute class {' + ','.join(['"' + str(i) + '"' for i in class_labels.unique()]) + '}\n\n') + arff_lines.append('@data\n') + for i in range(len(disc_points[0])): - line = ' '.join([str(disc_points[j][i] + counter[j]) for j in range(relevant_features)]) - lines.append(line + " " + str(class_labels[i]) + '\n') - return lines + values = [str(disc_points[j][i] + counter[j]) for j in range(relevant_features)] + dat_line = ' '.join(values) + dat_lines.append(dat_line + " " + str(class_labels[i]) + '\n') + + arff_line = ",".join(values) + arff_lines.append(arff_line + ',"' + str(class_labels[i]) + '"\n') + return dat_lines, arff_lines -def write_cut_file(disc_intervals): +def get_cut_file(disc_intervals): lines = [] for i in range(len(disc_intervals)): lines.append('dimension ' + str(i) + ' (' + str(len(disc_intervals[i])) + ' bins)\n') @@ -136,6 +147,10 @@ def write_cut_file(disc_intervals): return lines +def get_cuts(disc_intervals): + return [[disc_intervals[i][bin][1] for bin in disc_intervals[i]] for i in range(len(disc_intervals))] + + if __name__ == '__main__': # rows = 20000 # data = np.concatenate((synthetic_cube_in_cube(rows, 2, 0), np.zeros((rows, 1))), axis=1) diff --git a/main.py b/main.py index 6bd9931..442b6db 100644 --- a/main.py +++ b/main.py @@ -21,7 +21,7 @@ import experiments_logging as el from merging import dynamic_merging import cjs -import discretization_quality_measure as dq +import discretization_quality_measure as dqm import json import random import traceback @@ -77,7 +77,7 @@ def compute_distances(bin_map, curr, data, dim_maxes, log=None): if method == cst.Method.FULL: return (id.compute_IDs(bin_map, curr, data, dim_maxes) if distance_measure == cst.DistanceMeasure.ID - else cjs.compute_CJSs(bin_map, curr, data, dim_maxes)), 0 + else cjs.compute_CJSs(bin_map, curr, data, dim_maxes)), 0 if method.name.startswith("SM"): subspace_mining_start = time.time() if method == cst.Method.GREEDY_TOPK: 
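Note on the new get_out_files helper in experiments_logging.py above: it renders the
discretized data twice, once as a SLIM .dat file (space-separated item ids) and once as
a Weka-style .arff file. A minimal usage sketch with toy inputs -- all names and values
below are illustrative, inferred from the function body, and not part of this patch:

    import pandas as pd
    import experiments_logging as el

    # two relevant features, two bins each; intervals keyed by bin id
    disc_intervals = [{0: (-2.0, 0.0), 1: (0.0, 2.0)},
                      {0: (-2.0, 1.0), 1: (1.0, 2.0)}]
    disc_points = [[0, 1, 1], [0, 0, 1]]  # bin index per row, per dimension
    class_labels = pd.Series([0, 1, 1])

    dat, arff = el.get_out_files("toy_experiment", disc_intervals, disc_points,
                                 class_labels, relevant_features=2)
    # dat[0]  == '1 3 0\n'    (bin ids are offset per dimension so they are
    #                          globally unique item ids for SLIM)
    # arff[-1] == '2,4,"1"\n' (same codes, comma-separated, class label quoted)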
@@ -105,7 +105,7 @@ def compute_distances(bin_map, curr, data, dim_maxes, dim_maxes = dim_maxes[curr_subspace] return (id.compute_IDs1(bin_map, data, dim_maxes) if distance_measure == cst.DistanceMeasure.ID - else cjs.compute_CJSs1(bin_map, data, dim_maxes)), sm_runtime + else cjs.compute_CJSs1(bin_map, data, dim_maxes)), sm_runtime def compute_IPD(data, rel_features_count, method=cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET, cor_measure=None, @@ -136,7 +136,7 @@ def compute_IPD(data, rel_features_count, method=cst.Method.PREDEFINED_OPTIMAL_S disc_macro_intervals = [] disc_points = [] - subspace_map = get_map_from_subspace_set(subspace_set) + subspace_map = get_map_from_subspace_set(subspace_set) if subspace_set else None distancez = [] # iterate over all the dimensions full_sm_runtime = 0 @@ -372,7 +372,7 @@ def _compute_subspaces(dims, sets): # todo return list of dictionaries def get_ideal_subspace_set(data_file_name): # todo naive implementation - return ideal.get(data_file_name) + return ideal.get(data_file_name.replace(".csv", "")) def get_map_from_subspace_set(subspace_set): @@ -402,6 +402,12 @@ def compute_subspace_sets(data_file_name, method): rss = [ideal_subspace + [rf for rf in range(rel_features, irr)] for ideal_subspace in ideal_subspace_set] redundant_subspace_sets.append(rss) + if cst.IRRELEVANT_FEATURES % 2 == 0: + rss = [ideal_subspace + [rf for rf in range(rel_features, cst.IRRELEVANT_FEATURES + rel_features)] for + ideal_subspace in + ideal_subspace_set] + redundant_subspace_sets.append(rss) + return redundant_subspace_sets elif method is cst.Method.PREDEFINED_SUBSPACESETS: @@ -503,13 +509,16 @@ def execute(param, loader=None): # plot_distances(dir, distances, disc_intervals) # output file for classification measurements - outdat_file_content = el.write_outdat_file(disc_intervals, disc_points, class_labels, relevant_features) + outdat, outarff = el.get_out_files(experiment_name, disc_intervals, disc_points, class_labels, + relevant_features) # output file for compression measurements # slim_dat_content = dq.prepare_slim_dat(base_dir, experiment_name) - cut_file_content = el.write_cut_file(disc_intervals) - return Result(base_dir, experiment_name, outdat_file_content, cut_file_content, runtime, sm_runtime, - init_bins_count) + cut = el.get_cuts(disc_intervals) + cut_file_content = el.get_cut_file(disc_intervals) + return Result(base_dir, experiment_name, outdat, outarff, cut, cut_file_content, runtime, sm_runtime, + init_bins_count, relevant_features) + # return Result(base_dir, experiment_name, None, None, None, None, None, None, relevant_features) except: print("Error in " + experiment_name + ":", sys.exc_info()[0], sys.exc_info()[1]) traceback.print_exception(sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2], @@ -521,27 +530,52 @@ def execute(param, loader=None): class Result: - def __init__(self, base_dir, experiment_name, outdat_file_content, cut_file_content, runtime, sm_runtime, - initial_bin_count): + def __init__(self, base_dir, experiment_name, outdat_file_content, outarff_file_content, cut, cut_file_content, runtime, sm_runtime, + initial_bin_count, rel_feature_count): + self.cut_file_content = cut_file_content + self.outarff_file_content = outarff_file_content + self.rel_feature_count = rel_feature_count self.sm_runtime = sm_runtime self.initial_bin_count = initial_bin_count self.runtime = runtime self.base_dir = base_dir self.experiment_name = experiment_name - self.cut_file_content = cut_file_content + self.cut = cut self.outdat_file_content = 
outdat_file_content - self.dir = base_dir + '/' + experiment_name + "/" + self.dir = base_dir + experiment_name + "/" + def __repr__(self): + return "Result(experiment_name=" + self.experiment_name + ")" -def store(result): + +def append_to_quality_measure_files(result, loader): + assert type(result) is Result + measure_file = result.base_dir + cst.PRECISION_RECALL_FILENAME + ideal_cuts = loader.load_ideal_disc(result.experiment_name) if loader else dqm.parse_cuts(result.experiment_name) + with open(measure_file, "a") as f: + for i in range(result.rel_feature_count): + f.write(",".join([result.experiment_name + "-dim" + str(i + 1), + str(dqm.disc_precision(ideal_cuts[i], result.cut[i])), + str(dqm.disc_recall(ideal_cuts[i], result.cut[i])), + str(result.sm_runtime), + str(result.runtime)])) + f.write("\n") + return + + +def append_to_compression_files(result): + measure_file = result.base_dir + cst.COMPRESSION_FILENAME + with open(measure_file, "a") as f: + f.write(",".join(dqm.run_compression1(result.experiment_name))) + f.write("\n") + + +def store(result, loader=None): if not result: return assert type(result) is Result print('storing experiment', result.experiment_name) - if not os.path.exists(result.base_dir): - os.makedirs(result.base_dir) - if not os.path.exists(result.dir): os.makedirs(result.dir) @@ -549,12 +583,19 @@ def store(result): f.write("initial bins count: " + str(result.initial_bin_count) + "\n") f.write("runtime " + str(result.runtime) + " seconds\n") f.write("sm runtime " + str(result.sm_runtime) + " seconds\n") + + append_to_quality_measure_files(result, loader) + if not os.path.exists(cst.SLIM_DATA_DIR + result.experiment_name): os.makedirs(cst.SLIM_DATA_DIR + result.experiment_name) - with open(cst.SLIM_DATA_DIR + result.experiment_name + "/" + result.experiment_name + ".dat", "w") as f: f.writelines(result.outdat_file_content) + append_to_compression_files(result) + + with open(result.dir + cst.FILE_DATA_OUTPUT, "w") as f: + f.writelines(result.outarff_file_content) + with open(result.dir + cst.FILE_DATA_CUTS, "w") as f: f.writelines(result.cut_file_content) @@ -574,10 +615,14 @@ def __init__(self, base_dir, experiment_name, method, data_file, delim, columns, self.experiment_name = experiment_name self.base_dir = base_dir + def __repr__(self): + return "RunParams(experiment_name=" + self.experiment_name +\ + ", subspace_set=" + str(self.subspace_set) + ")" + def prepare(base_dir, data_file, method, time_mark=False, delim=";", columns=None, rows=None, distance_measure=cst.DistanceMeasure.ID, - cor_measure=None, threshold=cst.ID_THRESHOLD_QUANTILE): + cor_measure=cst.CorrelationMeasure.UDS, threshold=cst.ID_THRESHOLD_QUANTILE): params = [] # # defining prefix for the output files data_file_name = util.get_file_name(data_file) @@ -588,19 +633,21 @@ def prepare(base_dir, data_file, method, time_mark=False, delim=";", columns=Non else: subspace_sets = None - base_dir = cst.BASE + base_dir - if not os.path.exists(base_dir): - os.makedirs(base_dir) + base_dir = cst.BASE + base_dir + "/" # full, trivial, SM methods if not method.name.startswith("PREDEFINED"): experiment_name = data_file_name.replace(".csv", "") + ("_" + str(columns) + "c" if columns else "") + ( "_" + str(rows) + "r" if rows else "") + "_" \ - + method.name.replace("_", "") + + method.name.replace("_", "") \ + + ("_" + cor_measure.name if method.name.startswith("SM") else "") timed_name = (util.now() if time_mark else "") + ("_" if time_mark else "") + experiment_name - params.append( - RunParams(base_dir, 
timed_name, method, data_file, delim, columns, rows, distance_measure, threshold, - cor_measure, None)) - print("prepared parameters for", experiment_name) + if not os.path.exists(base_dir + timed_name): + params.append( + RunParams(base_dir, timed_name, method, data_file, delim, columns, rows, distance_measure, threshold, + cor_measure, None)) + print("prepared parameters for", experiment_name) + else: + print("experiment", experiment_name, "has already been processed") # predefined subspace sets else: @@ -615,6 +662,9 @@ def prepare(base_dir, data_file, method, time_mark=False, delim=";", columns=Non counter) if method is cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT else "") counter += 1 timed_name = (util.now() + "_" if time_mark else "") + experiment_name + if os.path.exists(base_dir + timed_name): + print("experiment", experiment_name, "has already been processed") + continue params.append( RunParams(base_dir, timed_name, method, data_file, delim, columns, rows, distance_measure, threshold, cor_measure, subspace_set)) @@ -623,107 +673,103 @@ def prepare(base_dir, data_file, method, time_mark=False, delim=";", columns=Non def collect_params(base_dir): - params = [] - # relevant features 2 - 30 - for rf in range(2, 31): - # cubes 1 - 10 - for c in range(1, 11): - # cube types complete, incomplete, incomplete overlapping - for type in ['c', 'i', 'io']: - # for rf in range(2, 3): - # # cubes 1 - 10 - # for c in range(3, 4): - # # cube types complete, incomplete, incomplete overlapping - # for type in ['c']: - if (c == 1 or rf / c < 2) and type != 'c': - continue - filepath = cst.DATA_DIR + 'cubes_' + '{0:02d}'.format(rf) + '_' \ - + '{0:02d}'.format(c) + '_' \ - + type + '.csv' - for method in [cst.Method.PREDEFINED_SUBSPACESETS, - cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT, - cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET - ]: - params.extend(prepare(base_dir, filepath, method)) - return params + + def collect(name, rf, c, type): + params = [] + file_path = cst.DATA_DIR + name + ".csv" + for method in [ + cst.Method.PREDEFINED_SUBSPACESETS, + cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT, + cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET, + cst.Method.TRIVIAL, + cst.Method.FULL + ]: + print("preparing", name, method) + params.extend(prepare(base_dir, file_path, method)) + return params + + return util.collect_params(collect) if __name__ == "__main__": - params = collect_params("logs_test") + # params = collect_params("logs_test") + # print(params) # print(compute_subspace_sets("cubes_10_03_i.csv", cst.Method.PREDEFINED_SUBSPACESETS)) # exit(1) - # if len(sys.argv) == 1: - # # print( - # # 'Usage: main.py ' - # # '-b= ' - # # '-f= ' - # # '-d= ' - # # '-c= ' - # # '-m=<[original|greedy_topk|trivial|...]> ' - # # '-cor=<[uds]> ' - # # '-dist=<[id, cjs]> ' - # # '-t= ' - # # '-s[=] ' - # # '-r= ') - # # command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID' - # # print('Running default: ', command) - # # command_list = command.split(' ') - # raise ValueError("no arguments passed!") - # else: - # command_list = sys.argv[1:] - # - # file_arg = list(filter(lambda x: x.startswith("-f="), command_list)) - # if not file_arg: - # raise ValueError('No data file provided!') - # base_dir_arg = list(filter(lambda x: x.startswith("-b="), command_list)) - # if not base_dir_arg: - # raise ValueError('No logs base dir provided!') - # time_mark = len(list(filter(lambda x: x.startswith("-time"), command_list))) != 0 - # delim_arg = list(filter(lambda 
x: x.startswith("-d="), command_list)) - # columns_arg = list(filter(lambda x: x.startswith("-c="), command_list)) - # rows_arg = list(filter(lambda x: x.startswith("-r="), command_list)) - # method_arg = list(filter(lambda x: x.startswith("-m="), command_list)) - # corr_measure_arg = list(filter(lambda x: x.startswith("-cor="), command_list)) - # distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list)) - # threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list)) - # - # data_file = file_arg[0].replace('-f=', '') - # base_dir = base_dir_arg[0].replace('-b=', '') - # - # if delim_arg: - # delimiter = delim_arg[0].replace('-d=', '') - # else: - # print('using default delimiter ;') - # delimiter = ';' - # columns = int(columns_arg[0].replace('-c=', '')) if columns_arg else None - # rows = int(rows_arg[0].replace('-r=', '')) if rows_arg else None - # if method_arg: - # method = cst.Method[method_arg[0].replace('-m=', '').upper()] - # else: - # print('using default method PREDEFINED_OPTIMAL_SUBSPACESET') - # method = cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET - # - # cor_measure = cst.CorrelationMeasure[corr_measure_arg[0].replace('-cor=', '').upper()] if corr_measure_arg \ - # else None - # if method.name.startswith("SM") and cor_measure is None: - # raise ValueError('A correlation measure should be given!') - # - # if distance_measure_arg: - # distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()] - # print('using distance measure ' + distance_measure.name) - # else: - # distance_measure = cst.DistanceMeasure.ID - # print('using default distance measure ID') - # if threshold_arg: - # threshold = float(threshold_arg[0].replace('-t=', '')) - # - # print('using ID_THRESHOLD_QUANTILE = ', str(threshold)) - # else: - # threshold = cst.ID_THRESHOLD_QUANTILE - # print('using default ID_THRESHOLD_QUANTILE = ', str(threshold)) - # - # params = prepare(base_dir, data_file, method, time_mark, delimiter, columns, rows, distance_measure, cor_measure, - # threshold) + + if len(sys.argv) == 1: + # print( + # 'Usage: main.py ' + # '-b= ' + # '-f= ' + # '-d= ' + # '-c= ' + # '-m=<[original|greedy_topk|trivial|...]> ' + # '-cor=<[uds]> ' + # '-dist=<[id, cjs]> ' + # '-t= ' + # '-s[=] ' + # '-r= ') + # command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID' + # print('Running default: ', command) + # command_list = command.split(' ') + raise ValueError("no arguments passed!") + else: + command_list = sys.argv[1:] + + file_arg = list(filter(lambda x: x.startswith("-f="), command_list)) + if not file_arg: + raise ValueError('No data file provided!') + base_dir_arg = list(filter(lambda x: x.startswith("-b="), command_list)) + if not base_dir_arg: + raise ValueError('No logs base dir provided!') + time_mark = len(list(filter(lambda x: x.startswith("-time"), command_list))) != 0 + delim_arg = list(filter(lambda x: x.startswith("-d="), command_list)) + columns_arg = list(filter(lambda x: x.startswith("-c="), command_list)) + rows_arg = list(filter(lambda x: x.startswith("-r="), command_list)) + method_arg = list(filter(lambda x: x.startswith("-m="), command_list)) + corr_measure_arg = list(filter(lambda x: x.startswith("-cor="), command_list)) + distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list)) + threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list)) + + data_file = file_arg[0].replace('-f=', '') + base_dir = base_dir_arg[0].replace('-b=', '') + + if 
delim_arg: + delimiter = delim_arg[0].replace('-d=', '') + else: + print('using default delimiter ;') + delimiter = ';' + columns = int(columns_arg[0].replace('-c=', '')) if columns_arg else None + rows = int(rows_arg[0].replace('-r=', '')) if rows_arg else None + if method_arg: + method = cst.Method[method_arg[0].replace('-m=', '').upper()] + else: + print('using default method PREDEFINED_OPTIMAL_SUBSPACESET') + method = cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET + + cor_measure = cst.CorrelationMeasure[corr_measure_arg[0].replace('-cor=', '').upper()] if corr_measure_arg \ + else None + if method.name.startswith("SM") and cor_measure is None: + raise ValueError('A correlation measure should be given!') + + if distance_measure_arg: + distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()] + print('using distance measure ' + distance_measure.name) + else: + distance_measure = cst.DistanceMeasure.ID + print('using default distance measure ID') + if threshold_arg: + threshold = float(threshold_arg[0].replace('-t=', '')) + + print('using ID_THRESHOLD_QUANTILE = ', str(threshold)) + else: + threshold = cst.ID_THRESHOLD_QUANTILE + print('using default ID_THRESHOLD_QUANTILE = ', str(threshold)) + + params = prepare(base_dir, data_file, method, time_mark, delimiter, columns, rows, distance_measure, cor_measure, + threshold) + for p in params: result = execute(p) diff --git a/old/temp_exp.py b/old/temp_exp.py index 7b15f62..a83d391 100644 --- a/old/temp_exp.py +++ b/old/temp_exp.py @@ -12,8 +12,6 @@ import interaction_distance as id import util from correlation_measures.binning import Binning -from experiments_logging import write_out_file, write_cut_file -from merging import dynamic_merging # ------------------------------------------------------ diff --git a/runExperiment.py b/runExperiment.py index bf548f0..02baa5b 100755 --- a/runExperiment.py +++ b/runExperiment.py @@ -17,6 +17,7 @@ import psutil import main import pandas as pd +import discretization_quality_measure as dqm newRun = None nbThreads = int(multiprocessing.cpu_count() / 2) @@ -52,37 +53,57 @@ # for data_generator in data_generators: # items.put(data_generator) -class UnregisteredDataset(Exception): +class UnregisteredItem(Exception): pass with multiprocessing.Manager() as manager: class Loader(): def __init__(self): self.dataset = manager.dict() + self.ideal_discs = manager.dict() self.global_lock = multiprocessing.RLock() self.dataset_locks = {}#manager.dict() + self.ideal_disc_locks = {}#manager.dict() + + def load_ideal_disc(self, name): + if not name in self.ideal_disc_locks: + raise UnregisteredItem('Unregistered ideal discretization shall be loaded ', name) + + with self.ideal_disc_locks[name]: + if not name in self.ideal_discs: + self.ideal_discs[name] = dqm.parse_cuts(name) + return self.ideal_discs[name] def load_dataset(self, path, delim): if not path in self.dataset_locks: - raise UnregisteredDataset('Unregistered dataset shall be loaded ', path) + raise UnregisteredItem('Unregistered dataset shall be loaded ', path) with self.dataset_locks[path]: if not path in self.dataset: self.dataset[path] = pd.read_csv(path, delimiter=delim, header=None, na_values='?') return self.dataset[path] + def register_dataset(self, path): with self.global_lock: if path not in self.dataset_locks: self.dataset_locks[path] = multiprocessing.RLock() + def register_ideal_disc(self, name): + with self.global_lock: + if name not in self.ideal_disc_locks: + self.ideal_disc_locks[name] = multiprocessing.RLock() 
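+
+    # Loader caches loaded datasets and parsed ideal discretizations in
+    # manager.dict()s shared across workers; the per-name RLocks created
+    # above under global_lock ensure each file is parsed at most once.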
- loader = Loader() + loader = Loader() params = main.collect_params("logs_test") + if len(params) == 0: + print("no parameters collected!") + exit(0) for param in params: loader.register_dataset(param.data_file) + loader.register_ideal_disc(param.experiment_name) items.put(param) if onlyListTasks: @@ -90,8 +111,8 @@ def register_dataset(self, path): para = items.get() print(para) - nbTasksTotal = items.qsize() - nbTasksDone = 0 + nbTasksTotal = len(params) + nbTasksDone = [0] counterLock = multiprocessing.RLock() paramQueueLock = multiprocessing.RLock() runningMain = True @@ -118,7 +139,7 @@ def worker(worker_id): para = items.get(block=False) except queue.Empty: return - print('thread ', threading.get_ident(), ' / started', para) + print('Worker ID ', worker_id, 'is executing', para) # todo generate data sets # datasets.put(para.build()) @@ -126,8 +147,8 @@ def worker(worker_id): print('Worker ID ', worker_id, ' execution finished') with counterLock: if runningMain: - nbTasksDone += 1 - print("Jobs done ", nbTasksDone, "/", nbTasksTotal) + nbTasksDone[0] += 1 + print("Jobs done ", nbTasksDone[0], "/", nbTasksTotal) # items.task_done() @@ -140,7 +161,7 @@ def datasetWriter(): try: result = datasets.get(block=True, timeout=10) # dg.store(dataset) - main.store(result) + main.store(result, loader) except queue.Empty: break @@ -148,7 +169,7 @@ def datasetWriter(): break with counterLock: - if nbTasksDone == nbTasksTotal and datasets.empty() or not runningMain: + if nbTasksDone[0] == nbTasksTotal and datasets.empty() or not runningMain: break diff --git a/util.py b/util.py index 65df0f5..5a65852 100644 --- a/util.py +++ b/util.py @@ -13,10 +13,36 @@ def get_escaped_name(problem): return problem.replace("-", "_").replace(".", "") -def parse_relevant_features(data_file): - data_file_name = get_file_name(data_file) +def parse_relevant_features(data_file_name): search = re.search('cubes_(\d+)_', data_file_name) if not search: raise ValueError("wrong file format!") dims_count = int(search.group(1)) return dims_count + + +def collect_params(f): + params = [] + # relevant features 2 - 30 + for rf in range(2, 31): + # cubes 1 - 10 + for c in range(1, 11): + # cube types complete, incomplete, incomplete overlapping + for t in ["c", 'i', "io"]: + # for rf in range(2, 3): + # # cubes 1 - 10 + # for c in range(3, 4): + # # cube types complete, incomplete, incomplete overlapping + # for type in ['c']: + if (c == 1 or rf / c < 2) and t != 'c': + continue + dataset_name = 'cubes_' + '{0:02d}'.format(rf) + '_' \ + + '{0:02d}'.format(c) + '_' \ + + t + param = f(dataset_name, rf, c, t) + print('collected param:', param) + if type(param) == list: + params.extend(param) + else: + params.append(param) + return params \ No newline at end of file
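
The parameter sweep over (relevant feature count, cube count, cube type) that was
previously duplicated in data_generator.py and main.py is now centralised in
util.collect_params. A minimal caller sketch -- the describe callback and its return
value are illustrative only, not part of this patch:

    import util

    def describe(dataset_name, rf, c, t):
        # invoked once per configuration, e.g. ('cubes_02_01_c', 2, 1, 'c');
        # may return one param object or a list of params (lists are extended)
        return {"name": dataset_name, "rf": rf, "cubes": c, "type": t}

    params = util.collect_params(describe)
    # params[0] == {'name': 'cubes_02_01_c', 'rf': 2, 'cubes': 1, 'type': 'c'}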