Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
concurrent execution of main.py
  • Loading branch information
Tatiana Dembelova committed Oct 12, 2017
1 parent d8351be commit 2585898
Show file tree
Hide file tree
Showing 7 changed files with 633 additions and 354 deletions.
2 changes: 2 additions & 0 deletions commands.txt
Expand Up @@ -43,3 +43,5 @@ for pid in $(ps aux | grep 'python' | grep -v grep | grep -v USER | awk '{print
for f in *_*_T*.csv; do mv $f "${f/*_*_T/T}" ;done
for f in *_*_CJS*.csv; do mv $f "${f/*_*_C/C}" ;done
for f in *_*_CJS*.csv; do echo mv $f "${f/*_*_C/C}" ;done

rsync -av --exclude 'data*' /Users/tatyanadembelova/Documents/study/thesis/code-fic/ tdembelo@push.mmci.uni-saarland.de:/home/tdembelo/code-fic/
42 changes: 30 additions & 12 deletions constants.py
@@ -1,15 +1,21 @@
from enum import Enum
import socket

# Enumeration of subspace-mining / discretization method identifiers.
# NOTE(review): this block looks like a diff view that merged the OLD and the
# NEW member lists (e.g. both GREEDY_TOPK = 2 and SM_GREEDY_TOPK = 2 appear).
# In Python's Enum, a member whose value repeats an earlier one becomes an
# alias of the first member with that value, so the merged result is still
# syntactically valid — but the intended final set is the SM_* names.
class Method(Enum):
PERFECT = 8
TRIVIAL = 0
ORIGINAL = 1
GREEDY_TOPK = 2
HET_GREEDY_TOPK = 3
BEST_FIRST = 4
BEAM_SEARCH = 5
HET_BEAM_SEARCH = 6
PREDEFINED = 7
# ORIGINAL = 1 #full ipd
SM_GREEDY_TOPK = 2
SM_HET_GREEDY_TOPK = 3
SM_BEST_FIRST = 4
SM_BEAM_SEARCH = 5
SM_HET_BEAM_SEARCH = 6
# PREDEFINED = 7 #subspaces up to optimal

PREDEFINED_SUBSPACESETS = 9
PREDEFINED_OPTIMAL_SUBSPACESET = 10
PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT = 11
# NOTE(review): FULL repeats value 11 and therefore aliases the member above.
FULL = 11


class CorrelationMeasure(Enum):
Expand All @@ -23,7 +29,7 @@ class DistanceMeasure(Enum):
CJS = 2


ID_THRESHOLD_QUANTILE = 0.80
ID_THRESHOLD_QUANTILE = 0.3
ID_SLIDING_WINDOW = 40

NORMALIZATION_RADIUS = 1
Expand All @@ -41,7 +47,19 @@ class DistanceMeasure(Enum):
CLUMP = 2
MAXMAX = 5

# NOTE(review): diff-merge artifact — the four absolute SLIM_* paths below are
# the OLD definitions; they are re-assigned from SLIM_BASE further down, and
# the later assignments win at import time.
SLIM_DATA_DIR = "/Users/tatyanadembelova/Documents/study/thesis/code-fic/data/"
SLIM_BIN = "/Users/tatyanadembelova/Documents/study/thesis/code-fic/branches/slim/trunk/fic"
SLIM_COMPRESS_CONF = "/Users/tatyanadembelova/Documents/study/thesis/code-fic/branches/slim/trunk/compress.conf"
SLIM_CONVERT_CONF = "/Users/tatyanadembelova/Documents/study/thesis/code-fic/branches/slim/trunk/convertdb.conf"
SUBSPACE_SET_STEP = 2

# TODO: change later — number of irrelevant features is hard-coded for now
IRRELEVANT_FEATURES = 3

# Base directory switches between the 'push' cluster host and the local
# development machine, selected by hostname at import time.
BASE = '/local/tmp/ipd_extended_experiments2/' if socket.gethostname() == 'push' \
else '/Users/tatyanadembelova/Documents/study/thesis/ipd_extended/'
DATA_DIR = BASE + 'new_cubes/'
PERFECT_DISCRETIZATIONS_DIR = BASE + 'ideal_disc/'
PERFECT_SUBSPACES_JSON = BASE + 'ideal_subspaces.json'

# SLIM/fic tool locations, rooted under SLIM_BASE (host-dependent as above).
SLIM_BASE = ("/Users/tatyanadembelova/Documents/study/thesis/" if socket.gethostname() != 'push' else BASE) + "code-fic/"
SLIM_DATA_DIR = SLIM_BASE + "data/"
SLIM_BIN = SLIM_BASE + "branches/slim/trunk/fic"
SLIM_COMPRESS_CONF = SLIM_BASE + "branches/slim/trunk/compress.conf"
SLIM_CONVERT_CONF = SLIM_BASE + "branches/slim/trunk/convertdb.conf"
76 changes: 38 additions & 38 deletions data_generator.py
Expand Up @@ -4,14 +4,14 @@
import time
import os
import json
import constants as cst
import experiments_logging as l
import socket

RADIUS = 2
CUBE_WIDTH = 1
ROWS = 6000
OVERLAP_PROBABILITY = 0.6
# BASE = '/Users/tatyanadembelova/Documents/study/thesis/ipd_extended/'
BASE = '/local/tmp/ipd_extended_experiments/'

class CubeParameters:
def __init__(self, rows, loc=None):
Expand Down Expand Up @@ -148,46 +148,42 @@ def produce_data_generator(rf, irf, c, type, name):
def produce_all_data_generators():
data_generators = []
global basedir
basedir = cst.DATA_DIR
if not os.path.exists(basedir):
os.mkdir(basedir)
perf_disc_dir = cst.PERFECT_DISCRETIZATIONS_DIR
if not os.path.exists(perf_disc_dir):
os.mkdir(perf_disc_dir)
perf_subspaces_file = cst.PERFECT_SUBSPACES_JSON

basedir = BASE + 'new_cubes/'
perf_disc_dir = BASE + 'ideal_disc/'
perf_subspaces_file = BASE + 'ideal_subspaces.json'
perf_subspaces = dict()
perf_discs = dict()
## relevant features 2 - 30
# for rf in range(10, 11):
# # irrelevant features 0 - 100:
# for irf in range(100, 101):
# # cubes 1 - 10
# for c in range(3, 4):
# # cube types complete, incomplete, incomplete overlapping
# for type in ['i']:
# relevant features 2 - 30
for rf in range(2, 3):
# cubes 1 - 10
for c in range(3, 4):
# cube types complete, incomplete, incomplete overlapping
for type in ['c']:

# relevant features 2 - 30
for rf in range(2, 31):
# irrelevant features 0 - 100:
for irf in range(101):
# cubes 1 - 10
for c in range(1, 11):
# cube types complete, incomplete, incomplete overlapping
for type in ['c', 'i', 'io']:
if c == 1 and type != 'c':
continue
if rf / c < 2 and type != 'c':
# if not (rf / c < 2 and type == 'c'):
continue
name = 'cubes_' + '{0:02d}'.format(rf) + '_' \
+ '{0:03d}'.format(irf) + '_' \
+ '{0:02d}'.format(c) + '_' \
+ type + '.csv'
# if os.path.exists(basedir + name) and os.path.exists(
# perf_disc_dir + 'cut_' + name.replace('csv', 'txt')):
# continue

dg = produce_data_generator(rf, irf, c, type, name)
perf_discs[name] = dg.get_discs()
perf_subspaces[name] = dg.get_subspaces()
data_generators.append(dg)
# for rf in range(2, 31):
# # cubes 1 - 10
# for c in range(1, 11):
# # cube types complete, incomplete, incomplete overlapping
# for type in ['c', 'i', 'io']:
if (c == 1 or rf / c < 2) and type != 'c':
continue
name = 'cubes_' + '{0:02d}'.format(rf) + '_' \
+ '{0:02d}'.format(c) + '_' \
+ type + '.csv'
# if os.path.exists(basedir + name) and os.path.exists(
# perf_disc_dir + 'cut_' + name.replace('csv', 'txt')):
# continue

dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name)
perf_discs[name] = dg.get_discs()
perf_subspaces[name] = dg.get_subspaces()
data_generators.append(dg)
for name in perf_discs:
write_cut_file(perf_disc_dir + 'cut_' + name.replace('csv', 'txt'), perf_discs[name])
with open(perf_subspaces_file, 'w') as psf:
Expand All @@ -211,4 +207,8 @@ def store(data):


# Script entry point: build every configured data generator and store its
# generated dataset.
# NOTE(review): this is a diff-merge artifact — the first print line is the
# OLD version of the guard body and is superseded by the commented copy and
# the generator loop below; the stray blank line inside the for-loop also
# comes from the scrape.  Indentation was lost when the page was captured.
if __name__ == '__main__':
print(generate_overlap_partition(7, 3))
# print(generate_overlap_partition(7, 3))
generators = produce_all_data_generators()
for g in generators:

store(g.build())
62 changes: 37 additions & 25 deletions experiments_logging.py
Expand Up @@ -97,31 +97,43 @@ def save_plot_data_2d(f, data):
plt.clf()


def write_out_file(problem, name, disc_intervals, disc_points, class_labels):
    """Write an ARFF-style output file for *problem* to the path *name*.

    disc_intervals -- per-dimension bin collections (iterated for bin ids,
        measured with len() for offsets)
    disc_points -- per-dimension sequences of discretized point ids
    class_labels -- label sequence exposing .unique(); presumably a pandas
        Series -- NOTE(review): confirm with callers
    """
    offsets = [1]  # running global bin-id offset, one entry per dimension
    with open(name, 'w') as out:
        out.write('@relation ' + util.get_escaped_name(problem) + "\n\n")
        for dim, intervals in enumerate(disc_intervals):
            ids = ','.join(str(j + offsets[-1]) for j in intervals)
            out.write('@attribute dim' + str(dim) + ' {' + ids + '}\n')
            offsets.append(offsets[-1] + len(intervals))
        classes = ','.join('"' + str(c) + '"' for c in class_labels.unique())
        out.write('@attribute class {' + classes + '}\n\n')
        out.write('@data\n')

        # One data row per point: offset bin id per dimension, then the label.
        for row in range(len(disc_points[0])):
            for dim in range(len(disc_points)):
                out.write(str(disc_points[dim][row] + offsets[dim]))
                out.write(',')
            out.write('"' + str(class_labels[row]) + '"\n')

def write_cut_file(name, disc_intervals):
    """Write discretization cut points to the file at path *name*.

    For each dimension: a header line with the bin count, one line per bin
    holding the bin's upper cut point (element [1] of the interval), then a
    dashed separator line.
    """
    with open(name, 'w') as out:
        for dim, intervals in enumerate(disc_intervals):
            out.write('dimension ' + str(dim) + ' (' + str(len(intervals)) + ' bins)\n')
            for bin_id in intervals:
                out.write(str(intervals[bin_id][1]) + '\n')
            out.write('-------------------------------------\n')
def write_out_file(problem, disc_intervals, disc_points, class_labels):
    """Build the ARFF-style output as a list of string fragments.

    Returns the fragments in file order (attribute headers, then per-row
    cell/comma/label pieces); the caller is responsible for writing them
    out, e.g. via writelines().  class_labels must expose .unique();
    presumably a pandas Series -- NOTE(review): confirm with callers.
    """
    offsets = [1]  # running global bin-id offset, one entry per dimension
    fragments = ['@relation ' + util.get_escaped_name(problem) + "\n\n"]
    for dim, intervals in enumerate(disc_intervals):
        ids = ','.join(str(j + offsets[-1]) for j in intervals)
        fragments.append('@attribute dim' + str(dim) + ' {' + ids + '}\n')
        offsets.append(offsets[-1] + len(intervals))
    classes = ','.join('"' + str(c) + '"' for c in class_labels.unique())
    fragments.append('@attribute class {' + classes + '}\n\n')
    fragments.append('@data\n')

    # Per row: each cell and its trailing comma are separate fragments,
    # matching the original fragment granularity exactly.
    for row in range(len(disc_points[0])):
        for dim in range(len(disc_points)):
            fragments.append(str(disc_points[dim][row] + offsets[dim]))
            fragments.append(',')
        fragments.append('"' + str(class_labels[row]) + '"\n')
    return fragments


def write_outdat_file(disc_intervals, disc_points, class_labels, relevant_features):
    """Build .dat-style lines for the first *relevant_features* dimensions.

    Each returned line holds the offset bin ids of one point, space-joined,
    followed by the point's class label and a trailing newline.
    """
    # Cumulative bin-id offsets over ALL dimensions (offset[d] applies to dim d).
    offsets = [1]
    for intervals in disc_intervals:
        offsets.append(offsets[-1] + len(intervals))

    result = []
    for row in range(len(disc_points[0])):
        cells = [str(disc_points[dim][row] + offsets[dim])
                 for dim in range(relevant_features)]
        result.append(' '.join(cells) + " " + str(class_labels[row]) + '\n')
    return result


def write_cut_file(disc_intervals):
    """Build cut-file lines for each dimension and return them as a list.

    Per dimension: a header with the bin count, one newline-terminated line
    per bin holding the bin's upper cut point (element [1] of the interval),
    then a dashed separator line.  Returns [] for empty input.
    """
    lines = []
    for i in range(len(disc_intervals)):
        lines.append('dimension ' + str(i) + ' (' + str(len(disc_intervals[i])) + ' bins)\n')
        # renamed loop variable: 'bin' shadowed the builtin bin()
        for bin_id in disc_intervals[i]:
            lines.append(str(disc_intervals[i][bin_id][1]) + '\n')
        lines.append('-------------------------------------\n')
    return lines


if __name__ == '__main__':
Expand Down

0 comments on commit 2585898

Please sign in to comment.