Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
concurrent execution of main.py
  • Loading branch information
Tatiana Dembelova committed Oct 12, 2017
1 parent d8351be commit 2585898
Show file tree
Hide file tree
Showing 7 changed files with 633 additions and 354 deletions.
2 changes: 2 additions & 0 deletions commands.txt
Expand Up @@ -43,3 +43,5 @@ for pid in $(ps aux | grep 'python' | grep -v grep | grep -v USER | awk '{print
for f in *_*_T*.csv; do mv $f "${f/*_*_T/T}" ;done
for f in *_*_CJS*.csv; do mv $f "${f/*_*_C/C}" ;done
for f in *_*_CJS*.csv; do echo mv $f "${f/*_*_C/C}" ;done

rsync -av --exclude 'data*' /Users/tatyanadembelova/Documents/study/thesis/code-fic/ tdembelo@push.mmci.uni-saarland.de:/home/tdembelo/code-fic/
42 changes: 30 additions & 12 deletions constants.py
@@ -1,15 +1,21 @@
from enum import Enum
import socket

# Enumeration of subspace-mining / discretization method identifiers.
# NOTE(review): this block looks like a diff view that merged the OLD and the
# NEW member lists (e.g. both GREEDY_TOPK = 2 and SM_GREEDY_TOPK = 2 appear).
# In Python's Enum, a member whose value repeats an earlier one becomes an
# alias of the first member with that value, so the merged result is still
# syntactically valid — but the intended final set is the SM_* names.
class Method(Enum):
PERFECT = 8
TRIVIAL = 0
ORIGINAL = 1
GREEDY_TOPK = 2
HET_GREEDY_TOPK = 3
BEST_FIRST = 4
BEAM_SEARCH = 5
HET_BEAM_SEARCH = 6
PREDEFINED = 7
# ORIGINAL = 1 #full ipd
SM_GREEDY_TOPK = 2
SM_HET_GREEDY_TOPK = 3
SM_BEST_FIRST = 4
SM_BEAM_SEARCH = 5
SM_HET_BEAM_SEARCH = 6
# PREDEFINED = 7 #subspaces up to optimal

PREDEFINED_SUBSPACESETS = 9
PREDEFINED_OPTIMAL_SUBSPACESET = 10
PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT = 11
# NOTE(review): FULL repeats value 11 and therefore aliases the member above.
FULL = 11


class CorrelationMeasure(Enum):
Expand All @@ -23,7 +29,7 @@ class DistanceMeasure(Enum):
CJS = 2


ID_THRESHOLD_QUANTILE = 0.80
ID_THRESHOLD_QUANTILE = 0.3
ID_SLIDING_WINDOW = 40

NORMALIZATION_RADIUS = 1
Expand All @@ -41,7 +47,19 @@ class DistanceMeasure(Enum):
CLUMP = 2
MAXMAX = 5

# NOTE(review): diff-merge artifact — the four absolute SLIM_* paths below are
# the OLD definitions; they are re-assigned from SLIM_BASE further down, and
# the later assignments win at import time.
SLIM_DATA_DIR = "/Users/tatyanadembelova/Documents/study/thesis/code-fic/data/"
SLIM_BIN = "/Users/tatyanadembelova/Documents/study/thesis/code-fic/branches/slim/trunk/fic"
SLIM_COMPRESS_CONF = "/Users/tatyanadembelova/Documents/study/thesis/code-fic/branches/slim/trunk/compress.conf"
SLIM_CONVERT_CONF = "/Users/tatyanadembelova/Documents/study/thesis/code-fic/branches/slim/trunk/convertdb.conf"
SUBSPACE_SET_STEP = 2

# TODO: change later — number of irrelevant features is hard-coded for now
IRRELEVANT_FEATURES = 3

# Base directory switches between the 'push' cluster host and the local
# development machine, selected by hostname at import time.
BASE = '/local/tmp/ipd_extended_experiments2/' if socket.gethostname() == 'push' \
else '/Users/tatyanadembelova/Documents/study/thesis/ipd_extended/'
DATA_DIR = BASE + 'new_cubes/'
PERFECT_DISCRETIZATIONS_DIR = BASE + 'ideal_disc/'
PERFECT_SUBSPACES_JSON = BASE + 'ideal_subspaces.json'

# SLIM/fic tool locations, rooted under SLIM_BASE (host-dependent as above).
SLIM_BASE = ("/Users/tatyanadembelova/Documents/study/thesis/" if socket.gethostname() != 'push' else BASE) + "code-fic/"
SLIM_DATA_DIR = SLIM_BASE + "data/"
SLIM_BIN = SLIM_BASE + "branches/slim/trunk/fic"
SLIM_COMPRESS_CONF = SLIM_BASE + "branches/slim/trunk/compress.conf"
SLIM_CONVERT_CONF = SLIM_BASE + "branches/slim/trunk/convertdb.conf"
76 changes: 38 additions & 38 deletions data_generator.py
Expand Up @@ -4,14 +4,14 @@
import time
import os
import json
import constants as cst
import experiments_logging as l
import socket

RADIUS = 2
CUBE_WIDTH = 1
ROWS = 6000
OVERLAP_PROBABILITY = 0.6
# BASE = '/Users/tatyanadembelova/Documents/study/thesis/ipd_extended/'
BASE = '/local/tmp/ipd_extended_experiments/'

class CubeParameters:
def __init__(self, rows, loc=None):
Expand Down Expand Up @@ -148,46 +148,42 @@ def produce_data_generator(rf, irf, c, type, name):
def produce_all_data_generators():
data_generators = []
global basedir
basedir = cst.DATA_DIR
if not os.path.exists(basedir):
os.mkdir(basedir)
perf_disc_dir = cst.PERFECT_DISCRETIZATIONS_DIR
if not os.path.exists(perf_disc_dir):
os.mkdir(perf_disc_dir)
perf_subspaces_file = cst.PERFECT_SUBSPACES_JSON

basedir = BASE + 'new_cubes/'
perf_disc_dir = BASE + 'ideal_disc/'
perf_subspaces_file = BASE + 'ideal_subspaces.json'
perf_subspaces = dict()
perf_discs = dict()
## relevant features 2 - 30
# for rf in range(10, 11):
# # irrelevant features 0 - 100:
# for irf in range(100, 101):
# # cubes 1 - 10
# for c in range(3, 4):
# # cube types complete, incomplete, incomplete overlapping
# for type in ['i']:
# relevant features 2 - 30
for rf in range(2, 3):
# cubes 1 - 10
for c in range(3, 4):
# cube types complete, incomplete, incomplete overlapping
for type in ['c']:

# relevant features 2 - 30
for rf in range(2, 31):
# irrelevant features 0 - 100:
for irf in range(101):
# cubes 1 - 10
for c in range(1, 11):
# cube types complete, incomplete, incomplete overlapping
for type in ['c', 'i', 'io']:
if c == 1 and type != 'c':
continue
if rf / c < 2 and type != 'c':
# if not (rf / c < 2 and type == 'c'):
continue
name = 'cubes_' + '{0:02d}'.format(rf) + '_' \
+ '{0:03d}'.format(irf) + '_' \
+ '{0:02d}'.format(c) + '_' \
+ type + '.csv'
# if os.path.exists(basedir + name) and os.path.exists(
# perf_disc_dir + 'cut_' + name.replace('csv', 'txt')):
# continue

dg = produce_data_generator(rf, irf, c, type, name)
perf_discs[name] = dg.get_discs()
perf_subspaces[name] = dg.get_subspaces()
data_generators.append(dg)
# for rf in range(2, 31):
# # cubes 1 - 10
# for c in range(1, 11):
# # cube types complete, incomplete, incomplete overlapping
# for type in ['c', 'i', 'io']:
if (c == 1 or rf / c < 2) and type != 'c':
continue
name = 'cubes_' + '{0:02d}'.format(rf) + '_' \
+ '{0:02d}'.format(c) + '_' \
+ type + '.csv'
# if os.path.exists(basedir + name) and os.path.exists(
# perf_disc_dir + 'cut_' + name.replace('csv', 'txt')):
# continue

dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name)
perf_discs[name] = dg.get_discs()
perf_subspaces[name] = dg.get_subspaces()
data_generators.append(dg)
for name in perf_discs:
write_cut_file(perf_disc_dir + 'cut_' + name.replace('csv', 'txt'), perf_discs[name])
with open(perf_subspaces_file, 'w') as psf:
Expand All @@ -211,4 +207,8 @@ def store(data):


# Script entry point: build every configured data generator and store its
# generated dataset.
# NOTE(review): this is a diff-merge artifact — the first print line is the
# OLD version of the guard body and is superseded by the commented copy and
# the generator loop below; the stray blank line inside the for-loop also
# comes from the scrape.  Indentation was lost when the page was captured.
if __name__ == '__main__':
print(generate_overlap_partition(7, 3))
# print(generate_overlap_partition(7, 3))
generators = produce_all_data_generators()
for g in generators:

store(g.build())
62 changes: 37 additions & 25 deletions experiments_logging.py
Expand Up @@ -97,31 +97,43 @@ def save_plot_data_2d(f, data):
plt.clf()


def write_out_file(problem, name, disc_intervals, disc_points, class_labels):
    """Write an ARFF-style output file for *problem* to the path *name*.

    disc_intervals -- per-dimension bin collections (iterated for bin ids,
        measured with len() for offsets)
    disc_points -- per-dimension sequences of discretized point ids
    class_labels -- label sequence exposing .unique(); presumably a pandas
        Series -- NOTE(review): confirm with callers
    """
    offsets = [1]  # running global bin-id offset, one entry per dimension
    with open(name, 'w') as out:
        out.write('@relation ' + util.get_escaped_name(problem) + "\n\n")
        for dim, intervals in enumerate(disc_intervals):
            ids = ','.join(str(j + offsets[-1]) for j in intervals)
            out.write('@attribute dim' + str(dim) + ' {' + ids + '}\n')
            offsets.append(offsets[-1] + len(intervals))
        classes = ','.join('"' + str(c) + '"' for c in class_labels.unique())
        out.write('@attribute class {' + classes + '}\n\n')
        out.write('@data\n')

        # One data row per point: offset bin id per dimension, then the label.
        for row in range(len(disc_points[0])):
            for dim in range(len(disc_points)):
                out.write(str(disc_points[dim][row] + offsets[dim]))
                out.write(',')
            out.write('"' + str(class_labels[row]) + '"\n')

def write_cut_file(name, disc_intervals):
    """Write discretization cut points to the file at path *name*.

    For each dimension: a header line with the bin count, one line per bin
    holding the bin's upper cut point (element [1] of the interval), then a
    dashed separator line.
    """
    with open(name, 'w') as out:
        for dim, intervals in enumerate(disc_intervals):
            out.write('dimension ' + str(dim) + ' (' + str(len(intervals)) + ' bins)\n')
            for bin_id in intervals:
                out.write(str(intervals[bin_id][1]) + '\n')
            out.write('-------------------------------------\n')
def write_out_file(problem, disc_intervals, disc_points, class_labels):
    """Build the ARFF-style output as a list of string fragments.

    Returns the fragments in file order (attribute headers, then per-row
    cell/comma/label pieces); the caller is responsible for writing them
    out, e.g. via writelines().  class_labels must expose .unique();
    presumably a pandas Series -- NOTE(review): confirm with callers.
    """
    offsets = [1]  # running global bin-id offset, one entry per dimension
    fragments = ['@relation ' + util.get_escaped_name(problem) + "\n\n"]
    for dim, intervals in enumerate(disc_intervals):
        ids = ','.join(str(j + offsets[-1]) for j in intervals)
        fragments.append('@attribute dim' + str(dim) + ' {' + ids + '}\n')
        offsets.append(offsets[-1] + len(intervals))
    classes = ','.join('"' + str(c) + '"' for c in class_labels.unique())
    fragments.append('@attribute class {' + classes + '}\n\n')
    fragments.append('@data\n')

    # Per row: each cell and its trailing comma are separate fragments,
    # matching the original fragment granularity exactly.
    for row in range(len(disc_points[0])):
        for dim in range(len(disc_points)):
            fragments.append(str(disc_points[dim][row] + offsets[dim]))
            fragments.append(',')
        fragments.append('"' + str(class_labels[row]) + '"\n')
    return fragments


def write_outdat_file(disc_intervals, disc_points, class_labels, relevant_features):
    """Build .dat-style lines for the first *relevant_features* dimensions.

    Each returned line holds the offset bin ids of one point, space-joined,
    followed by the point's class label and a trailing newline.
    """
    # Cumulative bin-id offsets over ALL dimensions (offset[d] applies to dim d).
    offsets = [1]
    for intervals in disc_intervals:
        offsets.append(offsets[-1] + len(intervals))

    result = []
    for row in range(len(disc_points[0])):
        cells = [str(disc_points[dim][row] + offsets[dim])
                 for dim in range(relevant_features)]
        result.append(' '.join(cells) + " " + str(class_labels[row]) + '\n')
    return result


def write_cut_file(disc_intervals):
    """Build cut-file lines for each dimension and return them as a list.

    Per dimension: a header with the bin count, one newline-terminated line
    per bin holding the bin's upper cut point (element [1] of the interval),
    then a dashed separator line.  Returns [] for empty input.
    """
    lines = []
    for i in range(len(disc_intervals)):
        lines.append('dimension ' + str(i) + ' (' + str(len(disc_intervals[i])) + ' bins)\n')
        # renamed loop variable: 'bin' shadowed the builtin bin()
        for bin_id in disc_intervals[i]:
            lines.append(str(disc_intervals[i][bin_id][1]) + '\n')
        lines.append('-------------------------------------\n')
    return lines


if __name__ == '__main__':
Expand Down

0 comments on commit 2585898

Please sign in to comment.