Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
all the discretization quality measures are calculated and stored in the storing phase
  • Loading branch information
Tatiana Dembelova committed Oct 14, 2017
1 parent 2585898 commit b5d861f
Show file tree
Hide file tree
Showing 9 changed files with 456 additions and 299 deletions.
8 changes: 6 additions & 2 deletions commands.txt
Expand Up @@ -2,7 +2,7 @@ ssh tdembelo@contact.mmci.uni-saarland.de

ssh tdembelo@push.mmci.uni-saarland.de

rsync -av --exclude '.idea/' --exclude '.git' --exclude='logs*' --exclude "*.png" --exclude "data/*" --exclude "tableau/" /Users/tatyanadembelova/Documents/study/thesis/ipd_extended/ tdembelo@badr.mpi-inf.mpg.de:/home/tdembelo/ipd_extended/
rsync -av --exclude 'ideal_disc/' --exclude 'synthetic_cases/' --exclude '.idea/' --exclude '.git' --exclude='logs*' --exclude "*.png" --exclude "data/*" --exclude "tableau/" --exclude "new_cubes/" /Users/tatyanadembelova/Documents/study/thesis/ipd_extended/ tdembelo@contact.mmci.uni-saarland.de:/home/tdembelo/ipd_extended/

rsync -av tdembelo@badr.mpi-inf.mpg.de:/home/tdembelo/ipd_extended/logs_quality/ /Users/tatyanadembelova/Documents/study/thesis/ipd_extended/logs_quality/

Expand Down Expand Up @@ -44,4 +44,8 @@ for f in *_*_T*.csv; do mv $f "${f/*_*_T/T}" ;done
for f in *_*_CJS*.csv; do mv $f "${f/*_*_C/C}" ;done
for f in *_*_CJS*.csv; do echo mv $f "${f/*_*_C/C}" ;done

rsync -av --exclude 'data*' /Users/tatyanadembelova/Documents/study/thesis/code-fic/ tdembelo@push.mmci.uni-saarland.de:/home/tdembelo/code-fic/
rsync -av --exclude 'data*' /Users/tatyanadembelova/Documents/study/thesis/code-fic/ tdembelo@push.mmci.uni-saarland.de:/home/tdembelo/code-fic/

# slim
./bootstrap.sh
make -Cbuild install
9 changes: 6 additions & 3 deletions constants.py
Expand Up @@ -15,7 +15,7 @@ class Method(Enum):
PREDEFINED_SUBSPACESETS = 9
PREDEFINED_OPTIMAL_SUBSPACESET = 10
PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT = 11
FULL = 11
FULL = 12


class CorrelationMeasure(Enum):
Expand Down Expand Up @@ -50,7 +50,7 @@ class DistanceMeasure(Enum):
SUBSPACE_SET_STEP = 2

# todo change later
IRRELEVANT_FEATURES = 3
IRRELEVANT_FEATURES = 4

BASE = '/local/tmp/ipd_extended_experiments2/' if socket.gethostname() == 'push' \
else '/Users/tatyanadembelova/Documents/study/thesis/ipd_extended/'
Expand All @@ -62,4 +62,7 @@ class DistanceMeasure(Enum):
SLIM_DATA_DIR = SLIM_BASE + "data/"
SLIM_BIN = SLIM_BASE + "branches/slim/trunk/fic"
SLIM_COMPRESS_CONF = SLIM_BASE + "branches/slim/trunk/compress.conf"
SLIM_CONVERT_CONF = SLIM_BASE + "branches/slim/trunk/convertdb.conf"
SLIM_CONVERT_CONF = SLIM_BASE + "branches/slim/trunk/convertdb.conf"

PRECISION_RECALL_FILENAME = "Precision_recall_runtime.csv"
COMPRESSION_FILENAME = "Compression.csv"
68 changes: 32 additions & 36 deletions data_generator.py
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd
import random
import util
import time
import os
import json
Expand All @@ -13,6 +14,7 @@
ROWS = 6000
OVERLAP_PROBABILITY = 0.6


class CubeParameters:
def __init__(self, rows, loc=None):
self.rows = rows
Expand All @@ -21,13 +23,19 @@ def __init__(self, rows, loc=None):


class CubesGenerator:
def __init__(self, feature_count, radius, file_name):
def __init__(self, rel_feature_count, irr_feature_count, radius, file_name):
self.rel_feature_count = rel_feature_count
self.file_name = file_name
self.cube_parameters = []
self.feature_count = feature_count
self.dim_borders = [[-radius, radius] for d in range(feature_count)]
self.feature_count = rel_feature_count + irr_feature_count
self.dim_borders = [[-radius, radius] for d in range(self.feature_count)]
self.subspaces = []
self.perf_disc = [{d[1]} for d in self.dim_borders]
self.perf_disc = [{d[1]} for d in self.dim_borders[:rel_feature_count]]

def __repr__(self):
    """Return a debug string naming this generator's key parameters."""
    fields = (
        'file_name=' + str(self.file_name),
        'rel_feature_count=' + str(self.rel_feature_count),
        'feature_count=' + str(self.feature_count),
    )
    return 'CubesGenerator(' + ', '.join(fields) + ')'

def add_cube_parameter(self, cube_param):
if cube_param.loc is None:
Expand All @@ -37,10 +45,11 @@ def add_cube_parameter(self, cube_param):
s = list(location_params.keys())
if s and not s in self.subspaces:
self.subspaces.append(s)
for feat in range(self.feature_count):

# perfect discretization
for feat in range(self.rel_feature_count):
if feat in cube_param.loc.keys():
dim_params = location_params[feat]
# perfect discretization
if dim_params[0] != -RADIUS:
self.perf_disc[feat].add(dim_params[0])
self.perf_disc[feat].add(dim_params[0] + dim_params[1])
Expand All @@ -61,6 +70,7 @@ def build(self):
cube = []
for feat in range(self.feature_count):
if feat in location_params.keys():
assert feat < self.rel_feature_count
dim_params = location_params[feat]
if dim_params[0] < self.dim_borders[feat][0] \
or dim_params[0] + dim_params[1] > self.dim_borders[feat][1]:
Expand Down Expand Up @@ -122,9 +132,8 @@ def generate_overlap_partition(rf, c):
return partition


def produce_data_generator(rf, irf, c, type, name):
total_f = rf + irf
dg = CubesGenerator(total_f, RADIUS, name)
def produce_data_generator(rf, irf, c, type, file_name):
dg = CubesGenerator(rf, irf, RADIUS, file_name)
# same number of records for each of the cubes + background
cube_rows = int(ROWS / (c + 1))
if type == 'c':
Expand Down Expand Up @@ -158,34 +167,21 @@ def produce_all_data_generators():

perf_subspaces = dict()
perf_discs = dict()
# relevant features 2 - 30
for rf in range(2, 3):
# cubes 1 - 10
for c in range(3, 4):
# cube types complete, incomplete, incomplete overlapping
for type in ['c']:

# relevant features 2 - 30
# for rf in range(2, 31):
# # cubes 1 - 10
# for c in range(1, 11):
# # cube types complete, incomplete, incomplete overlapping
# for type in ['c', 'i', 'io']:
if (c == 1 or rf / c < 2) and type != 'c':
continue
name = 'cubes_' + '{0:02d}'.format(rf) + '_' \
+ '{0:02d}'.format(c) + '_' \
+ type + '.csv'
# if os.path.exists(basedir + name) and os.path.exists(
# perf_disc_dir + 'cut_' + name.replace('csv', 'txt')):
# continue

dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name)
perf_discs[name] = dg.get_discs()
perf_subspaces[name] = dg.get_subspaces()
data_generators.append(dg)

def produce_dg(name, rf, c, type):

# if os.path.exists(basedir + name) and os.path.exists(
# perf_disc_dir + 'cut_' + name.replace('csv', 'txt')):
# continue

dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name + ".csv")
perf_discs[name] = dg.get_discs()
perf_subspaces[name] = dg.get_subspaces()
data_generators.append(dg)

util.collect_params(produce_dg)
for name in perf_discs:
write_cut_file(perf_disc_dir + 'cut_' + name.replace('csv', 'txt'), perf_discs[name])
write_cut_file(perf_disc_dir + 'cut_' + name + ".txt", perf_discs[name])
with open(perf_subspaces_file, 'w') as psf:
json.dump(perf_subspaces, psf)
return data_generators
Expand Down

0 comments on commit b5d861f

Please sign in to comment.