Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
added interactionType which can be CUBES or XOR
added XorGenerator
  • Loading branch information
Tatiana Dembelova committed Oct 19, 2017
1 parent f9ee841 commit 8b0e2f8
Show file tree
Hide file tree
Showing 8 changed files with 196 additions and 82 deletions.
25 changes: 16 additions & 9 deletions ID_correlation_measure.py → ID_sm.py
Expand Up @@ -3,6 +3,7 @@
import pandas as pd
import interaction_distance as id
import data_generator as dg
import matplotlib.pyplot as plt

def evidence_ID():
# no interaction
Expand Down Expand Up @@ -35,29 +36,35 @@ def evidence_ID():
# log.plot_data_2d(df)
# log.plot_data_2d(pd.concat([df.loc[:100], df.loc[7900:8000]], axis=0))

cg = dg.produce_cube_generator(7, 0, 2, "i", ".csv")
cg = dg.produce_cube_generator(7, 0, 2, "i", 1, ".csv")
data, filname = cg.build()
print(cg.subspaces)
print(cg.perf_disc)

data = pd.DataFrame(data)
dim_count = data.shape[1]
for curr in data[:-1]:
for curr in range(dim_count - 1):
dims = data.columns.tolist()
dims.remove(curr)
dims.remove(dim_count - 1)
curr_data = data.sort_values(by=curr).reset_index(drop=True).loc[:, dims]
rows = curr_data.shape[0]
projected_data = data.sort_values(by=curr).reset_index()
curr_index = projected_data['index']
projected_data = projected_data.loc[:, dims]
rows = projected_data.shape[0]
print('curr dimension', curr)
for dim in dims:
counter = 0
ids = []
dim_x = []
while(True):
if counter + 280 > rows:
if counter + 140 > rows:
break
ids.append(id.compute_ID(curr_data.loc[counter:counter + 140, dim].to_frame(),
curr_data.loc[counter + 140: counter + 280, dim].to_frame(), [2] * dim_count))
counter += 1
ids.append(id.compute_ID(projected_data.loc[counter:counter + 70, dim].to_frame(),
projected_data.loc[counter + 70: counter + 140, dim].to_frame(), [2] * dim_count))
dim_x.append(data.loc[curr_index.loc[counter + 70], curr])
counter += 140
# TODO: needs data normalization
print('interaction with', dim, np.average(ids))
print('interaction with', dim, np.average(ids), sum([1 if ID > np.average(ids) else 0 for ID in ids]))
plt.plot(dim_x, ids)
plt.show()
# break
9 changes: 8 additions & 1 deletion constants.py
Expand Up @@ -33,6 +33,11 @@ class DistanceMeasure(Enum):
CJS = 2


class InteractionType(Enum):
    """Kind of feature interaction a synthetic data set is generated with."""

    # data produced by the cube-based generator
    CUBES = 1
    # data produced by the XOR (parity) generator
    XOR = 2


ID_THRESHOLD_QUANTILE = 0.3
ID_SLIDING_WINDOW = 40

Expand Down Expand Up @@ -72,13 +77,15 @@ class DistanceMeasure(Enum):
INTERACTIONS_LOWER_BOUND=3 if socket.gethostname() != 'push' else 1

# new settings (more constrained)
INTERACTION_TYPE_RANGE_LIST=[InteractionType.CUBES, InteractionType.XOR]
IRRELEVANT_FEATURES_RANGE_LIST = [0, 1, 2, 4, 8, 16, 32, 64, 99, 3, 6, 12, 24, 48, 82] if socket.gethostname() != 'push' else [0, 1, 2, 3]
RELEVANT_FEATURES_RANGE_LIST = [2, 3, 4, 6, 8, 12, 16, 23, 30] if socket.gethostname() == 'push' else [7]
INTERACTION_NUMBER_RANGE_LIST = [1, 2, 4, 8, 10] if socket.gethostname() == 'push' else [3]
INTERACTION_TYPES_RANGE_LIST=["c", 'i', "io"] if socket.gethostname() == 'push' else ['i']
PARTITION_TYPES_RANGE_LIST=["c", 'i', "io"] if socket.gethostname() == 'push' else ['i']
CUBES_LOWER_BOUND=1
CUBES_UPPER_BOUND=3
NAIVE_CHUNKS_NUMBER_RANGE_LIST = [2, 3, 4, 5, 10, 20, 30]
XOR_SIGMA=0.1


BASE = '/local/tmp/ipd_extended_experiments2/' if socket.gethostname() == 'push' \
Expand Down
117 changes: 97 additions & 20 deletions data_generator.py
@@ -1,3 +1,5 @@
from abc import abstractmethod

import numpy as np
import pandas as pd
import random
Expand All @@ -22,20 +24,90 @@ def __init__(self, rows, loc=None):
self.subspaces = []


class CubesGenerator:
def __init__(self, rel_feature_count, irr_feature_count, radius, file_name):
self.rel_feature_count = rel_feature_count
self.file_name = file_name
self.cube_parameters = []
class DataGenerator:
def __init__(self, file_name, rel_feature_count, irr_feature_count, radius):
self.radius = radius
self.feature_count = rel_feature_count + irr_feature_count
self.irf = irr_feature_count
self.file_name = file_name
self.dim_borders = [[-radius, radius] for d in range(self.feature_count)]
self.subspaces = []
self.perf_disc = [{d[1]} for d in self.dim_borders[:rel_feature_count]]

def __repr__(self):
return 'CubesGenerator(file_name=' + str(self.file_name) \
+ ', rel_feature_count=' + str(self.rel_feature_count) \
+ ', feature_count=' + str(self.feature_count) + ")"
return '(file_name=' + str(self.file_name) + ")"

@abstractmethod
def build(self):
...

@abstractmethod
def get_discs(self):
...

@abstractmethod
def get_subspaces(self):
...


class XorGenerator(DataGenerator):
    """Generate a noisy n-dimensional XOR (parity) data set.

    The first ``rf - 1`` relevant ("slave") features are drawn uniformly from
    ``[-radius, radius]``.  The last relevant feature is the parity feature:
    its magnitude is uniform in ``[0, radius]`` and its sign is positive when
    an even number of slave features is positive and negative otherwise.
    ``irf`` irrelevant features are drawn uniformly from ``[-radius, radius]``.
    """

    def __init__(self, rf, irf, radius, rows, sigma, file_name):
        """Store the generation parameters.

        :param rf: number of relevant features (slave features + parity feature)
        :param irf: number of irrelevant features
        :param radius: half-width of the value range of every feature
        :param rows: number of rows to generate
        :param sigma: standard deviation of the Gaussian noise added to all
            feature columns; a falsy value disables the noise
        :param file_name: name returned together with the data by ``build``
        """
        super().__init__(file_name, rf, irf, radius)
        self.rows = rows
        # every relevant feature except the parity feature
        self.slave_features = rf - 1
        self.sigma = sigma
        # one [0, radius] pair per relevant feature
        # (presumably the ideal cut points - confirm against the consumers)
        self.perf_disc = [[0, radius] for _ in self.dim_borders[:rf]]
        # a single subspace made of all relevant feature indices
        self.subspaces = [list(range(rf))]

    def get_discs(self):
        """Return ``perf_disc`` as set up in the constructor."""
        return self.perf_disc

    def get_subspaces(self):
        """Return ``subspaces`` as set up in the constructor."""
        return self.subspaces

    def build(self):
        """Generate the data set.

        :return: tuple ``(data, file_name)``; ``data`` is a float array with
            the slave features, the parity feature (absent when there are no
            slave features), the irrelevant features and, as last column, the
            class label.  Labels are ``1 .. 2 ** slave_features`` and encode
            the sign pattern of the slave features as a binary number
            (first feature = most significant bit).
        :raises ValueError: if fewer than one relevant feature was requested
        """
        slaves = self.slave_features
        if slaves < 0:
            raise ValueError("XorGenerator needs at least one relevant feature, got rf="
                             + str(slaves + 1))

        # NOTE: the order and shapes of the random draws are kept stable so
        # that seeded runs stay reproducible.
        if slaves > 0:
            r_dims = np.random.uniform(-self.radius, self.radius, (self.rows, slaves))
            odd_parity = np.sum(r_dims > 0, axis=1) % 2
            # +1 for an even number of positive slave features, -1 for an odd one
            sign = -(odd_parity * 2 - 1).reshape(self.rows, 1)
            parity_dim = sign * np.random.uniform(0, self.radius, (self.rows, 1))
        else:
            r_dims = np.empty((self.rows, 0))
            parity_dim = np.empty((self.rows, 0))

        irr_dims = np.random.uniform(-self.radius, self.radius, (self.rows, self.irf))

        # Label = 1 + binary number formed by the sign bits of the slave
        # features.  Computed vectorized instead of enumerating all
        # 2 ** slaves sign patterns in a lookup table.
        weights = 2 ** np.arange(slaves - 1, -1, -1, dtype=np.int64)
        sign_bits = (r_dims > 0).astype(np.int64)
        class_labels = (np.dot(sign_bits, weights) + 1).reshape(self.rows, 1)

        data = np.concatenate((r_dims, parity_dim, irr_dims, class_labels), axis=1)
        if self.sigma:
            # Noise on every feature column, none on the label column.  The
            # width is derived from the data so it also fits when the parity
            # column is absent.
            noise = np.random.normal(0, self.sigma, (self.rows, data.shape[1] - 1))
            data = data + np.concatenate((noise, np.zeros((self.rows, 1))), axis=1)
        return data, self.file_name

class CubesGenerator(DataGenerator):
def __init__(self, rel_feature_count, irr_feature_count, radius, file_name):
super().__init__(file_name, rel_feature_count, irr_feature_count, radius)
self.rel_feature_count = rel_feature_count
self.cube_parameters = []


def add_cube_parameter(self, cube_param):
if cube_param.loc is None:
Expand Down Expand Up @@ -155,6 +227,10 @@ def produce_cube_generator(rf, irf, interactions, type, cubes, file_name):
return dg


def produce_xor_generator(rf, irf, file_name):
    """Create an XorGenerator using the module-level RADIUS and ROWS.

    The noise level is taken from ``cst.XOR_SIGMA`` so that it is configured
    in one place instead of being repeated here as a literal.

    :param rf: number of relevant features
    :param irf: number of irrelevant features
    :param file_name: file name handed to the generator
    :return: the configured XorGenerator
    """
    return XorGenerator(rf, irf, RADIUS, ROWS, cst.XOR_SIGMA, file_name)


def produce_all_data_generators():
data_generators = []
global basedir
Expand All @@ -168,13 +244,17 @@ def produce_all_data_generators():
perf_subspaces = dict()
perf_discs = dict()

def produce_dg(name, rf, i, type, cubes):
def produce_dg(name, interaction_type, rf, i, type, cubes):

if os.path.exists(basedir + name) and os.path.exists(
perf_disc_dir + 'cut_' + name.replace('csv', 'txt')):
if os.path.exists(basedir + name + ".csv") and os.path.exists(perf_disc_dir + 'cut_' + name + ".txt"):
return

dg = produce_cube_generator(rf, cst.IRRELEVANT_FEATURES, i, type, cubes, name + ".csv")
if interaction_type == cst.InteractionType.CUBES:
dg = produce_cube_generator(rf, cst.IRRELEVANT_FEATURES, i, type, cubes, name + ".csv")
elif interaction_type == cst.InteractionType.XOR:
dg = produce_xor_generator(rf, cst.IRRELEVANT_FEATURES, name + ".csv")
else:
raise ValueError("no implementation of data generator for", interaction_type.name)
perf_discs[name] = dg.get_discs()
perf_subspaces[name] = dg.get_subspaces()
data_generators.append(dg)
Expand Down Expand Up @@ -214,13 +294,10 @@ def store(data):


if __name__ == '__main__':
cg = produce_cube_generator(7, 2, 3, 'c', 'bla')
print(cg.subspaces)
cg = produce_cube_generator(7, 2, 3, 'i', 'bla')
print(cg.subspaces)
# l.plot_data_3d(produce_xor_generator(3, 0, 'bla').build()[0])
# print(generate_overlap_partition(7, 3))

# generators = produce_all_data_generators()
# for g in generators:
#
# store(g.build())
generators = produce_all_data_generators()
for g in generators:

store(g.build())
2 changes: 1 addition & 1 deletion discretization_quality_measure.py
Expand Up @@ -166,7 +166,7 @@ def prepare_compression1(experiment_name):
return False
return True

def run_compression1(name, rf=None, i=None, type=None, c=None):
def run_compression1(name, it=None, rf=None, i=None, type=None, c=None):
# 1. check slim db
# convert dat-file to db-file if it does not exist
if not os.path.exists(cst.SLIM_DATA_DIR + name + "/" + name + ".db"):
Expand Down
20 changes: 10 additions & 10 deletions experiments_logging.py
Expand Up @@ -5,6 +5,7 @@
from mpl_toolkits.mplot3d import Axes3D
import data_generation as dg_old
import util
import numpy as np


def plot_disc(problem, method):
Expand Down Expand Up @@ -41,18 +42,17 @@ def plot_data_3d(data):
ax = fig.add_subplot(111, projection='3d')
# data = data[np.logical_and(data[0] < 0, data[1] > 0)]

## 3d parity problem
# color_cond = {'b': np.logical_and(data[0] < 0, np.logical_and(data[1] > 0, data[2] < 0)),
# 'k': np.logical_and(data[0] < 0, np.logical_and(data[1] > 0, data[2] > 0)),
# 'g': np.logical_and(data[0] > 0, data[1] < 0),
# 'r': np.logical_and(data[0] < 0, data[1] < 0),
# 'c': np.logical_and(data[0] > 0, data[1] > 0),
# }
# for c in color_cond:
# ax.scatter(data[0][color_cond[c]], data[1][color_cond[c]], data[2][color_cond[c]], c=c, s=1)
# 3d parity problem
color_cond = {'b': data[3] == 1,
'k': data[3] == 2,
'r': data[3] == 3,
'g': data[3] == 4,
}
for c in color_cond:
ax.scatter(data[0][color_cond[c]], data[1][color_cond[c]], data[2][color_cond[c]], c=c, s=1)

## without coloring
ax.scatter(data[0], data[1], data[2], c='k', s=1)
# ax.scatter(data[0], data[1], data[2], c='k', s=1)

ax.set_xlabel('X0')
ax.set_ylabel('X1')
Expand Down
7 changes: 5 additions & 2 deletions main.py
Expand Up @@ -727,7 +727,7 @@ def prepare(base_dir, data_file, method, time_mark=False, delim=";", columns=Non

def collect_experiment_params(base_dir):

def collect(name, rf, i, type, c):
def collect(name, it, rf, i, type, c):
params = []
file_path = cst.DATA_DIR + name + ".csv"

Expand Down Expand Up @@ -766,10 +766,13 @@ def collect(name, rf, i, type, c):


if __name__ == "__main__":

# print(compute_predefined_subspace_sets_naive(5))
# exit(1)
# cubes_03_10_c
# print(compute_predefined_subspace_sets(3, [[0,1,2]]))
# exit(1)
params = collect_experiment_params("logs_test3")
params = collect_experiment_params("logs_test")
# print(params)
# print(compute_subspace_sets("cubes_10_03_i.csv", cst.Method.PREDEFINED_SUBSPACESETS))
# exit(1)
Expand Down
25 changes: 13 additions & 12 deletions runExperiment.py
Expand Up @@ -94,18 +94,18 @@ def register_ideal_disc(self, name):
loader = Loader()

# todo items.put(WHATEVER PARAMETERS OF TASK)
# params = dg.produce_all_data_generators()
# for data_generator in params:
# items.put(data_generator)
params = dg.produce_all_data_generators()
for data_generator in params:
items.put(data_generator)

params = main.collect_experiment_params("logs_test")
# params = main.collect_experiment_params("logs_test")
if len(params) == 0:
print("no parameters collected!")
exit(0)
for param in params:
loader.register_dataset(param.data_file)
loader.register_ideal_disc(param.experiment_name)
items.put(param)
# for param in params:
# loader.register_dataset(param.data_file)
# loader.register_ideal_disc(param.experiment_name)
# items.put(param)

if onlyListTasks:
while not items.empty():
Expand Down Expand Up @@ -143,8 +143,8 @@ def worker(worker_id):
print('Worker ID ', worker_id, 'is executing', para)
# todo generate data sets

# datasets.put(para.build())
datasets.put(main.execute(para, loader))
datasets.put(para.build())
# datasets.put(main.execute(para, loader))
print('Worker ID ', worker_id, ' execution finished')
with counterLock:
if runningMain:
Expand All @@ -161,8 +161,9 @@ def datasetWriter():
while True:
try:
result = datasets.get(block=True, timeout=10)
# dg.store(result)
main.store(result, loader)
# todo store
dg.store(result)
# main.store(result, loader)
except queue.Empty:
break

Expand Down

0 comments on commit 8b0e2f8

Please sign in to comment.