Skip to content
Permalink
8b0e2f8e44
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
303 lines (239 sloc) 10.2 KB
from abc import abstractmethod
import numpy as np
import pandas as pd
import random
import util
import time
import os
import json
import constants as cst
import experiments_logging as l
import socket
# Half-width of the feature domain: the factory functions in this module pass
# it as `radius`, and DataGenerator bounds every feature by [-radius, radius].
RADIUS = 2
# Edge length given to every cube created by produce_cube_generator.
CUBE_WIDTH = 1
# Row budget per data set: XOR data sets get exactly this many rows, cube data
# sets split it evenly between all cubes plus one background block.
ROWS = 6000
# Per-draw chance in generate_overlap_partition of copying a feature from
# outside a group into that group.
OVERLAP_PROBABILITY = 0.6
class CubeParameters:
    """Plain record describing one block of points for a cubes data set.

    Attributes:
        rows: number of points to draw for this block.
        loc: mapping ``feature index -> (lower bound, width)``; ``None``
            until a location is supplied.
        subspaces: starts out as an empty list.
    """

    def __init__(self, rows, loc=None):
        self.rows, self.loc = rows, loc
        self.subspaces = []
class DataGenerator:
    """Shared state and interface of the synthetic data generators.

    Attributes:
        file_name: name reported for the generated data set.
        radius: half-width of the value range of every feature.
        irf: number of irrelevant features.
        feature_count: relevant plus irrelevant feature count.
        dim_borders: one ``[-radius, radius]`` pair per feature.
        subspaces: list of relevant feature groups, initially empty.
        perf_disc: one set per relevant feature, seeded with the upper border.

    NOTE: the class has no ABC base, so the ``@abstractmethod`` markers below
    are declarative only and do not prevent instantiation.
    """

    def __init__(self, file_name, rel_feature_count, irr_feature_count, radius):
        self.file_name = file_name
        self.radius = radius
        self.irf = irr_feature_count
        self.feature_count = rel_feature_count + irr_feature_count
        self.dim_borders = [[-radius, radius] for _ in range(self.feature_count)]
        self.subspaces = []
        relevant_borders = self.dim_borders[:rel_feature_count]
        self.perf_disc = [{border[1]} for border in relevant_borders]

    def __repr__(self):
        return "(file_name={})".format(self.file_name)

    @abstractmethod
    def build(self):
        """Generate the data set."""
        ...

    @abstractmethod
    def get_discs(self):
        """Return the perfect discretization of the relevant features."""
        ...

    @abstractmethod
    def get_subspaces(self):
        """Return the groups of interacting features."""
        ...
class XorGenerator(DataGenerator):
    """Generate an XOR-style data set.

    ``rf - 1`` "free" relevant features are drawn uniformly from
    ``[-radius, radius]``; one further relevant feature gets its sign from the
    parity of the number of positive free features. ``irf`` irrelevant
    features are uniform noise. Every sign pattern of the free features gets
    its own class label.
    """

    def __init__(self, rf, irf, radius, rows, sigma, file_name):
        super().__init__(file_name, rf, irf, radius)
        self.rows = rows
        self.slave_features = rf - 1
        self.sigma = sigma
        # every relevant feature is cut at 0 and at the upper border
        self.perf_disc = [[0, radius] for _ in self.dim_borders[:rf]]
        # all relevant features form one interacting subspace
        self.subspaces = [list(range(rf))]

    def get_discs(self):
        """Return the cut points of every relevant feature."""
        return self.perf_disc

    def get_subspaces(self):
        """Return the single subspace holding all relevant features."""
        return self.subspaces

    def build(self):
        """Draw the data set.

        Returns:
            ``(data, file_name)`` where ``data`` is a float array with one row
            per point: free relevant features, the parity feature, the
            irrelevant features and, in the last column, the class label.
            When there are no free features (``rf == 1``) no relevant column
            is emitted at all.
        """
        rows, free = self.rows, self.slave_features

        if free > 0:
            r_dims = np.random.uniform(-self.radius, self.radius, (rows, free))
            positive = (r_dims > 0).astype(int)
            # +1 for an even number of positive free features, -1 for odd
            sign = 1 - 2 * (positive.sum(axis=1) % 2)
            parity_dim = sign.reshape(rows, 1) * np.random.uniform(0, self.radius, (rows, 1))
        else:
            r_dims = np.empty((rows, free))
            positive = (r_dims > 0).astype(int)
            parity_dim = np.empty((rows, free))

        irr_dims = np.random.uniform(-self.radius, self.radius, (rows, self.irf))

        # Label = 1 + the sign pattern read as a binary number (first feature
        # is the most significant bit). This is the numbering the former
        # recursive lookup table produced, without building 2**free entries.
        weights = 2 ** np.arange(free - 1, -1, -1)
        class_labels = (positive.dot(weights) + 1).reshape(rows, 1)

        data = np.concatenate((r_dims, parity_dim, irr_dims, class_labels), axis=1)

        if self.sigma:
            # Perturb every feature column but never the label column. The
            # width comes from the data itself so it also matches when no
            # relevant column was emitted.
            feature_columns = data.shape[1] - 1
            data[:, :feature_columns] += np.random.normal(0, self.sigma, (rows, feature_columns))
        return data, self.file_name
class CubesGenerator(DataGenerator):
    """Generate a data set made of axis-aligned hyper-cubes.

    Each registered cube constrains some relevant features to an interval and
    leaves all other features uniform over the full domain. A cube without
    any constrained feature is background and gets label 0.
    """

    def __init__(self, rel_feature_count, irr_feature_count, radius, file_name):
        super().__init__(file_name, rel_feature_count, irr_feature_count, radius)
        self.rel_feature_count = rel_feature_count
        self.cube_parameters = []

    def add_cube_parameter(self, cube_param):
        """Register a cube and record its subspace and cut points.

        Args:
            cube_param: object with ``rows`` and ``loc`` attributes, where
                ``loc`` maps feature index -> ``(lower bound, width)``.
                ``loc is None`` is replaced by an empty dict (background).
        """
        if cube_param.loc is None:
            cube_param.loc = {}
        self.cube_parameters.append(cube_param)

        location = cube_param.loc
        subspace = list(location)
        if subspace and subspace not in self.subspaces:
            self.subspaces.append(subspace)

        # perfect discretization: both cube edges are cut points, except that
        # the lower border of the domain itself is never a cut
        for feat in range(self.rel_feature_count):
            if feat not in location:
                continue
            dim_params = location[feat]
            low, width = dim_params[0], dim_params[1]
            # compare against this generator's own border, not the module
            # level RADIUS, so generators built with another radius work too
            if low != self.dim_borders[feat][0]:
                self.perf_disc[feat].add(low)
            self.perf_disc[feat].add(low + width)

    def build(self):
        """Draw the points of every registered cube.

        Returns:
            ``(data, file_name)`` where ``data`` has one row per point, one
            column per feature and the class label in the last column.

        Raises:
            ValueError: if a cube does not fit inside the feature domain.
        """
        cubes = []
        for cube_parameter in self.cube_parameters:
            location = cube_parameter.loc
            points_count = cube_parameter.rows
            # labels follow registration order; background blocks get 0
            label = len(cubes) + 1 if len(location) != 0 else 0

            cube = []
            for feat in range(self.feature_count):
                lower_border, upper_border = self.dim_borders[feat][0], self.dim_borders[feat][1]
                if feat in location:
                    assert feat < self.rel_feature_count
                    dim_params = location[feat]
                    low, width = dim_params[0], dim_params[1]
                    if low < lower_border or low + width > upper_border:
                        raise ValueError(
                            "The cube with params {} does not fit in dim {}!".format(location, feat))
                    column = np.random.uniform(0, width, points_count) + low
                else:
                    column = np.random.uniform(lower_border, upper_border, points_count)
                cube.append(column)

            cube.append(np.full(points_count, float(label)))
            cubes.append(cube)

        generated_data = np.concatenate([np.array(cube) for cube in cubes], axis=1).transpose()
        return generated_data, self.file_name

    def get_subspaces(self):
        """Return the distinct feature groups of the registered cubes."""
        return self.subspaces

    def get_discs(self):
        """Return the sorted cut points of every relevant feature."""
        return [sorted(cuts) for cuts in self.perf_disc]
def generate_partition(rf, c):
    """Randomly split the feature indices ``0 .. rf-1`` into ``c`` groups.

    The indices are shuffled and cut into ``c`` consecutive, disjoint slices.
    When ``c > 1`` every group holds at least two features.

    Args:
        rf: number of features to distribute.
        c: number of groups.

    Returns:
        A list of ``c`` lists that together contain every index exactly once.

    Raises:
        ValueError: if ``c > 1`` and there are fewer than ``2 * c`` features.
    """
    min_size = 2
    if c > 1 and rf < min_size * c:
        raise ValueError(
            "cannot split {} features into {} groups of at least {} features".format(rf, c, min_size))

    features = list(range(rf))
    random.shuffle(features)

    partition = []
    pivot = 0
    for i in range(c - 1):
        # keep min_size features in reserve for each group still to come
        largest = rf - pivot - (c - i - 1) * min_size
        size = random.randint(min_size, largest)
        partition.append(features[pivot:pivot + size])
        pivot += size
    # the last group takes whatever is left
    partition.append(features[pivot:])
    assert len(partition) == c
    return partition
def generate_overlap_partition(rf, c):
    """Build a random partition whose groups may share features.

    Starts from ``generate_partition(rf, c)``. For every group, up to half its
    size many draws are made; each draw succeeds with probability
    ``OVERLAP_PROBABILITY`` and then copies one random feature from outside
    the group into it. The same outside feature may be picked more than once.

    Args:
        rf: number of features to distribute.
        c: number of groups.

    Returns:
        A list of ``c`` lists of feature indices.
    """
    partition = generate_partition(rf, c)
    all_features = set(range(rf))
    for group in partition:
        # candidates are fixed per group, so compute them once
        outside = list(all_features - set(group))
        extra = []
        for _ in range(len(group) // 2):
            # a group that already holds every feature has nothing to borrow
            if random.uniform(0, 1) < OVERLAP_PROBABILITY and outside:
                extra.append(outside[random.randint(0, len(outside) - 1)])
        group.extend(extra)
    return partition
def produce_cube_generator(rf, irf, interactions, type, cubes, file_name):
    """Create a CubesGenerator with randomly placed cubes.

    Args:
        rf: number of relevant features.
        irf: number of irrelevant features.
        interactions: number of feature groups (subspaces).
        type: ``'c'`` - every group spans all relevant features,
            ``'i'`` - disjoint random groups,
            ``'io'`` - random groups that may overlap.
        cubes: number of cubes placed in every group.
        file_name: name handed to the generator.

    Returns:
        The configured CubesGenerator, including one background block.

    Raises:
        ValueError: if ``type`` is none of the values above.
    """
    dg = CubesGenerator(rf, irf, RADIUS, file_name)
    # same number of records for each of the interactions * cubes + background
    cube_rows = int(ROWS / (interactions * cubes + 1))

    if type == 'c':
        partition = [range(rf) for _ in range(interactions)]
    elif type == 'i':
        partition = generate_partition(rf, interactions)
    elif type == 'io':
        partition = generate_overlap_partition(rf, interactions)
    else:
        raise ValueError("no such type!")

    # a cube may start anywhere that still leaves room for its full width
    start_range = RADIUS * 2 - CUBE_WIDTH
    for group in partition:
        for _ in range(cubes):
            location = {
                feat: (random.uniform(0, 1) * start_range - RADIUS, CUBE_WIDTH)
                for feat in group
            }
            dg.add_cube_parameter(CubeParameters(cube_rows, location))

    # background block without any constrained feature
    dg.add_cube_parameter(CubeParameters(cube_rows))
    return dg
def produce_xor_generator(rf, irf, file_name):
    """Create an XorGenerator using the module-wide radius and row count.

    The noise level (sigma) is fixed at 0.1.
    """
    generator = XorGenerator(
        rf=rf,
        irf=irf,
        radius=RADIUS,
        rows=ROWS,
        sigma=0.1,
        file_name=file_name,
    )
    return generator
def produce_all_data_generators():
    """Create a generator for every data set that has not been produced yet.

    Sets the module global ``basedir``, makes sure the output directories
    exist, writes a cut file with the perfect discretization of every new
    data set and merges their subspaces into the subspaces JSON file.

    Returns:
        The list of newly created data generators; data sets whose CSV and
        cut file both already exist are skipped.
    """
    global basedir
    basedir = cst.DATA_DIR
    perf_disc_dir = cst.PERFECT_DISCRETIZATIONS_DIR
    # makedirs also creates missing parents and tolerates an existing directory
    os.makedirs(basedir, exist_ok=True)
    os.makedirs(perf_disc_dir, exist_ok=True)

    data_generators = []
    perf_subspaces = {}
    perf_discs = {}

    def produce_dg(name, interaction_type, rf, i, type, cubes):
        csv_path = basedir + name + ".csv"
        cut_path = perf_disc_dir + 'cut_' + name + ".txt"
        if os.path.exists(csv_path) and os.path.exists(cut_path):
            return

        if interaction_type == cst.InteractionType.CUBES:
            dg = produce_cube_generator(rf, cst.IRRELEVANT_FEATURES, i, type, cubes, name + ".csv")
        elif interaction_type == cst.InteractionType.XOR:
            dg = produce_xor_generator(rf, cst.IRRELEVANT_FEATURES, name + ".csv")
        else:
            raise ValueError("no implementation of data generator for " + str(interaction_type.name))

        perf_discs[name] = dg.get_discs()
        perf_subspaces[name] = dg.get_subspaces()
        data_generators.append(dg)

    # presumably invokes produce_dg once per experiment configuration -
    # confirm against util.collect_params
    util.collect_params(produce_dg)

    for name, discs in perf_discs.items():
        write_cut_file(perf_disc_dir + 'cut_' + name + ".txt", discs)
    write_perf_subspaces(perf_subspaces)
    return data_generators
def write_perf_subspaces(perf_subspaces):
    """Merge ``perf_subspaces`` into the perfect-subspaces JSON file.

    Entries already stored in the file are kept, but an entry passed in here
    replaces a stored entry of the same name: the stored one describes a data
    set that has since been regenerated. The argument itself is not modified.

    Args:
        perf_subspaces: mapping data set name -> list of feature groups.
    """
    merged = {}
    if os.path.exists(cst.PERFECT_SUBSPACES_JSON):
        with open(cst.PERFECT_SUBSPACES_JSON, 'r') as psf:
            merged.update(json.load(psf))
    # new entries last, so that they win over stale stored ones
    merged.update(perf_subspaces)
    with open(cst.PERFECT_SUBSPACES_JSON, 'w') as psf:
        json.dump(merged, psf)
def write_cut_file(name, disc_intervals):
    """Write the cut points of every dimension to the text file ``name``.

    Each dimension contributes a header line with its index and number of
    cut points, one line per cut point (one decimal place) and a dashed
    separator line.
    """
    separator = '-------------------------------------\n'
    with open(name, 'w') as out:
        for dim, cuts in enumerate(disc_intervals):
            out.write('dimension {} ({} bins)\n'.format(dim, len(cuts)))
            out.writelines('{:.1f}\n'.format(cut) for cut in cuts)
            out.write(separator)
def store(data):
    """Write a generated data set as CSV below the module global ``basedir``.

    ``data`` is indexed as ``(values, file name)``. The file is written with
    ``;`` as separator, two decimal places, no header row and no index column.
    """
    global basedir
    values, name = data[0], data[1]
    frame = pd.DataFrame(values)
    target = basedir + name
    frame.to_csv(target, sep=';', header=False, index=False, float_format='%.2f')
if __name__ == '__main__':
    # l.plot_data_3d(produce_xor_generator(3, 0, 'bla').build()[0])
    # print(generate_overlap_partition(7, 3))
    # build and store every data set that does not exist yet
    for generator in produce_all_data_generators():
        store(generator.build())