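"""Generators for synthetic datasets with relevant and irrelevant features:
hypercubes, Gaussian blobs, and XOR patterns. Each generator also produces
the perfect discretization cuts and the relevant-subspace ground truth."""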
from abc import abstractmethod
import numpy as np
import pandas as pd
import random
import util
import os
import json
import constants as cst
import experiments_logging as l

RADIUS = 2                 # half-width of the data space in every dimension
CUBE_WIDTH = 1             # edge length of each generated hypercube
ROWS = 6000                # default number of rows per dataset
OVERLAP_PROBABILITY = 0.6  # chance that a partition part borrows an extra feature

class CubeParameters:
    def __init__(self, rows, loc=None):
        self.rows = rows
        self.loc = loc  # maps feature index -> (start, width) of the cube in that dimension
        self.subspaces = []

class DataGenerator:
    def __init__(self, file_name, rel_feature_count, irr_feature_count, radius):
        self.radius = radius
        self.feature_count = rel_feature_count + irr_feature_count
        self.irf = irr_feature_count
        self.file_name = file_name
        self.dim_borders = [[-radius, radius] for _ in range(self.feature_count)]
        self.subspaces = []

    def __repr__(self):
        return '(file_name=' + str(self.file_name) + ")"

    @abstractmethod
    def build(self):
        ...

    @abstractmethod
    def get_subspaces(self):
        ...

class XorGenerator(DataGenerator):
    def __init__(self, rf, irf, radius, rows, sigma, file_name, distribution='uniform', offset=(0, 0)):
        super().__init__(file_name, rf, irf, radius)
        self.offset = offset  # (dimension index, shift) applied to one relevant dimension
        self.distribution = distribution
        self.rows = rows
        self.slave_features = rf - 1
        self.rf = rf
        self.sigma = sigma
        self.subspaces = [[f for f in range(rf)]]

    def transform_to_gauss(self, a):
        # Map each uniform coordinate to a Gaussian centered at -1 or +1 (the sign
        # of the coordinate); the offset dimension gets an adjusted spread so the
        # shifted blob still fits into the data space.
        return [np.random.normal((d > 0) * 2 - 1,
                                 (self.radius - ((d > self.offset[1]) * 2 - 1) * self.offset[1]) / 4)
                if i == self.offset[0]
                else np.random.normal((d > 0) * 2 - 1, self.radius / 4)
                for i, d in enumerate(a)]

    def get_subspaces(self):
        return self.subspaces

    def build(self):
        # Sample the relevant dimensions uniformly; the dimension at offset[0] is
        # drawn from a range widened by offset[1], since it is shifted later.
        offset_r_dim = np.random.uniform(-self.radius - self.offset[1], self.radius, (self.rows, 1))
        r_dims = np.random.uniform(-self.radius, self.radius, (self.rows, self.slave_features - 1)) \
            if self.slave_features > 0 else np.empty((self.rows, self.slave_features))
        r_dims = np.concatenate((r_dims[:, :self.offset[0]], offset_r_dim, r_dims[:, self.offset[0]:]), axis=1)
        # The last relevant dimension carries the XOR signal: its sign equals the
        # parity of the signs of all other relevant dimensions.
        parity_dim = (np.sum(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(self.rows, 1) \
            * np.random.uniform(0, self.radius, (self.rows, 1)) \
            if self.slave_features > 0 else np.empty((self.rows, self.slave_features))
        xor_cube = np.concatenate((r_dims, parity_dim), axis=1)
        if self.distribution == 'gauss':
            xor_cube = np.apply_along_axis(self.transform_to_gauss, 1, xor_cube)
        irr_dims = np.random.uniform(-self.radius, self.radius, (self.rows, self.irf))

        # Enumerate all sign patterns of the relevant dimensions recursively and
        # give each (pattern, parity) combination a distinct class label.
        xor_dict = dict()
        counter = [0]
        curr = []

        def add_value(r):
            if r == 0:
                counter[0] += 1
                xor_dict["".join([str(i) for i in curr]) + str(sum(curr) % 2)] = counter[0]
                return
            for i in [0, 1]:
                curr.append(i)
                add_value(r - 1)
                curr.pop()

        add_value(self.slave_features)
        class_labels = np.apply_along_axis(lambda a: xor_dict["".join([str(d) for d in a])], 1,
                                           np.concatenate([np.array(r_dims > 0, dtype='int'),
                                                           (np.sum(r_dims > 0, axis=1) % 2).reshape(self.rows, 1)],
                                                          axis=1))
        class_labels = class_labels.reshape([class_labels.shape[0], 1])
        xor_cube[:, self.offset[0]] = xor_cube[:, self.offset[0]] + self.offset[1]
        data = np.concatenate((xor_cube, irr_dims, class_labels), axis=1)
        if self.sigma:
            # Add Gaussian noise to the feature columns; the label column stays exact.
            e = np.concatenate((np.random.normal(0, self.sigma, (self.rows, self.slave_features + self.irf + 1)),
                                np.zeros((self.rows, 1))), axis=1)
            data = data + e
        return data, \
            [[data[:, sf].min(), 0 if sf != self.offset[0] else self.offset[1], data[:, sf].max()]
             for sf in range(self.slave_features + 1)], \
            self.file_name

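# Hypothetical illustration (values are random): with rf=3, irf=2, rows=100,
# XorGenerator(3, 2, RADIUS, 100, 0.1, 'xor.csv').build() returns a 100 x 6
# matrix (three relevant XOR dimensions, two irrelevant dimensions, one
# class-label column), the cut points for the relevant dimensions, and the
# file name.
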
class CubesGenerator(DataGenerator):
    def __init__(self, rel_feature_count, irr_feature_count, radius, file_name, distribution='uniform'):
        super().__init__(file_name, rel_feature_count, irr_feature_count, radius)
        self.distribution = distribution
        self.rel_feature_count = rel_feature_count
        self.cube_parameters = []
        self.perf_disc_set = [set() for _ in range(rel_feature_count)]

    def add_cube_parameter(self, cube_param):
        if cube_param.loc is None:
            cube_param.loc = {}
        self.cube_parameters.append(cube_param)
        location_params = cube_param.loc
        s = list(location_params.keys())
        if s and s not in self.subspaces:
            self.subspaces.append(s)
        # Record the cube borders as the perfect discretization cuts.
        for feat in range(self.rel_feature_count):
            if feat in cube_param.loc.keys():
                dim_params = location_params[feat]
                self.perf_disc_set[feat].add(dim_params[0])
                self.perf_disc_set[feat].add(dim_params[0] + dim_params[1])

    def build(self):
        cubes = []
        for cube_parameter in self.cube_parameters:
            location_params = cube_parameter.loc
            points_count = cube_parameter.rows
            # A cube without location parameters is background noise (class 0).
            if len(location_params) == 0:
                label = 0
            else:
                label = len(cubes) + 1
            cube = []
            for feat in range(self.feature_count):
                if feat in location_params.keys():
                    assert feat < self.rel_feature_count
                    dim_params = location_params[feat]
                    if dim_params[0] < self.dim_borders[feat][0] \
                            or dim_params[0] + dim_params[1] > self.dim_borders[feat][1]:
                        raise ValueError(
                            "The cube with params " + str(location_params) + " does not fit in dim " + str(
                                feat) + "!")
                    if self.distribution == 'uniform':
                        column = np.random.uniform(0, dim_params[1], points_count) \
                            + np.ones(points_count) * dim_params[0]
                    else:
                        column = np.random.normal(dim_params[0] + dim_params[1] / 2, dim_params[1] / 4, points_count)
                else:
                    # Dimensions without location parameters are filled uniformly at random.
                    column = np.random.uniform(self.dim_borders[feat][0], self.dim_borders[feat][1], points_count)
                cube.append(column)
            class_labels = np.empty(points_count)
            class_labels.fill(label)
            cube.append(class_labels)
            cubes.append(cube)
        generated_data = np.concatenate([np.array(cube) for cube in cubes], axis=1).transpose()
        return generated_data, \
            [sorted(p.union([min(generated_data[:, i]), max(generated_data[:, i])]))
             for i, p in enumerate(self.perf_disc_set)], \
            self.file_name

    def get_subspaces(self):
        return self.subspaces

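# A minimal usage sketch with hypothetical parameters: one labeled cube plus a
# uniform background cube, both of 500 rows.
# g = CubesGenerator(rel_feature_count=2, irr_feature_count=1, radius=RADIUS, file_name='demo.csv')
# g.add_cube_parameter(CubeParameters(500, loc={0: (-2, 1), 1: (0.5, 1)}))  # class 1
# g.add_cube_parameter(CubeParameters(500))                                 # background, class 0
# data, perf_disc, name = g.build()  # data: 1000 rows x 4 columns (3 features + label)
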
def generate_partition(rf, interactions):
    # Randomly split the rf relevant features into `interactions` disjoint groups
    # of size at least 2 (unless there is only one group).
    arr = [i for i in range(rf)]
    if interactions == 1:
        return [arr]
    random.shuffle(arr)
    min_size = 2
    pivot = 0
    partition = []
    for i in range(interactions - 1):
        max_size = rf - pivot - (interactions - i - 1) * min_size
        t = random.randint(min_size, max_size)
        partition.append(arr[pivot: pivot + t])
        pivot += t
    partition.append(arr[pivot:])
    assert len(partition) == interactions
    return partition

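# Hypothetical example of the result (actual output is shuffled): with rf=6 and
# interactions=2, generate_partition(6, 2) could yield [[4, 0, 3], [1, 5, 2]],
# i.e. two disjoint groups, each of size >= 2, covering all six features.
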
def generate_overlap_partition(rf, c):
    partition = generate_partition(rf, c)
    additions = []
    for p in partition:
        add = []
        # Each part may borrow up to half its size in extra features from other
        # parts, each borrowed with probability OVERLAP_PROBABILITY.
        for _ in range(int(len(p) / 2)):
            if random.uniform(0, 1) < OVERLAP_PROBABILITY:
                others = list({i for i in range(rf)} - set(p))
                rand = random.randint(0, rf - len(p) - 1)
                add.append(others[rand])
        additions.append(add)
    for i, p in enumerate(partition):
        for add in additions[i]:
            p.append(add)
    return partition

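# Hypothetical example: starting from [[4, 0, 3], [1, 5, 2]], the first part can
# borrow one of the features 1, 2, 5 on each of len(p)//2 attempts, each with
# probability OVERLAP_PROBABILITY, yielding e.g. [[4, 0, 3, 5], [1, 5, 2]].
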
def produce_random_generator(irf, file_name, rows=ROWS):
    # Purely irrelevant data: a single unlabeled uniform background cube.
    dg = CubesGenerator(0, irf, RADIUS, file_name)
    dg.add_cube_parameter(CubeParameters(rows))
    return dg

# TODO: add type annotations
def produce_cube_generator(rf, irf, interactions, cubes, file_name, rows=ROWS, distribution="uniform"):
    dg = CubesGenerator(rf, irf, RADIUS, file_name, distribution)
    # Spread the rows evenly over interactions * cubes labeled cubes plus the background.
    cube_rows = int(rows / (interactions * cubes + 1))
    partition = generate_partition(rf, interactions)
    for p in partition:
        for cube in range(cubes):
            location = dict()
            for j in p:
                # Re-sample the cube position until its borders stay at least 0.1
                # away from all previously recorded discretization cuts.
                temp_location = (random.uniform(0, 1) * (RADIUS * 2 - 1) - RADIUS, CUBE_WIDTH)
                while np.any([abs(temp_location[0] - d) < 0.1
                              or abs(temp_location[0] + temp_location[1] - d) < 0.1
                              for d in dg.perf_disc_set[j]]):
                    temp_location = (random.uniform(0, 1) * (RADIUS * 2 - 1) - RADIUS, CUBE_WIDTH)
                location[j] = temp_location
            dg.add_cube_parameter(CubeParameters(cube_rows, location))
    dg.add_cube_parameter(CubeParameters(cube_rows))
    return dg

def produce_xor_generator(rf, irf, file_name, rows=ROWS, distribution='uniform', offset=(0, 0)):
    # sigma=0.1 is the fixed noise level for XOR datasets.
    return XorGenerator(rf, irf, RADIUS, rows, 0.1, file_name, distribution=distribution, offset=offset)

def produce_all_data_generators():
    if cst.REAL_DATASETS is not None:
        raise ValueError("set REAL_DATASETS to None in order to produce artificial datasets!")
    data_generators = []
    global basedir, perf_disc_dir
    basedir = cst.DATA_DIR
    if not os.path.exists(basedir):
        os.mkdir(basedir)
    perf_disc_dir = cst.PERFECT_DISCRETIZATIONS_DIR
    if not os.path.exists(perf_disc_dir):
        os.mkdir(perf_disc_dir)
    perf_subspaces = dict()
    # perf_discs = dict()

    def produce_dg(name, interaction_type, rows, rf, i, cubes, offset):
        # Skip datasets whose data and cut files already exist.
        if os.path.exists(basedir + name + ".csv") and os.path.exists(perf_disc_dir + 'cut_' + name + ".txt"):
            return
        if interaction_type == cst.InteractionType.CUBES:
            dg = produce_cube_generator(rf, cst.TOTAL_IRRELEVANT_FEATURES, i, cubes, name + ".csv", rows, "uniform")
        elif interaction_type == cst.InteractionType.BLOBS:
            dg = produce_cube_generator(rf, cst.TOTAL_IRRELEVANT_FEATURES, i, cubes, name + ".csv", rows, "gauss")
        elif interaction_type == cst.InteractionType.XORCUBES:
            dg = produce_xor_generator(rf, cst.TOTAL_IRRELEVANT_FEATURES, name + ".csv", rows, 'uniform', offset)
        elif interaction_type == cst.InteractionType.XORBLOBS:
            dg = produce_xor_generator(rf, cst.TOTAL_IRRELEVANT_FEATURES, name + ".csv", rows, 'gauss', offset)
        elif interaction_type == cst.InteractionType.UNIFORM:
            dg = produce_random_generator(cst.TOTAL_IRRELEVANT_FEATURES, name + ".csv", rows=rows)
        else:
            raise ValueError("no implementation of data generator for", interaction_type.name)
        # perf_discs[name] = dg.get_discs()
        perf_subspaces[name] = dg.get_subspaces()
        data_generators.append(dg)

    util.datasets_iterator(produce_dg)
    # for name in perf_discs:
    #     write_cut_file(perf_disc_dir + 'cut_' + name + ".txt", perf_discs[name])
    write_perf_subspaces(perf_subspaces)
    return data_generators

def write_perf_subspaces(perf_subspaces):
    # Merge with any previously stored subspaces; existing entries take precedence.
    all_perf_subspaces = perf_subspaces
    if os.path.exists(cst.PERFECT_SUBSPACES_JSON):
        with open(cst.PERFECT_SUBSPACES_JSON, 'r') as psf:
            old_perf_subspaces = json.load(psf)
        all_perf_subspaces.update(old_perf_subspaces)
    with open(cst.PERFECT_SUBSPACES_JSON, 'w') as psf:
        json.dump(all_perf_subspaces, psf)

def write_cut_file(name, disc_intervals):
    with open(name, 'w') as out:
        for i in range(len(disc_intervals)):
            out.write('dimension ' + str(i) + ' (' + str(len(disc_intervals[i])) + ' bins)\n')
            for break_point in disc_intervals[i]:
                out.write(format(break_point, '.2f') + '\n')
            out.write('-------------------------------------\n')

def store(data):
    # data is the (matrix, perfect discretization, file name) triple returned by build().
    global basedir, perf_disc_dir
    name = data[2]
    pd.DataFrame(data[0]).to_csv(basedir + name, sep=';', header=False, index=False, float_format='%.2f')
    write_cut_file(perf_disc_dir + 'cut_' + name.replace(".csv", ".txt"), data[1])

if __name__ == '__main__':
    # print(produce_cube_generator(5, 1, 2, 2, "bla", 100, "uniform").build()[1])
    # exit(2)
    # l.plot_data_3d(produce_xor_generator(3, 0, 'bla').build()[0])
    # print(generate_partition(10, 3))
    generators = produce_all_data_generators()
    for g in generators:
        store(g.build())