Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ipd_extended/data_generator.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
330 lines (267 sloc)
12.9 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from abc import abstractmethod | |
import numpy as np | |
import pandas as pd | |
import random | |
import util | |
import time | |
import os | |
import json | |
import constants as cst | |
import experiments_logging as l | |
import socket | |
# Radius handed to every generator created by the produce_* helpers; the
# generators use [-RADIUS, RADIUS] as the border of each feature.
RADIUS = 2
# Width of every cube placed by produce_cube_generator.
CUBE_WIDTH = 1
# Default number of rows of a generated data set.
ROWS = 6000
# Threshold used by generate_overlap_partition: an overlap trial adds a
# foreign feature when a uniform draw from [0, 1] falls below this value.
OVERLAP_PROBABILITY = 0.6
class CubeParameters:
    """Describe one cube: how many rows it holds and where it is located.

    ``loc`` is either ``None`` or a mapping from feature index to the
    parameters of the cube in that feature; ``CubesGenerator`` reads element
    0 of those parameters as the lower border and element 1 as the width.
    """

    def __init__(self, rows, loc=None):
        # Starts empty; nothing in this class fills it.
        self.subspaces = []
        self.loc = loc
        self.rows = rows
class DataGenerator:
    """State shared by the synthetic data generators in this module.

    Subclasses are expected to implement ``build`` and ``get_subspaces``.
    The class does not use ``ABCMeta``, so the ``@abstractmethod`` markers are
    not enforced: the class can still be instantiated and both methods then
    simply return ``None``.
    """

    def __init__(self, file_name, rel_feature_count, irr_feature_count, radius):
        total_features = rel_feature_count + irr_feature_count
        self.radius = radius
        self.feature_count = total_features
        self.irf = irr_feature_count
        self.file_name = file_name
        # One independent [lower, upper] pair per feature.
        self.dim_borders = [[-radius, radius] for _ in range(total_features)]
        self.subspaces = []

    def __repr__(self):
        return "(file_name={})".format(str(self.file_name))

    @abstractmethod
    def build(self):
        """Generate the data set (to be provided by subclasses)."""

    @abstractmethod
    def get_subspaces(self):
        """Return the feature subspaces (to be provided by subclasses)."""
class XorGenerator(DataGenerator):
    """Generator of data whose class depends on the signs of the relevant features.

    ``rf - 1`` "slave" features are drawn at random and one additional parity
    feature gets its sign from the number of positive slave features.
    ``offset`` is a ``(feature index, shift)`` pair: the feature at that index
    is drawn from a range widened by ``shift`` and moved by ``shift`` before
    the data is returned.
    """

    def __init__(self, rf, irf, radius, rows, sigma, file_name, distribution='uniform', offset=(0, 0)):
        super().__init__(file_name, rf, irf, radius)
        self.offset = offset
        self.distribution = distribution
        self.rows = rows
        self.slave_features = rf - 1
        self.rf = rf
        self.sigma = sigma
        # All relevant features together form the single subspace.
        self.subspaces = [list(range(rf))]

    def transform_to_gauss(self, a):
        """Replace every value of row ``a`` by a normal draw on the same side of zero.

        The mean is +1 for positive values and -1 otherwise.  The standard
        deviation is a quarter of the radius; for the offset feature the radius
        is first reduced or enlarged by the shift, depending on whether the
        value lies above the shift.
        """
        resampled = []
        for i, d in enumerate(a):
            centre = (d > 0) * 2 - 1
            if i == self.offset[0]:
                spread = (self.radius - ((d > self.offset[1]) * 2 - 1) * self.offset[1]) / 4
            else:
                spread = self.radius / 4
            resampled.append(np.random.normal(centre, spread))
        return resampled

    def get_subspaces(self):
        """Return the list of subspaces set up in the constructor."""
        return self.subspaces

    def build(self):
        """Generate the data set.

        Returns a tuple ``(data, cut_points, file_name)``.  The columns of
        ``data`` are the relevant features (slave features followed by the
        parity feature), the irrelevant features and the class label.
        ``cut_points`` holds ``[min, threshold, max]`` for each relevant
        feature, where the threshold is the shift for the offset feature and
        0 for every other one.
        """
        rows = self.rows
        radius = self.radius
        slave_count = self.slave_features
        has_slaves = slave_count > 0

        # The column that is shifted later is drawn from a range widened by the shift.
        shifted_column = np.random.uniform(-radius - self.offset[1], radius, (rows, 1))
        if has_slaves:
            r_dims = np.random.uniform(-radius, radius, (rows, slave_count - 1))
        else:
            r_dims = np.empty((rows, slave_count))
        r_dims = np.concatenate(
            (r_dims[:, :self.offset[0]], shifted_column, r_dims[:, self.offset[0]:]), axis=1)

        if has_slaves:
            # +1 for an odd number of positive slave features, -1 for an even one.
            signs = (np.sum(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(rows, 1)
            parity_dim = signs * np.random.uniform(0, radius, (rows, 1))
        else:
            parity_dim = np.empty((rows, slave_count))

        xor_cube = np.concatenate((r_dims, parity_dim), axis=1)
        if self.distribution == 'gauss':
            xor_cube = np.apply_along_axis(self.transform_to_gauss, 1, xor_cube)
        irr_dims = np.random.uniform(-radius, radius, (rows, self.irf))

        # Map every sign pattern (followed by its parity bit) to a label,
        # numbering the patterns from 1 in binary counting order.
        label_by_pattern = {}
        for pattern_index in range(2 ** slave_count):
            bits = [(pattern_index >> shift) & 1 for shift in reversed(range(slave_count))]
            key = "".join(str(bit) for bit in bits) + str(sum(bits) % 2)
            label_by_pattern[key] = pattern_index + 1

        sign_bits = np.array(r_dims > 0, dtype='int')
        parity_bits = (np.sum(r_dims > 0, axis=1) % 2).reshape(rows, 1)
        patterns = np.concatenate([sign_bits, parity_bits], axis=1)
        class_labels = np.apply_along_axis(
            lambda a: label_by_pattern["".join([str(d) for d in a])], 1, patterns)
        class_labels = class_labels.reshape([class_labels.shape[0], 1])

        xor_cube[:, self.offset[0]] = xor_cube[:, self.offset[0]] + self.offset[1]
        data = np.concatenate((xor_cube, irr_dims, class_labels), axis=1)
        if self.sigma:
            # Noise is added to every feature column; the label column gets zeros.
            noise = np.random.normal(0, self.sigma, (rows, slave_count + self.irf + 1))
            e = np.concatenate((noise, np.zeros((rows, 1))), axis=1)
            data = data + e

        cut_points = []
        for feature in range(slave_count + 1):
            threshold = 0 if feature != self.offset[0] else self.offset[1]
            cut_points.append([data[:, feature].min(), threshold, data[:, feature].max()])
        return data, cut_points, self.file_name
class CubesGenerator(DataGenerator):
    """Generator that fills the feature space with labelled cubes.

    Cubes are registered with ``add_cube_parameter``; ``build`` then draws the
    points of every cube.  A cube without location parameters gets label 0.
    """

    def __init__(self, rel_feature_count, irr_feature_count, radius, file_name, distribution='uniform'):
        super().__init__(file_name, rel_feature_count, irr_feature_count, radius)
        self.distribution = distribution
        self.rel_feature_count = rel_feature_count
        self.cube_parameters = []
        # Per relevant feature: the borders of all cubes registered so far.
        self.perf_disc_set = [set() for _ in range(rel_feature_count)]

    def add_cube_parameter(self, cube_param):
        """Register a cube and record its subspace and its borders.

        A ``loc`` of ``None`` is replaced by an empty dict on ``cube_param``.
        """
        if cube_param.loc is None:
            cube_param.loc = {}
        self.cube_parameters.append(cube_param)
        location_params = cube_param.loc

        subspace = list(location_params.keys())
        if subspace and subspace not in self.subspaces:
            self.subspaces.append(subspace)

        # perfect discretization for cubes
        for feat in range(self.rel_feature_count):
            if feat not in cube_param.loc.keys():
                continue
            dim_params = location_params[feat]
            self.perf_disc_set[feat].add(dim_params[0])
            self.perf_disc_set[feat].add(dim_params[0] + dim_params[1])

    def _draw_column(self, feat, location_params, points_count):
        """Draw the values of feature ``feat`` for one cube."""
        if feat not in location_params.keys():
            # The cube is unrestricted in this feature: use the whole range.
            return np.random.uniform(self.dim_borders[feat][0], self.dim_borders[feat][1], points_count)

        assert feat < self.rel_feature_count
        dim_params = location_params[feat]
        below = dim_params[0] < self.dim_borders[feat][0]
        if below or dim_params[0] + dim_params[1] > self.dim_borders[feat][1]:
            raise ValueError(
                "The cube with params {} does not fit in dim {}!".format(str(location_params), str(feat)))
        if self.distribution == 'uniform':
            return np.random.uniform(0, dim_params[1], points_count) + np.ones(points_count) * dim_params[0]
        # Any other distribution: normal draws centred in the cube.
        return np.random.normal(dim_params[0] + dim_params[1] / 2, dim_params[1] / 4, points_count)

    def build(self):
        """Generate the points of all registered cubes.

        Returns a tuple ``(data, cut_points, file_name)``; ``data`` has one
        column per feature followed by the label column, ``cut_points`` holds,
        per relevant feature, the sorted cube borders together with the
        minimum and maximum of the generated values.

        Raises ``ValueError`` when a cube does not fit into the borders of a
        feature.
        """
        cubes = []
        for cube_parameter in self.cube_parameters:
            location_params = cube_parameter.loc
            points_count = cube_parameter.rows
            # Label 0 marks a cube without location; others are numbered by position.
            label = 0 if len(location_params) == 0 else len(cubes) + 1

            cube = [self._draw_column(feat, location_params, points_count)
                    for feat in range(self.feature_count)]
            cube.append(np.ones(points_count) * label)
            cubes.append(cube)

        generated_data = np.concatenate([np.array(cube) for cube in cubes], axis=1).transpose()
        cut_points = []
        for i, borders in enumerate(self.perf_disc_set):
            observed = [min(generated_data[:, i]), max(generated_data[:, i])]
            cut_points.append(sorted(borders.union(observed)))
        return generated_data, cut_points, self.file_name

    def get_subspaces(self):
        """Return the subspaces collected from the registered cubes."""
        return self.subspaces
def generate_partition(rf, interactions):
    """Randomly split the feature indices ``0 .. rf - 1`` into groups.

    :param rf: number of features to distribute
    :param interactions: number of groups to create
    :return: list of ``interactions`` lists of feature indices; every index
        occurs in exactly one group.  For ``interactions == 1`` the single
        group holds all indices in ascending order, otherwise every group
        holds at least two indices.
    :raises ValueError: if ``interactions`` is smaller than 1, or if there are
        not enough features to give every group at least two of them
    """
    features = list(range(rf))
    if interactions == 1:
        return [features]

    min_size = 2
    if interactions < 1:
        raise ValueError("interactions must be at least 1, got " + str(interactions))
    if rf < interactions * min_size:
        raise ValueError(
            "cannot split " + str(rf) + " features into " + str(interactions)
            + " groups of at least " + str(min_size) + " features")

    random.shuffle(features)
    pivot = 0
    partition = []
    for i in range(interactions - 1):
        # Leave at least min_size features for each of the remaining groups.
        max_size = rf - pivot - (interactions - i - 1) * min_size
        size = random.randint(min_size, max_size)
        partition.append(features[pivot: pivot + size])
        pivot += size
    # The last group takes whatever is left.
    partition.append(features[pivot:])
    return partition
def generate_overlap_partition(rf, c):
    """Create a random partition of the features and let its groups overlap.

    A partition from ``generate_partition`` is extended: every group runs
    ``len(group) // 2`` trials, and each trial adds, with probability
    ``OVERLAP_PROBABILITY``, a randomly chosen feature that is not part of the
    original group.  The same feature may be added more than once.

    :param rf: number of features
    :param c: number of groups
    :return: list of ``c`` lists of feature indices
    """
    partition = generate_partition(rf, c)
    all_features = set(range(rf))
    additions = []
    for p in partition:
        # The groups are extended only after this loop, so the candidates of
        # a group are the same for all of its trials.
        others = list(all_features - set(p))
        add = []
        # at most half the group size in overlap trials
        for _ in range(int(len(p) / 2)):
            # A group that already holds every feature (e.g. c == 1) has no
            # candidates; drawing an index for it would raise ValueError.
            if random.uniform(0, 1) < OVERLAP_PROBABILITY and others:
                rand = random.randint(0, len(others) - 1)
                add.append(others[rand])
        additions.append(add)
    for p, add in zip(partition, additions):
        p.extend(add)
    return partition
def produce_random_generator(irf, file_name, rows=ROWS):
    """Create a generator without relevant features.

    The returned ``CubesGenerator`` has ``irf`` irrelevant features and a
    single cube without location parameters that holds all ``rows`` rows.
    """
    generator = CubesGenerator(0, irf, RADIUS, file_name)
    background = CubeParameters(rows)
    generator.add_cube_parameter(background)
    return generator
# todo type | |
def produce_cube_generator(rf, irf, interactions, cubes, file_name, rows=ROWS, distribution="uniform"):
    """Create a ``CubesGenerator`` with randomly placed cubes.

    The relevant features are partitioned into ``interactions`` groups and
    ``cubes`` cubes are placed in every group.  One more cube without location
    parameters is added at the end; all cubes get the same number of rows.
    """
    generator = CubesGenerator(rf, irf, RADIUS, file_name, distribution)
    # same number of records for each of the interactions * cubes + background
    rows_per_cube = int(rows / (interactions * cubes + 1))

    def random_location():
        # (lower border, width) of a cube in one feature
        return (random.uniform(0, 1) * (RADIUS * 2 - 1) - RADIUS, CUBE_WIDTH)

    def too_close(location, borders):
        # True if a border of the candidate is within 0.1 of a known border
        lower = location[0]
        upper = location[0] + location[1]
        return np.any([abs(lower - d) < 0.1 or abs(upper - d) < 0.1 for d in borders])

    for group in generate_partition(rf, interactions):
        for _ in range(cubes):
            location = {}
            for feature in group:
                candidate = random_location()
                # Redraw until the candidate keeps its distance from the
                # borders of the cubes registered so far in this feature.
                while too_close(candidate, generator.perf_disc_set[feature]):
                    candidate = random_location()
                location[feature] = candidate
            generator.add_cube_parameter(CubeParameters(rows_per_cube, location))

    generator.add_cube_parameter(CubeParameters(rows_per_cube))
    return generator
def produce_xor_generator(rf, irf, file_name, rows=ROWS, distribution='uniform', offset=(0, 0)):
    """Create an ``XorGenerator`` with radius ``RADIUS`` and a sigma of 0.1."""
    noise_sigma = 0.1
    return XorGenerator(rf, irf, RADIUS, rows, noise_sigma, file_name,
                        distribution=distribution, offset=offset)
def produce_all_data_generators():
    """Create a data generator for every data set that is not stored yet.

    Sets the module globals ``basedir`` and ``perf_disc_dir`` (read by
    ``store``), creates both directories if necessary, lets
    ``util.datasets_iterator`` call ``produce_dg`` and finally writes the
    subspaces of the new generators with ``write_perf_subspaces``.

    NOTE: file paths are built by plain string concatenation, so both
    directory constants have to end with a path separator.

    :return: list of the created generators
    :raises ValueError: if ``cst.REAL_DATASETS`` is not ``None`` or an
        interaction type has no generator
    """
    if cst.REAL_DATASETS is not None:
        raise ValueError("set REAL_DATASETS to None in order to produce artificial datasets!")

    data_generators = []
    global basedir, perf_disc_dir
    basedir = cst.DATA_DIR
    # makedirs also creates missing parents and does not fail if the directory
    # appears between a check and the creation.
    os.makedirs(basedir, exist_ok=True)
    perf_disc_dir = cst.PERFECT_DISCRETIZATIONS_DIR
    os.makedirs(perf_disc_dir, exist_ok=True)

    perf_subspaces = dict()

    def produce_dg(name, interaction_type, rows, rf, i, cubes, offset):
        # Skip data sets whose data file and cut file both exist already.
        if os.path.exists(basedir + name + ".csv") and os.path.exists(perf_disc_dir + 'cut_' + name + ".txt"):
            return

        if interaction_type == cst.InteractionType.CUBES:
            dg = produce_cube_generator(rf, cst.TOTAL_IRRELEVANT_FEATURES, i, cubes, name + ".csv", rows, "uniform")
        elif interaction_type == cst.InteractionType.BLOBS:
            dg = produce_cube_generator(rf, cst.TOTAL_IRRELEVANT_FEATURES, i, cubes, name + ".csv", rows, "gauss")
        elif interaction_type == cst.InteractionType.XORCUBES:
            dg = produce_xor_generator(rf, cst.TOTAL_IRRELEVANT_FEATURES, name + ".csv", rows, 'uniform', offset)
        elif interaction_type == cst.InteractionType.XORBLOBS:
            dg = produce_xor_generator(rf, cst.TOTAL_IRRELEVANT_FEATURES, name + ".csv", rows, 'gauss', offset)
        elif interaction_type == cst.InteractionType.UNIFORM:
            dg = produce_random_generator(cst.TOTAL_IRRELEVANT_FEATURES, name + ".csv", rows=rows)
        else:
            # One message string, so the error does not render as a tuple.
            raise ValueError("no implementation of data generator for " + str(interaction_type.name))

        perf_subspaces[name] = dg.get_subspaces()
        data_generators.append(dg)

    # presumably calls produce_dg once per configured data set - see util
    util.datasets_iterator(produce_dg)
    write_perf_subspaces(perf_subspaces)
    return data_generators
def write_perf_subspaces(perf_subspaces):
    """Merge ``perf_subspaces`` into the JSON file ``cst.PERFECT_SUBSPACES_JSON``.

    Entries already stored in the file are kept, but an entry passed in
    ``perf_subspaces`` replaces a stored entry of the same name, so that a
    regenerated data set is not described by stale subspaces.  The argument
    itself is not modified.

    :param perf_subspaces: mapping from data set name to its subspaces
    """
    all_perf_subspaces = {}
    if os.path.exists(cst.PERFECT_SUBSPACES_JSON):
        with open(cst.PERFECT_SUBSPACES_JSON, 'r') as psf:
            all_perf_subspaces.update(json.load(psf))
    # Applied last so that the new entries win over the stored ones.
    all_perf_subspaces.update(perf_subspaces)
    with open(cst.PERFECT_SUBSPACES_JSON, 'w') as psf:
        json.dump(all_perf_subspaces, psf)
def write_cut_file(name, disc_intervals):
    """Write the break points of every dimension to the text file ``name``.

    For each dimension a header line with the dimension index and the number
    of break points is written, followed by one break point per line (two
    decimals) and a separator line.
    """
    separator = '-------------------------------------\n'
    with open(name, 'w') as out:
        for dimension, break_points in enumerate(disc_intervals):
            out.write('dimension {} ({} bins)\n'.format(str(dimension), str(len(break_points))))
            out.writelines(format(break_point, '.2f') + '\n' for break_point in break_points)
            out.write(separator)
def store(data):
    """Write one built data set and its cut points to disk.

    ``data`` is the tuple returned by a generator's ``build``: the generated
    array, the break points per dimension and the file name.  The target
    directories are taken from the module globals ``basedir`` and
    ``perf_disc_dir``.
    """
    global basedir, perf_disc_dir
    file_name = data[2]
    frame = pd.DataFrame(data[0])
    frame.to_csv(basedir + file_name, sep=';', header=False, index=False, float_format='%.2f')
    cut_file = perf_disc_dir + 'cut_' + file_name.replace(".csv", ".txt")
    write_cut_file(cut_file, data[1])
if __name__ == '__main__':
    # Create all missing generators first, then build and store each data set.
    for generator in produce_all_data_generators():
        store(generator.build())