# ipd_extended/data_generator.py -- 303 lines (239 sloc), 10.2 KB
# NOTE: this file was captured from a GitHub file view; the page chrome that preceded the code
# (permalink, branch-name and bidirectional-Unicode notices) is not part of the source.
import json
import os
import random
import socket
import time
from abc import ABC, abstractmethod

import numpy as np
import pandas as pd

import constants as cst
import experiments_logging as l
import util
# Radius handed to the generators by the factory functions in this module; a generator
# spans every dimension over [-radius, radius].
RADIUS = 2
# Edge length of every cube created by produce_cube_generator.
CUBE_WIDTH = 1
# Row budget of one data set: the XOR generator samples exactly this many rows, the cube
# factory splits it evenly between all cubes plus one background block.
ROWS = 6000
# Chance, per overlap attempt in generate_overlap_partition, that a foreign feature is
# added to a part.
OVERLAP_PROBABILITY = 0.6
class CubeParameters:
    """Plain description of one block of points to be sampled by a cubes generator.

    Attributes:
        rows: number of points to sample for this block.
        loc: mapping ``feature index -> (start, width)`` restricting the block in the
            listed dimensions, or None when no location was given.
        subspaces: list that starts out empty; nothing in this module fills it.
    """

    def __init__(self, rows, loc=None):
        self.rows, self.loc, self.subspaces = rows, loc, []
class DataGenerator(ABC):
    """Abstract base of the synthetic data generators.

    Deriving from ``ABC`` is what makes the ``@abstractmethod`` markers below effective:
    without an ``ABCMeta`` metaclass the decorator is silently ignored and an incomplete
    generator could be instantiated.

    Attributes:
        radius: half-width of the data space.
        feature_count: total number of features (relevant + irrelevant).
        irf: number of irrelevant features.
        file_name: name of the file the generated data is meant for.
        dim_borders: ``[-radius, radius]`` for every feature.
        subspaces: list of interacting feature groups, empty at construction.
        perf_disc: per relevant feature, a set of cut points that initially holds only
            the upper border ``radius``.
    """

    def __init__(self, file_name, rel_feature_count, irr_feature_count, radius):
        self.radius = radius
        self.feature_count = rel_feature_count + irr_feature_count
        self.irf = irr_feature_count
        self.file_name = file_name
        self.dim_borders = [[-radius, radius] for _ in range(self.feature_count)]
        self.subspaces = []
        # Only the relevant features (the leading ones) get a discretization entry.
        self.perf_disc = [{upper} for _, upper in self.dim_borders[:rel_feature_count]]

    def __repr__(self):
        return "(file_name={})".format(str(self.file_name))

    @abstractmethod
    def build(self):
        """Generate the data set."""
        ...

    @abstractmethod
    def get_discs(self):
        """Return the perfect discretization of the relevant features."""
        ...

    @abstractmethod
    def get_subspaces(self):
        """Return the groups of interacting features."""
        ...
class XorGenerator(DataGenerator):
    """Generator of an XOR-like interaction between ``rf`` relevant features.

    ``rf - 1`` features are sampled freely; one extra "parity" feature gets its sign
    from the number of positive values among the free features. ``irf`` irrelevant
    uniform features are appended.
    """

    def __init__(self, rf, irf, radius, rows, sigma, file_name):
        super().__init__(file_name, rf, irf, radius)
        self.rows = rows
        # Number of freely sampled relevant features (all but the parity feature).
        self.slave_features = rf - 1
        self.sigma = sigma
        # Every relevant dimension is cut at 0 and at the upper border.
        self.perf_disc = [[0, radius] for _ in self.dim_borders[:rf]]
        # All relevant features form a single interacting subspace.
        self.subspaces = [list(range(rf))]

    def get_discs(self):
        """Return the cut points ``[0, radius]`` of every relevant feature."""
        return self.perf_disc

    def get_subspaces(self):
        """Return the single subspace made of all relevant features."""
        return self.subspaces

    def build(self):
        """Sample the data set.

        Returns:
            A tuple ``(data, file_name)``. The columns of ``data`` are the free relevant
            features, the parity feature, the irrelevant features and finally the class
            label. Labels run from 1 to ``2 ** (rf - 1)`` and encode which free features
            are positive (first feature is the most significant bit). If ``sigma`` is
            truthy, Gaussian noise with that standard deviation is added to every
            column except the label.
        """
        rows, free = self.rows, self.slave_features
        if free > 0:
            r_dims = np.random.uniform(-self.radius, self.radius, (rows, free))
            positives = np.sum(r_dims > 0, axis=1)
            # +1 for an even number of positive free features, -1 for an odd number.
            sign = (1 - 2 * (positives % 2)).reshape(rows, 1)
            parity_dim = sign * np.random.uniform(0, self.radius, (rows, 1))
        else:
            r_dims = np.empty((rows, free))
            parity_dim = np.empty((rows, free))
        irr_dims = np.random.uniform(-self.radius, self.radius, (rows, self.irf))

        # Read each row's sign pattern as a binary number; +1 keeps labels 1-based.
        bits = (r_dims > 0).astype(int)
        weights = 2 ** np.arange(free - 1, -1, -1)
        class_labels = (bits.dot(weights) + 1).reshape(rows, 1)

        data = np.concatenate((r_dims, parity_dim, irr_dims, class_labels), axis=1)
        if self.sigma:
            # Noise width follows the actual feature columns, so the degenerate case
            # without a parity column no longer fails with a shape mismatch.
            noise = np.random.normal(0, self.sigma, (rows, data.shape[1] - 1))
            data = data + np.concatenate((noise, np.zeros((rows, 1))), axis=1)
        return data, self.file_name
class CubesGenerator(DataGenerator):
    """Generator of axis-parallel cubes of points placed in the relevant features."""

    def __init__(self, rel_feature_count, irr_feature_count, radius, file_name):
        super().__init__(file_name, rel_feature_count, irr_feature_count, radius)
        self.rel_feature_count = rel_feature_count
        self.cube_parameters = []

    def add_cube_parameter(self, cube_param):
        """Register a block of points and update subspaces and cut points.

        Args:
            cube_param: object with ``rows`` and ``loc`` attributes, where ``loc`` maps
                a feature index to ``(start, width)``. A ``loc`` of None is replaced by
                an empty dict on the passed object (a block without restrictions).
        """
        if cube_param.loc is None:
            cube_param.loc = {}
        self.cube_parameters.append(cube_param)
        location_params = cube_param.loc

        subspace = list(location_params)
        if subspace and subspace not in self.subspaces:
            self.subspaces.append(subspace)

        # Perfect discretization: both cube faces are cut points of the dimension.
        for feat in range(self.rel_feature_count):
            if feat not in location_params:
                continue
            dim_params = location_params[feat]
            start, width = dim_params[0], dim_params[1]
            # The lower border of the data space is not a cut point. Compare against
            # this generator's own border rather than the module constant so that
            # generators built with another radius are handled correctly.
            if start != self.dim_borders[feat][0]:
                self.perf_disc[feat].add(start)
            self.perf_disc[feat].add(start + width)

    def build(self):
        """Sample all registered blocks.

        Returns:
            A tuple ``(data, file_name)``; ``data`` has one row per point with all
            feature columns followed by the class label. A block without location gets
            label 0, any other block gets its 1-based position among the registered
            blocks.

        Raises:
            ValueError: if a cube does not fit between the borders of a dimension.
        """
        cubes = []
        for cube_parameter in self.cube_parameters:
            location_params = cube_parameter.loc
            points_count = cube_parameter.rows
            label = len(cubes) + 1 if location_params else 0

            cube = []
            for feat in range(self.feature_count):
                low, high = self.dim_borders[feat]
                if feat in location_params:
                    assert feat < self.rel_feature_count
                    dim_params = location_params[feat]
                    start, width = dim_params[0], dim_params[1]
                    if start < low or start + width > high:
                        raise ValueError(
                            "The cube with params " + str(location_params) + " does not fit in dim " + str(
                                feat) + "!")
                    column = np.random.uniform(0, width, points_count) + start
                else:
                    # Unrestricted dimension: spread over the whole data space.
                    column = np.random.uniform(low, high, points_count)
                cube.append(column)

            cube.append(np.full(points_count, float(label)))
            cubes.append(cube)

        generated_data = np.concatenate([np.array(cube) for cube in cubes], axis=1).transpose()
        return generated_data, self.file_name

    def get_subspaces(self):
        """Return the distinct feature groups of the registered cubes."""
        return self.subspaces

    def get_discs(self):
        """Return the sorted cut points of every relevant feature."""
        return [sorted(p) for p in self.perf_disc]
def generate_partition(rf, c):
    """Randomly split the features ``0 .. rf - 1`` into ``c`` disjoint parts.

    The features are shuffled and cut into consecutive slices. Every slice produced by
    a cut holds at least two features, and enough features are reserved so that the
    following slices can reach that size too.

    Args:
        rf: number of features to distribute.
        c: number of parts.

    Returns:
        A list of ``c`` lists of feature indices.

    Raises:
        ValueError: if more than one part is requested and there are fewer than two
            features per part (previously reported by ``random.randint`` as an
            "empty range" error).
    """
    min_size = 2
    if c > 1 and rf < min_size * c:
        raise ValueError(
            "cannot split " + str(rf) + " features into " + str(c)
            + " parts of at least " + str(min_size) + " features")

    features = list(range(rf))
    random.shuffle(features)
    pivot = 0
    partition = []
    for i in range(c - 1):
        # Leave at least min_size features for each of the parts still to come.
        max_size = rf - pivot - (c - i - 1) * min_size
        size = random.randint(min_size, max_size)
        partition.append(features[pivot:pivot + size])
        pivot += size
    partition.append(features[pivot:])
    assert len(partition) == c
    return partition
def generate_overlap_partition(rf, c):
    """Build a random partition of the features whose parts may overlap.

    Starts from ``generate_partition(rf, c)``. Each part then gets up to
    ``len(part) // 2`` overlap attempts; each attempt succeeds with probability
    ``OVERLAP_PROBABILITY`` and appends one randomly chosen feature that does not
    belong to the part. The same foreign feature may be drawn more than once.

    Args:
        rf: number of features to distribute.
        c: number of parts.

    Returns:
        A list of ``c`` lists of feature indices.
    """
    partition = generate_partition(rf, c)
    all_features = set(range(rf))

    # Collect all additions first so that every part is extended based on the
    # original, disjoint partition.
    additions = []
    for part in partition:
        others = list(all_features - set(part))
        extra = []
        for _ in range(len(part) // 2):
            # A part that already covers every feature has nothing to borrow; the
            # unguarded draw used to fail in random.randint(0, -1).
            if random.uniform(0, 1) < OVERLAP_PROBABILITY and others:
                extra.append(others[random.randint(0, len(others) - 1)])
        additions.append(extra)

    for part, extra in zip(partition, additions):
        part.extend(extra)
    return partition
def produce_cube_generator(rf, irf, interactions, type, cubes, file_name):
    """Create a CubesGenerator with randomly placed cubes and one background block.

    Args:
        rf: number of relevant features.
        irf: number of irrelevant features.
        interactions: number of feature groups (subspaces) that carry cubes.
        type: how the feature groups are formed: ``'c'`` uses all relevant features
            for every group, ``'i'`` uses a disjoint random partition, ``'io'`` a
            random partition with overlaps.
        cubes: number of cubes per feature group.
        file_name: file name stored in the generator.

    Returns:
        The configured CubesGenerator.

    Raises:
        ValueError: for an unknown ``type``.
    """
    dg = CubesGenerator(rf, irf, RADIUS, file_name)
    # Same number of records for each of the interactions * cubes cubes + background.
    cube_rows = int(ROWS / (interactions * cubes + 1))

    if type == 'c':
        partition = [range(rf) for _ in range(interactions)]
    elif type == 'i':
        partition = generate_partition(rf, interactions)
    elif type == 'io':
        partition = generate_overlap_partition(rf, interactions)
    else:
        raise ValueError("no such type!")

    # The start may range over the space minus one cube width, so the cube stays inside
    # [-RADIUS, RADIUS]. (The width was previously hard-coded as 1 in this formula.)
    start_range = RADIUS * 2 - CUBE_WIDTH
    for features in partition:
        for _ in range(cubes):
            location = {
                feat: (random.uniform(0, 1) * start_range - RADIUS, CUBE_WIDTH)
                for feat in features
            }
            dg.add_cube_parameter(CubeParameters(cube_rows, location))
    # Background block without location restrictions.
    dg.add_cube_parameter(CubeParameters(cube_rows))
    return dg
def produce_xor_generator(rf, irf, file_name):
    """Create an XorGenerator using the module-wide RADIUS and ROWS and a noise sigma of 0.1."""
    return XorGenerator(
        rf=rf,
        irf=irf,
        radius=RADIUS,
        rows=ROWS,
        sigma=0.1,
        file_name=file_name,
    )
def produce_all_data_generators():
    """Create generators for all data sets that do not exist on disk yet.

    Side effects:
        * sets the module global ``basedir`` (read later by ``store``),
        * creates the data and perfect-discretization directories if missing,
        * writes one cut file per new data set and merges the new subspaces into the
          perfect-subspaces JSON file.

    Returns:
        The list of generators for the missing data sets.

    Raises:
        ValueError: if a requested interaction type has no generator.
    """
    global basedir
    basedir = cst.DATA_DIR
    perf_disc_dir = cst.PERFECT_DISCRETIZATIONS_DIR
    for directory in (basedir, perf_disc_dir):
        # makedirs also creates missing parent directories and tolerates existing ones.
        os.makedirs(directory, exist_ok=True)

    data_generators = []
    perf_subspaces = dict()
    perf_discs = dict()

    # NOTE(review): presumably util.collect_params invokes this callback once per
    # parameter combination -- confirm against util.
    def produce_dg(name, interaction_type, rf, i, type, cubes):
        # Paths are built by plain concatenation, so both directory constants are
        # assumed to end with a path separator -- TODO confirm.
        if os.path.exists(basedir + name + ".csv") and os.path.exists(perf_disc_dir + 'cut_' + name + ".txt"):
            return
        if interaction_type == cst.InteractionType.CUBES:
            dg = produce_cube_generator(rf, cst.IRRELEVANT_FEATURES, i, type, cubes, name + ".csv")
        elif interaction_type == cst.InteractionType.XOR:
            dg = produce_xor_generator(rf, cst.IRRELEVANT_FEATURES, name + ".csv")
        else:
            # One formatted string: two positional arguments would make the exception
            # message render as a tuple.
            raise ValueError("no implementation of data generator for " + str(interaction_type.name))
        perf_discs[name] = dg.get_discs()
        perf_subspaces[name] = dg.get_subspaces()
        data_generators.append(dg)

    util.collect_params(produce_dg)

    for name, discs in perf_discs.items():
        write_cut_file(perf_disc_dir + 'cut_' + name + ".txt", discs)
    write_perf_subspaces(perf_subspaces)
    return data_generators
def write_perf_subspaces(perf_subspaces):
    """Merge the given subspaces into the perfect-subspaces JSON file.

    Entries already stored in the file are kept; entries passed in win on a name clash,
    because they describe data that has just been regenerated (previously the stale
    entry from the file overwrote the fresh one). The argument itself is not modified.

    Args:
        perf_subspaces: mapping from data set name to its list of subspaces.
    """
    merged = {}
    if os.path.exists(cst.PERFECT_SUBSPACES_JSON):
        with open(cst.PERFECT_SUBSPACES_JSON, 'r') as psf:
            merged.update(json.load(psf))
    merged.update(perf_subspaces)
    with open(cst.PERFECT_SUBSPACES_JSON, 'w') as psf:
        json.dump(merged, psf)
def write_cut_file(name, disc_intervals):
    """Write the cut points of every dimension to a text file.

    For each dimension a header line ``dimension <index> (<count> bins)`` is written,
    followed by one cut point per line (one decimal place) and a dashed separator line.

    Args:
        name: path of the file to (over)write.
        disc_intervals: one collection of cut points per dimension.
    """
    separator = '-------------------------------------\n'
    with open(name, 'w') as out:
        for dim, break_points in enumerate(disc_intervals):
            header = 'dimension ' + str(dim) + ' (' + str(len(break_points)) + ' bins)\n'
            out.write(header)
            out.writelines(format(point, '.1f') + '\n' for point in break_points)
            out.write(separator)
def store(data):
    """Write a generated data set as a ';'-separated CSV without header or index.

    Args:
        data: pair ``(matrix, file_name)``; the file is written to the module global
            ``basedir`` concatenated with ``file_name``, floats with two decimals.
    """
    matrix, name = data[0], data[1]
    frame = pd.DataFrame(matrix)
    frame.to_csv(basedir + name, sep=';', header=False, index=False, float_format='%.2f')
if __name__ == '__main__':
    # Build every data set that is still missing and persist it right away.
    for generator in produce_all_data_generators():
        store(generator.build())