Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ipd_extended/data_generator.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
210 lines (172 sloc)
6.88 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import random | |
import util | |
import time | |
import os | |
import json | |
import constants as cst | |
import experiments_logging as l | |
import socket | |
# Half-width of the data space: produce_data_generator passes it to
# CubesGenerator, which makes every dimension span [-RADIUS, RADIUS].
RADIUS = 2
# Extent of a generated cube along each dimension it is constrained in.
CUBE_WIDTH = 1
# Row budget of one data set; it is divided evenly between the cubes and
# the background in produce_data_generator.
ROWS = 6000
# Probability, per attempt, that generate_overlap_partition adds a foreign
# feature to a subspace.
OVERLAP_PROBABILITY = 0.6
class CubeParameters:
    """Plain record describing one group of points to generate.

    Attributes are set directly from the constructor arguments:

    * ``rows`` -- number of points to draw for this group.
    * ``loc`` -- mapping ``feature index -> (lower bound, width)`` for the
      dimensions in which the group is confined; ``None`` (the default)
      means no location was given.
    * ``subspaces`` -- starts as an empty list; nothing in this class
      fills it.
    """

    def __init__(self, rows, loc=None):
        self.subspaces = []
        self.loc = loc
        self.rows = rows
class CubesGenerator:
    """Generate labelled synthetic data made of axis-aligned cubes.

    The data space has ``rel_feature_count + irr_feature_count`` dimensions,
    each spanning ``[-radius, radius]``.  Cubes may only be constrained in
    the first ``rel_feature_count`` ("relevant") dimensions; in every other
    dimension their points are uniform over the whole range.

    Besides the data, the generator tracks
    * ``subspaces`` -- the distinct feature lists the cubes are confined in;
    * ``perf_disc`` -- per relevant feature, the set of cut points that
      separate the cubes (the "perfect discretization").
    """

    def __init__(self, rel_feature_count, irr_feature_count, radius, file_name):
        """Set up an empty generator.

        :param rel_feature_count: number of leading features cubes may be
            constrained in.
        :param irr_feature_count: number of additional, always-uniform features.
        :param radius: every dimension spans ``[-radius, radius]``.
        :param file_name: name handed back unchanged by :meth:`build`.
        """
        self.rel_feature_count = rel_feature_count
        self.file_name = file_name
        self.cube_parameters = []
        self.feature_count = rel_feature_count + irr_feature_count
        self.dim_borders = [[-radius, radius] for _ in range(self.feature_count)]
        self.subspaces = []
        # Every relevant dimension starts with its upper border as only cut point.
        self.perf_disc = [{border[1]} for border in self.dim_borders[:rel_feature_count]]

    def __repr__(self):
        return 'CubesGenerator(file_name={!s}, rel_feature_count={!s}, feature_count={!s})'.format(
            self.file_name, self.rel_feature_count, self.feature_count)

    def add_cube_parameter(self, cube_param):
        """Register a cube and update subspaces and the perfect discretization.

        :param cube_param: object with ``rows`` and ``loc`` attributes.  A
            ``loc`` of ``None`` is replaced *on the passed object* by an empty
            dict, which :meth:`build` treats as background (label 0).
        """
        if cube_param.loc is None:
            cube_param.loc = {}
        self.cube_parameters.append(cube_param)
        location_params = cube_param.loc

        subspace = list(location_params)
        if subspace and subspace not in self.subspaces:
            self.subspaces.append(subspace)

        # perfect discretization
        for feat in range(self.rel_feature_count):
            if feat not in location_params:
                continue
            dim_params = location_params[feat]
            lower = dim_params[0]
            upper = dim_params[0] + dim_params[1]
            # The lower border of the space itself is not a cut point.  Compare
            # against this generator's own border (not the module constant) so
            # that generators built with a different radius stay correct.
            if lower != self.dim_borders[feat][0]:
                self.perf_disc[feat].add(lower)
            self.perf_disc[feat].add(upper)

    def build(self):
        """Draw the points of all registered cubes.

        :return: ``(data, file_name)`` where ``data`` is a 2-D array with one
            row per point, ``feature_count`` feature columns and a final
            label column.  Unconstrained cubes get label 0; any other cube
            gets its 1-based position among the registered cubes.
        :raises ValueError: if a cube does not fit into the data space.
        """
        cubes = []
        for cube_parameter in self.cube_parameters:
            location_params = cube_parameter.loc
            points_count = cube_parameter.rows
            label = 0 if len(location_params) == 0 else len(cubes) + 1

            cube = []
            for feat in range(self.feature_count):
                low_border, high_border = self.dim_borders[feat]
                if feat in location_params:
                    assert feat < self.rel_feature_count
                    dim_params = location_params[feat]
                    if dim_params[0] < low_border or dim_params[0] + dim_params[1] > high_border:
                        raise ValueError(
                            "The cube with params " + str(location_params)
                            + " does not fit in dim " + str(feat) + "!")
                    # Uniform over [lower, lower + width).
                    column = np.random.uniform(0, dim_params[1], points_count) + dim_params[0]
                else:
                    column = np.random.uniform(low_border, high_border, points_count)
                cube.append(column)

            cube.append(np.full(points_count, label, dtype=float))
            cubes.append(cube)

        # Each cube is (columns x rows); join along rows, then put rows first.
        generated_data = np.concatenate([np.array(cube) for cube in cubes], axis=1).transpose()
        return generated_data, self.file_name

    def get_subspaces(self):
        """Return the list of distinct subspaces (feature lists) seen so far."""
        return self.subspaces

    def get_discs(self):
        """Return, per relevant feature, its cut points in ascending order."""
        return [sorted(p) for p in self.perf_disc]
def generate_partition(rf, c):
    """Randomly split the features ``0 .. rf-1`` into ``c`` disjoint groups.

    The features are shuffled and cut into consecutive slices.  Each of the
    first ``c - 1`` slices gets a random size of at least two, chosen so
    that two features remain for every slice still to come; the last slice
    takes whatever is left.

    :param rf: number of features to distribute.
    :param c: number of groups.
    :return: list of ``c`` lists of feature indices.
    """
    min_size = 2
    features = list(range(rf))
    random.shuffle(features)

    groups = []
    start = 0
    # `pending` counts the groups that still have to be served afterwards.
    for pending in range(c - 1, 0, -1):
        largest = rf - start - pending * min_size
        size = random.randint(min_size, largest)
        groups.append(features[start:start + size])
        start += size
    groups.append(features[start:])

    assert len(groups) == c
    return groups
def generate_overlap_partition(rf, c):
    """Build a random partition of the features and let its groups overlap.

    Starting from ``generate_partition(rf, c)``, every group gets up to
    ``len(group) // 2`` attempts; each attempt succeeds with probability
    ``OVERLAP_PROBABILITY`` and then appends one randomly chosen feature
    that is not part of the original group.  Features are drawn with
    replacement, so the same foreign feature may be appended more than once.

    :param rf: number of features to distribute.
    :param c: number of groups.
    :return: list of ``c`` lists of feature indices (groups may overlap).
    """
    partition = generate_partition(rf, c)
    all_features = set(range(rf))

    additions = []
    for group in partition:
        # Features the group does not own yet; fixed for all its attempts
        # because the groups are only extended after this loop.
        candidates = list(all_features - set(group))
        extra = []
        # at most a half of the partition times of possibility of overlap
        for _ in range(len(group) // 2):
            # A group that already holds every feature has nothing to borrow;
            # without the guard randint(0, -1) would raise ValueError.
            if random.uniform(0, 1) < OVERLAP_PROBABILITY and candidates:
                extra.append(candidates[random.randint(0, len(candidates) - 1)])
        additions.append(extra)

    for group, extra in zip(partition, additions):
        group.extend(extra)
    return partition
def produce_data_generator(rf, irf, c, type, file_name):
    """Create a CubesGenerator with ``c`` random cubes and one background.

    :param rf: number of relevant features.
    :param irf: number of irrelevant features.
    :param c: number of cubes.
    :param type: how cubes are assigned to subspaces:
        ``'c'`` -- every cube is confined in all relevant features,
        ``'i'`` -- cubes live in disjoint subspaces (generate_partition),
        ``'io'`` -- cubes live in possibly overlapping subspaces
        (generate_overlap_partition).
    :param file_name: file name stored in the generator.
    :return: the configured CubesGenerator.
    :raises ValueError: if ``type`` is none of the values above.
    """
    dg = CubesGenerator(rf, irf, RADIUS, file_name)
    # same number of records for each of the cubes + background
    cube_rows = int(ROWS / (c + 1))

    if type == 'c':
        partition = [range(rf) for _ in range(c)]
    elif type == 'i':
        partition = generate_partition(rf, c)
    elif type == 'io':
        partition = generate_overlap_partition(rf, c)
    else:
        raise ValueError("no such type!")

    # Lower bounds are drawn from [-RADIUS, RADIUS - CUBE_WIDTH] so that a
    # cube of width CUBE_WIDTH always fits into the data space.
    lower_bound_span = RADIUS * 2 - CUBE_WIDTH
    for subspace in partition:
        location = {
            feat: (random.uniform(0, 1) * lower_bound_span - RADIUS, CUBE_WIDTH)
            for feat in subspace
        }
        dg.add_cube_parameter(CubeParameters(cube_rows, location))

    # background: a cube without location constraints
    dg.add_cube_parameter(CubeParameters(cube_rows))
    return dg
def produce_all_data_generators():
    """Create all data generators and persist their ground truth.

    Side effects:
    * sets the module-level ``basedir`` (read later by ``store``) to
      ``cst.DATA_DIR`` and makes sure that directory exists;
    * makes sure ``cst.PERFECT_DISCRETIZATIONS_DIR`` exists and writes one
      ``cut_<name>.txt`` file per generator into it;
    * dumps the perfect subspaces of all generators as JSON to
      ``cst.PERFECT_SUBSPACES_JSON``.

    :return: list of the created CubesGenerator objects.
    """
    global basedir
    basedir = cst.DATA_DIR
    # makedirs(exist_ok=True) avoids the check-then-create race and also
    # creates missing parent directories.
    os.makedirs(basedir, exist_ok=True)

    perf_disc_dir = cst.PERFECT_DISCRETIZATIONS_DIR
    os.makedirs(perf_disc_dir, exist_ok=True)

    perf_subspaces_file = cst.PERFECT_SUBSPACES_JSON
    data_generators = []
    perf_subspaces = dict()
    perf_discs = dict()

    def produce_dg(name, rf, c, type):
        # NOTE: a "skip if the data set already exists on disk" check used
        # to sit here; it is disabled, so every data set is regenerated.
        dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name + ".csv")
        perf_discs[name] = dg.get_discs()
        perf_subspaces[name] = dg.get_subspaces()
        data_generators.append(dg)

    # presumably calls produce_dg once per parameter combination -- see util
    util.collect_params(produce_dg)

    for name, discs in perf_discs.items():
        write_cut_file(perf_disc_dir + 'cut_' + name + ".txt", discs)
    with open(perf_subspaces_file, 'w') as psf:
        json.dump(perf_subspaces, psf)
    return data_generators
def write_cut_file(name, disc_intervals):
    """Write the cut points of every dimension to a text file.

    For each dimension the file holds a header line with the dimension
    index and the number of cut points, one line per cut point (formatted
    with one decimal place) and a closing separator line.

    :param name: path of the file to (over)write.
    :param disc_intervals: sequence with one collection of cut points per
        dimension.
    """
    with open(name, 'w') as out:
        for dim, cut_points in enumerate(disc_intervals):
            out.write('dimension {} ({} bins)\n'.format(dim, len(cut_points)))
            for cut in cut_points:
                out.write(format(cut, '.1f') + '\n')
            out.write('-------------------------------------\n')
def store(data):
    """Write a generated data set as a CSV file below ``basedir``.

    :param data: pair ``(matrix, file_name)`` as returned by
        ``CubesGenerator.build``.  The file has no header and no index,
        uses ``;`` as separator and two decimal places for floats.

    Relies on the module-level ``basedir`` that
    ``produce_all_data_generators`` sets.
    """
    matrix = data[0]
    target = basedir + data[1]
    frame = pd.DataFrame(matrix)
    frame.to_csv(target, sep=';', header=False, index=False, float_format='%.2f')
if __name__ == '__main__':
    # Script entry point: create every data generator, then draw each data
    # set and write it to disk.
    for generator in produce_all_data_generators():
        store(generator.build())