Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ipd_extended/data_generator.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
214 lines (181 sloc)
7.15 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import random | |
import time | |
import os | |
import json | |
import constants as cst | |
import experiments_logging as l | |
import socket | |
# Half-width of the data space: every feature value lies in [-RADIUS, RADIUS].
RADIUS = 2
# Edge length of each generated hypercube along every relevant feature.
CUBE_WIDTH = 1
# Total number of rows per generated dataset (split between cubes + background).
ROWS = 6000
# Chance that a partition element additionally borrows a feature from another
# part when building an overlapping partition.
OVERLAP_PROBABILITY = 0.6
class CubeParameters:
    """Description of one hypercube to generate.

    ``loc`` maps feature index -> (low, width); ``None`` or an empty dict
    denotes the background (label 0) cube.
    """

    def __init__(self, rows, loc=None):
        self.rows = rows        # number of data points drawn for this cube
        self.loc = loc          # placement per relevant feature, or None
        self.subspaces = []     # populated externally, if at all
class CubesGenerator:
    """Generates labelled data points: axis-aligned hypercubes plus uniform
    background noise inside [-radius, radius]^feature_count.

    The last column of the generated matrix is the cube label
    (0 = background, cubes are numbered from 1 in insertion order).
    """

    def __init__(self, feature_count, radius, file_name):
        self.file_name = file_name
        self.cube_parameters = []
        self.feature_count = feature_count
        # [low, high] borders of the data space, identical for every dimension
        self.dim_borders = [[-radius, radius] for _ in range(feature_count)]
        self.subspaces = []
        # Per-dimension set of "perfect" discretization cut points; the upper
        # border is always a cut point.
        self.perf_disc = [{border[1]} for border in self.dim_borders]

    def add_cube_parameter(self, cube_param):
        """Register a cube, recording its subspace and discretization cuts.

        cube_param.loc maps feature index -> (low, width); None is treated
        as an empty location (background cube).
        """
        if cube_param.loc is None:
            cube_param.loc = {}
        self.cube_parameters.append(cube_param)
        location_params = cube_param.loc
        subspace = list(location_params.keys())
        if subspace and subspace not in self.subspaces:
            self.subspaces.append(subspace)
        for feat, dim_params in location_params.items():
            # Perfect discretization: both cube edges become cut points,
            # except a low edge that coincides with the space border.
            # Fix: compare against this generator's own border instead of the
            # module-level RADIUS constant, so a non-default radius works.
            if dim_params[0] != self.dim_borders[feat][0]:
                self.perf_disc[feat].add(dim_params[0])
            self.perf_disc[feat].add(dim_params[0] + dim_params[1])

    def build(self):
        """Draw all registered cubes and return (data, file_name).

        Raises ValueError if a cube does not fit inside the space borders.
        """
        cubes = []
        for cube_parameter in self.cube_parameters:
            location_params = cube_parameter.loc
            points_count = cube_parameter.rows
            # Background cube gets label 0; real cubes are numbered from 1.
            label = 0 if len(location_params) == 0 else len(cubes) + 1
            cube = []
            for feat in range(self.feature_count):
                low, high = self.dim_borders[feat]
                if feat in location_params:
                    dim_params = location_params[feat]
                    if dim_params[0] < low or dim_params[0] + dim_params[1] > high:
                        raise ValueError(
                            "The cube with params " + str(location_params) + " does not fit in dim " + str(
                                feat) + "!")
                    # Uniform inside [low_edge, low_edge + width).
                    column = np.random.uniform(0, dim_params[1], points_count) \
                        + np.ones(points_count) * dim_params[0]
                else:
                    # Irrelevant feature: uniform over the whole space.
                    column = np.random.uniform(low, high, points_count)
                cube.append(column)
            class_labels = np.empty(points_count)
            class_labels.fill(label)
            cube.append(class_labels)
            cubes.append(cube)
        generated_data = np.concatenate([np.array(c) for c in cubes], axis=1).transpose()
        return generated_data, self.file_name

    def get_subspaces(self):
        """Return the list of distinct relevant-feature subspaces."""
        return self.subspaces

    def get_discs(self):
        """Return the perfect discretization cut points, sorted per dimension."""
        return [sorted(p) for p in self.perf_disc]
def generate_partition(rf, c):
    """Randomly split the feature indices 0..rf-1 into c disjoint parts.

    Every part contains at least two features; caller must ensure rf >= 2*c.
    Returns a list of c lists of feature indices.
    """
    indices = list(range(rf))
    random.shuffle(indices)
    min_size = 2  # renamed: was shadowing the builtin `min`
    pivot = 0
    partition = []
    for i in range(c - 1):
        # Leave enough indices so the remaining parts can still reach min_size.
        max_size = rf - pivot - (c - i - 1) * min_size
        t = random.randint(min_size, max_size)
        partition.append(indices[pivot: pivot + t])
        pivot += t
    partition.append(indices[pivot:])
    assert len(partition) == c
    return partition
def generate_overlap_partition(rf, c):
    """Like generate_partition, but parts may additionally borrow features
    from other parts, each with probability OVERLAP_PROBABILITY.

    Returns a list of c lists of feature indices (no longer disjoint).
    """
    partition = generate_partition(rf, c)
    additions = []
    for part in partition:
        add = []
        # Features not in this part; invariant for the inner loop, so hoisted.
        others = list({i for i in range(rf)} - set(part))
        # At most half of the part's size gives the number of overlap chances.
        # Loop variable renamed: `l` shadowed the module alias `l`.
        for _ in range(int(len(part) / 2)):
            if random.uniform(0, 1) < OVERLAP_PROBABILITY:
                rand = random.randint(0, rf - len(part) - 1)
                add.append(others[rand])
        additions.append(add)
    # Append only after all draws, so every draw is based on the original parts.
    for part, extra in zip(partition, additions):
        part.extend(extra)
    return partition
def produce_data_generator(rf, irf, c, type, name):
    """Build a CubesGenerator with rf relevant + irf irrelevant features.

    type selects the partition of the relevant features over the c cubes:
    'c' = complete, 'i' = incomplete, 'io' = incomplete overlapping.
    Raises ValueError for any other type.
    """
    dg = CubesGenerator(rf + irf, RADIUS, name)
    # Each cube and the background get an equal share of the rows.
    rows_per_cube = int(ROWS / (c + 1))
    if type == 'c':
        # Complete: every cube spans all relevant features.
        partition = [range(rf) for _ in range(c)]
    elif type == 'i':
        partition = generate_partition(rf, c)
    elif type == 'io':
        partition = generate_overlap_partition(rf, c)
    else:
        raise ValueError("no such type!")
    for part in partition:
        # Random low edge so the cube of width CUBE_WIDTH fits in the space.
        location = {feat: (random.uniform(0, 1) * (RADIUS * 2 - 1) - RADIUS, CUBE_WIDTH)
                    for feat in part}
        dg.add_cube_parameter(CubeParameters(rows_per_cube, location))
    # Background noise cube (no location -> label 0).
    dg.add_cube_parameter(CubeParameters(rows_per_cube))
    return dg
def produce_all_data_generators():
    """Create the configured grid of data generators.

    Writes the perfect discretization cut file per dataset and one JSON file
    with the perfect subspaces; returns the list of generators. Also sets the
    module-global `basedir`, which store() reads later.
    """
    data_generators = []
    global basedir
    basedir = cst.DATA_DIR
    # makedirs(exist_ok=True) is race-free and also creates missing parents.
    os.makedirs(basedir, exist_ok=True)
    perf_disc_dir = cst.PERFECT_DISCRETIZATIONS_DIR
    os.makedirs(perf_disc_dir, exist_ok=True)
    perf_subspaces_file = cst.PERFECT_SUBSPACES_JSON
    perf_subspaces = dict()
    perf_discs = dict()
    # Restricted sweep; the full experiment would use rf in 2..30, c in 1..10
    # and all three types ['c', 'i', 'io'].
    for rf in range(2, 3):
        for c in range(3, 4):
            for type in ['c']:
                # Incomplete partitions need at least 2 relevant features per cube.
                if (c == 1 or rf / c < 2) and type != 'c':
                    continue
                name = 'cubes_' + '{0:02d}'.format(rf) + '_' \
                       + '{0:02d}'.format(c) + '_' \
                       + type + '.csv'
                dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name)
                perf_discs[name] = dg.get_discs()
                perf_subspaces[name] = dg.get_subspaces()
                data_generators.append(dg)
    for name, discs in perf_discs.items():
        write_cut_file(perf_disc_dir + 'cut_' + name.replace('csv', 'txt'), discs)
    with open(perf_subspaces_file, 'w') as psf:
        json.dump(perf_subspaces, psf)
    return data_generators
def write_cut_file(name, disc_intervals):
    """Write one 'cut' file: for every dimension, a header line, then its
    break points (one per line, 1 decimal place), then a separator line."""
    separator = '-------------------------------------\n'
    with open(name, 'w') as out:
        for dim, points in enumerate(disc_intervals):
            out.write('dimension ' + str(dim) + ' (' + str(len(points)) + ' bins)\n')
            for break_point in points:
                out.write(format(break_point, '.1f') + '\n')
            out.write(separator)
def store(data):
    """Persist one (matrix, file_name) pair — as returned by
    CubesGenerator.build() — as a ';'-separated CSV under `basedir`."""
    global basedir
    matrix, file_name = data
    frame = pd.DataFrame(matrix)
    frame.to_csv(basedir + file_name, sep=';', header=False, index=False, float_format='%.2f')
if __name__ == '__main__':
    # Generate every configured dataset and write it to disk.
    for generator in produce_all_data_generators():
        store(generator.build())