Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
added concurrent data generation
- Loading branch information
Tatiana Dembelova
committed
Oct 10, 2017
1 parent
ddd6a67
commit d8351be
Showing
3 changed files
with
362 additions
and
57 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,67 +1,214 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import random | ||
import time | ||
import os | ||
import json | ||
import experiments_logging as l | ||
|
||
RADIUS = 2 | ||
CUBE_WIDTH = 1 | ||
ROWS = 6000 | ||
OVERLAP_PROBABILITY = 0.6 | ||
# BASE = '/Users/tatyanadembelova/Documents/study/thesis/ipd_extended/' | ||
BASE = '/local/tmp/ipd_extended_experiments/' | ||
|
||
class DataGenerator: | ||
def __init__(self, dims_count, radius): | ||
self.feature_count = dims_count | ||
self.dim_borders = [[-radius, radius] for d in range(dims_count)] | ||
self.cubes = [] | ||
self.generated_data = None | ||
|
||
def add_cube(self, points_count, cube_param=None): | ||
if self.generated_data: | ||
raise ValueError("the data is already generated!") | ||
if cube_param is None: | ||
cube_param = {} | ||
label = 0 | ||
else: | ||
label = len(self.cubes) + 1 | ||
cube = [] | ||
class CubeParameters:
    """Describes one hypercube to be generated: how many rows it
    contributes and where it sits in feature space."""

    def __init__(self, rows, loc=None):
        # Number of data points drawn inside this cube.
        self.rows = rows
        # Mapping feature index -> (origin, width); None/{} means background noise.
        self.loc = loc
        # Subspaces this cube spans; left empty at construction time.
        self.subspaces = []
|
||
|
||
class CubesGenerator:
    """Generates a labelled dataset of axis-aligned hypercubes plus noise.

    Each registered cube contributes ``rows`` points distributed uniformly
    inside its location; features a cube does not constrain are filled with
    uniform background noise over the whole domain [-radius, radius]. The
    last column of the built matrix is the class label (0 = background).

    NOTE(review): this block was reconstructed from a diff that interleaved
    lines of the deleted ``DataGenerator`` class (e.g. calls to
    ``cube_param.keys()`` on a CubeParameters object, and uses of
    ``dim_params``/``points_count`` before assignment); the stale lines
    were removed and the new-version control flow restored.
    """

    def __init__(self, feature_count, radius, file_name):
        self.file_name = file_name
        self.cube_parameters = []
        self.feature_count = feature_count
        # Every feature lives in [-radius, radius].
        self.dim_borders = [[-radius, radius] for d in range(feature_count)]
        self.subspaces = []
        # Perfect-discretization cut points per feature; the upper domain
        # border is always a cut point.
        self.perf_disc = [{d[1]} for d in self.dim_borders]

    def add_cube_parameter(self, cube_param):
        """Register a CubeParameters instance, recording its subspace and
        its perfect-discretization cut points.

        Args:
            cube_param: CubeParameters with ``rows`` and ``loc`` set
                (``loc`` is normalized from None to an empty dict).
        """
        if cube_param.loc is None:
            cube_param.loc = {}
        self.cube_parameters.append(cube_param)
        location_params = cube_param.loc
        s = list(location_params.keys())
        if s and s not in self.subspaces:
            self.subspaces.append(s)
        for feat in range(self.feature_count):
            if feat in location_params:
                dim_params = location_params[feat]
                # Perfect discretization: cube borders become cut points.
                # The lower domain border itself is never recorded as a cut.
                if dim_params[0] != -RADIUS:
                    self.perf_disc[feat].add(dim_params[0])
                self.perf_disc[feat].add(dim_params[0] + dim_params[1])

    def build(self):
        """Draw the random data for all registered cubes.

        Returns:
            (data, file_name): ``data`` is a numpy array of shape
            (total_rows, feature_count + 1); the last column holds the
            class label of each row.

        Raises:
            ValueError: if a cube does not fit inside the feature borders.
        """
        cubes = []
        for cube_parameter in self.cube_parameters:
            location_params = cube_parameter.loc
            points_count = cube_parameter.rows

            # Cubes without a location are background noise -> label 0;
            # otherwise the label counts previously built cubes.
            if len(location_params) == 0:
                label = 0
            else:
                label = len(cubes) + 1
            cube = []
            for feat in range(self.feature_count):
                if feat in location_params:
                    dim_params = location_params[feat]
                    if dim_params[0] < self.dim_borders[feat][0] \
                            or dim_params[0] + dim_params[1] > self.dim_borders[feat][1]:
                        raise ValueError(
                            "The cube with params " + str(location_params) + " does not fit in dim " + str(
                                feat) + "!")
                    # Uniform points inside [origin, origin + width].
                    column = np.random.uniform(0, dim_params[1], points_count) \
                        + np.ones(points_count) * dim_params[0]
                else:
                    # Unconstrained feature: noise over the whole domain.
                    column = np.random.uniform(self.dim_borders[feat][0], self.dim_borders[feat][1], points_count)
                cube.append(column)
            class_labels = np.empty(points_count)
            class_labels.fill(label)
            cube.append(class_labels)
            cubes.append(cube)
        generated_data = np.concatenate([np.array(cube) for cube in cubes], axis=1).transpose()
        return generated_data, self.file_name

    def get_subspaces(self):
        """Return the distinct cube subspaces (lists of feature indices)."""
        return self.subspaces

    def get_discs(self):
        """Return the sorted perfect-discretization cut points per feature."""
        return [sorted(p) for p in self.perf_disc]
|
||
|
||
def generate_partition(rf, c):
    """Randomly split the relevant features 0..rf-1 into c disjoint groups.

    Every group receives at least 2 features, so the caller must ensure
    rf >= 2 * c (otherwise randint() is asked for an empty range).

    Args:
        rf: number of relevant features to distribute.
        c: number of groups (cubes).

    Returns:
        A list of c lists of feature indices that together cover range(rf).
    """
    features = [i for i in range(rf)]
    random.shuffle(features)
    # Fix: the original shadowed the builtins `min` and `max`.
    min_size = 2  # every cube must span at least 2 features
    pivot = 0
    partition = []
    for i in range(c - 1):
        # Leave enough features so every remaining group can still
        # receive min_size features.
        max_size = rf - pivot - (c - i - 1) * min_size
        t = random.randint(min_size, max_size)
        partition.append(features[pivot: pivot + t])
        pivot += t
    partition.append(features[pivot:])
    assert len(partition) == c
    return partition
|
||
|
||
def generate_overlap_partition(rf, c):
    """Partition rf features into c groups, then add random overlaps.

    Starts from a disjoint partition (see generate_partition) and, with
    probability OVERLAP_PROBABILITY per trial, copies a feature from
    outside a group into it. Duplicates inside one group are possible
    because the draw is with replacement.

    Fix: the original used ``l`` as the loop variable, shadowing the
    module alias ``l`` (experiments_logging); renamed to ``_``.
    """
    partition = generate_partition(rf, c)
    additions = []
    for p in partition:
        add = []
        # at most a half of the partition times of possibility of overlap
        for _ in range(int(len(p) / 2)):
            if random.uniform(0, 1) < OVERLAP_PROBABILITY:
                others = list({i for i in range(rf)} - set(p))
                rand = random.randint(0, rf - len(p) - 1)
                add.append(others[rand])
        additions.append(add)

    for i, p in enumerate(partition):
        for add in additions[i]:
            p.append(add)
    return partition
|
||
|
||
def produce_data_generator(rf, irf, c, type, name):
    """Assemble a CubesGenerator for one experiment configuration.

    Args:
        rf: number of relevant features.
        irf: number of irrelevant (noise) features.
        c: number of cubes.
        type: 'c' (complete), 'i' (incomplete) or 'io' (incomplete
            overlapping) cube layout.
        name: output file name stored on the generator.

    Raises:
        ValueError: for an unknown ``type``.
    """
    dg = CubesGenerator(rf + irf, RADIUS, name)
    # same number of records for each of the cubes + background
    cube_rows = int(ROWS / (c + 1))

    if type == 'c':
        # complete: every cube spans all relevant features
        partition = [range(rf) for i in range(c)]
    elif type == 'i':
        partition = generate_partition(rf, c)
    elif type == 'io':
        partition = generate_overlap_partition(rf, c)
    else:
        raise ValueError("no such type!")

    for p in partition:
        # Random origin chosen so the unit-width cube always fits
        # inside [-RADIUS, RADIUS].
        location = {j: (random.uniform(0, 1) * (RADIUS * 2 - 1) - RADIUS, CUBE_WIDTH) for j in p}
        dg.add_cube_parameter(CubeParameters(cube_rows, location))
    # Trailing cube with no location: the background noise class.
    dg.add_cube_parameter(CubeParameters(cube_rows))
    return dg
|
||
|
||
def produce_all_data_generators():
    """Create a CubesGenerator for every experiment configuration.

    Iterates over relevant feature counts, irrelevant feature counts,
    cube counts and cube types, skipping infeasible combinations.

    Side effects: sets the module-global ``basedir`` (read later by
    ``store``), writes one perfect-discretization cut file per dataset
    under ``perf_disc_dir``, and dumps the ideal subspaces of all
    datasets into ``perf_subspaces_file``.

    Returns:
        list of CubesGenerator, one per dataset name.
    """
    data_generators = []
    global basedir

    basedir = BASE + 'new_cubes/'
    perf_disc_dir = BASE + 'ideal_disc/'
    perf_subspaces_file = BASE + 'ideal_subspaces.json'
    perf_subspaces = dict()
    perf_discs = dict()
    ## relevant features 2 - 30
    # for rf in range(10, 11):
    #     # irrelevant features 0 - 100:
    #     for irf in range(100, 101):
    #         # cubes 1 - 10
    #         for c in range(3, 4):
    #             # cube types complete, incomplete, incomplete overlapping
    #             for type in ['i']:

    # relevant features 2 - 30
    for rf in range(2, 31):
        # irrelevant features 0 - 100:
        for irf in range(101):
            # cubes 1 - 10
            for c in range(1, 11):
                # cube types complete, incomplete, incomplete overlapping
                for type in ['c', 'i', 'io']:
                    # A single cube can only be of the 'complete' layout.
                    if c == 1 and type != 'c':
                        continue
                    # Incomplete layouts need >= 2 relevant features per cube.
                    if rf / c < 2 and type != 'c':
                        # if not (rf / c < 2 and type == 'c'):
                        continue
                    # e.g. cubes_02_000_01_c.csv
                    name = 'cubes_' + '{0:02d}'.format(rf) + '_' \
                           + '{0:03d}'.format(irf) + '_' \
                           + '{0:02d}'.format(c) + '_' \
                           + type + '.csv'
                    # if os.path.exists(basedir + name) and os.path.exists(
                    #         perf_disc_dir + 'cut_' + name.replace('csv', 'txt')):
                    #     continue

                    dg = produce_data_generator(rf, irf, c, type, name)
                    perf_discs[name] = dg.get_discs()
                    perf_subspaces[name] = dg.get_subspaces()
                    data_generators.append(dg)
    for name in perf_discs:
        write_cut_file(perf_disc_dir + 'cut_' + name.replace('csv', 'txt'), perf_discs[name])
    with open(perf_subspaces_file, 'w') as psf:
        json.dump(perf_subspaces, psf)
    return data_generators
|
||
|
||
def write_cut_file(name, disc_intervals):
    """Write perfect-discretization cut points in the IPD cut-file format.

    Args:
        name: destination file path.
        disc_intervals: per-dimension iterables of cut points.
    """
    with open(name, 'w') as out:
        for dim, breaks in enumerate(disc_intervals):
            out.write('dimension ' + str(dim) + ' (' + str(len(breaks)) + ' bins)\n')
            for cut in breaks:
                out.write(format(cut, '.1f') + '\n')
            out.write('-------------------------------------\n')
|
||
|
||
def store(data):
    """Persist one generated dataset as CSV under the module-global basedir.

    Args:
        data: the (matrix, file_name) pair produced by CubesGenerator.build().
    """
    global basedir
    matrix, file_name = data
    frame = pd.DataFrame(matrix)
    frame.to_csv(basedir + file_name, sep=';', header=False, index=False, float_format='%.2f')
|
||
|
||
if __name__ == '__main__':
    # Fix: the original demo instantiated DataGenerator, the class this
    # commit removes (NameError at runtime); rewritten against the new
    # CubesGenerator / CubeParameters API with the same cube layout.
    # Smoke test: three labelled cubes plus two background cubes in 30d.
    cg = CubesGenerator(30, RADIUS, 'demo.csv')
    rows = 3
    cg.add_cube_parameter(CubeParameters(rows, {0: (-1.7, 1), 1: (-1.7, 1)}))
    cg.add_cube_parameter(CubeParameters(rows, {1: (-0.4, 1), 2: (-1, 1)}))
    cg.add_cube_parameter(CubeParameters(rows, {0: (0, 1), 1: (0, 1), 2: (0, 1)}))
    cg.add_cube_parameter(CubeParameters(rows))
    cg.add_cube_parameter(CubeParameters(rows))
    data, name = cg.build()
    print(data)
    # l.plot_data_3d(data)
    print(generate_overlap_partition(7, 3))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.