Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
added concurrent data generation
  • Loading branch information
Tatiana Dembelova committed Oct 10, 2017
1 parent ddd6a67 commit d8351be
Show file tree
Hide file tree
Showing 3 changed files with 362 additions and 57 deletions.
259 changes: 203 additions & 56 deletions data_generator.py
@@ -1,67 +1,214 @@
import numpy as np
import pandas as pd
import random
import time
import os
import json
import experiments_logging as l

# Half-width of the data domain: every feature takes values in [-RADIUS, RADIUS].
RADIUS = 2
# Edge length of each generated hyper-cube along every located dimension.
CUBE_WIDTH = 1
# Total number of rows per generated dataset (shared among cubes + background).
ROWS = 6000
# Chance per trial that a partition cell borrows an extra feature
# (see generate_overlap_partition).
OVERLAP_PROBABILITY = 0.6
# BASE = '/Users/tatyanadembelova/Documents/study/thesis/ipd_extended/'
BASE = '/local/tmp/ipd_extended_experiments/'

class DataGenerator:
    # NOTE(review): this class is superseded by CubesGenerator in this
    # revision; the fragment below is leftover from the removed version
    # (add_cube is truncated mid-body) -- confirm it can be deleted.
    def __init__(self, dims_count, radius):
        # one [-radius, radius] value range per feature
        self.feature_count = dims_count
        self.dim_borders = [[-radius, radius] for d in range(dims_count)]
        self.cubes = []
        self.generated_data = None

    def add_cube(self, points_count, cube_param=None):
        # data can only be generated once; adding cubes afterwards is an error
        if self.generated_data:
            raise ValueError("the data is already generated!")
        if cube_param is None:
            cube_param = {}
            label = 0  # no location given -> background noise, class label 0
        else:
            label = len(self.cubes) + 1
        cube = []
class CubeParameters:
    """Specification of a single hyper-cube to be generated.

    ``rows`` is the number of points to draw for the cube; ``loc`` maps a
    feature index to a ``(lower_edge, width)`` pair, and ``None`` marks a
    background-noise cube with no location constraint.
    """

    def __init__(self, rows, loc=None):
        # feature -> (start, width); replaced by {} downstream when None
        self.loc = loc
        # number of data points drawn for this cube
        self.rows = rows
        # kept for interface compatibility; maintained by CubesGenerator
        self.subspaces = []


class CubesGenerator:
    """Collects cube specifications and generates a labeled dataset.

    Each registered CubeParameters becomes one cluster of uniformly drawn
    points; features without a location are drawn uniformly over the full
    domain. The last matrix column holds the class label (0 = background).

    NOTE(review): as captured, this region of the diff interleaved lines of
    the removed DataGenerator implementation; this body is the coherent
    reconstruction of the added code.
    """

    def __init__(self, feature_count, radius, file_name):
        self.file_name = file_name
        self.cube_parameters = []
        self.feature_count = feature_count
        # value range [-radius, radius] for every feature
        self.dim_borders = [[-radius, radius] for _ in range(feature_count)]
        self.subspaces = []
        # perfect discretization: seed each dimension with its upper border
        self.perf_disc = [{d[1]} for d in self.dim_borders]

    def add_cube_parameter(self, cube_param):
        """Register a CubeParameters; records its subspace and ideal cut points."""
        if cube_param.loc is None:
            cube_param.loc = {}
        self.cube_parameters.append(cube_param)
        location_params = cube_param.loc
        s = list(location_params.keys())
        if s and s not in self.subspaces:
            self.subspaces.append(s)
        for feat in range(self.feature_count):
            if feat in location_params:
                dim_params = location_params[feat]
                # cube edges are the ideal (perfect) discretization cuts;
                # the domain minimum itself is not a cut
                if dim_params[0] != -RADIUS:
                    self.perf_disc[feat].add(dim_params[0])
                self.perf_disc[feat].add(dim_params[0] + dim_params[1])

    def build(self):
        """Generate all registered cubes.

        Returns a tuple ``(data, file_name)`` where ``data`` has one row per
        point and ``feature_count + 1`` columns (last column = class label).
        Raises ValueError if a cube does not fit inside the domain.
        """
        cubes = []
        for cube_parameter in self.cube_parameters:
            location_params = cube_parameter.loc
            points_count = cube_parameter.rows
            if len(location_params) == 0:
                label = 0  # background noise
            else:
                label = len(cubes) + 1
            cube = []
            for feat in range(self.feature_count):
                if feat in location_params:
                    dim_params = location_params[feat]
                    if dim_params[0] < self.dim_borders[feat][0] \
                            or dim_params[0] + dim_params[1] > self.dim_borders[feat][1]:
                        raise ValueError(
                            "The cube with params " + str(location_params) + " does not fit in dim " + str(
                                feat) + "!")
                    # uniform within [start, start + width]
                    column = np.random.uniform(0, dim_params[1], points_count) \
                        + np.ones(points_count) * dim_params[0]
                else:
                    # irrelevant feature: uniform over the whole domain
                    column = np.random.uniform(self.dim_borders[feat][0],
                                               self.dim_borders[feat][1], points_count)
                cube.append(column)
            class_labels = np.empty(points_count)
            class_labels.fill(label)
            cube.append(class_labels)
            cubes.append(cube)
        generated_data = np.concatenate([np.array(cube) for cube in cubes], axis=1).transpose()
        return generated_data, self.file_name

    def get_subspaces(self):
        """Return the list of relevant-feature subspaces, one per located cube."""
        return self.subspaces

    def get_discs(self):
        """Return the perfect discretization cut points per dimension, sorted."""
        return [sorted(p) for p in self.perf_disc]


def generate_partition(rf, c):
    """Randomly partition features 0..rf-1 into c disjoint groups of size >= 2.

    Requires rf >= 2 * c, otherwise the size bounds are infeasible.
    Returns a list of c lists of feature indices.
    """
    features = list(range(rf))
    random.shuffle(features)
    min_size = 2  # every group must receive at least two features
    pivot = 0
    partition = []
    for i in range(c - 1):
        # leave enough room for the remaining groups (min_size each)
        max_size = rf - pivot - (c - i - 1) * min_size
        t = random.randint(min_size, max_size)
        partition.append(features[pivot: pivot + t])
        pivot += t
    partition.append(features[pivot:])
    assert len(partition) == c
    return partition


def generate_overlap_partition(rf, c):
    """Partition features 0..rf-1 into c groups that may overlap.

    Starts from a disjoint partition (generate_partition) and, with
    OVERLAP_PROBABILITY per trial and at most len(group) // 2 trials,
    copies a feature from outside the group into it. A feature is never
    added twice to the same group (the original could produce duplicates).
    """
    partition = generate_partition(rf, c)
    additions = []
    for part in partition:
        extra = []
        # candidates are all features not already in this group
        others = [f for f in range(rf) if f not in part]
        # at most half of the group size chances to overlap
        for _ in range(len(part) // 2):
            if random.uniform(0, 1) < OVERLAP_PROBABILITY:
                candidate = others[random.randint(0, len(others) - 1)]
                if candidate not in extra:  # avoid listing a feature twice
                    extra.append(candidate)
        additions.append(extra)
    for part, extra in zip(partition, additions):
        part.extend(extra)
    return partition


def produce_data_generator(rf, irf, c, type, name):
    """Build a CubesGenerator with rf relevant + irf irrelevant features.

    ``c`` is the number of cubes; ``type`` selects the partition scheme:
    'c' complete, 'i' incomplete, 'io' incomplete overlapping. ``name``
    is the target file name carried through to the generator.
    """
    generator = CubesGenerator(rf + irf, RADIUS, name)
    # every cube gets an equal share of rows; one extra share is background
    rows_per_cube = int(ROWS / (c + 1))

    if type == 'c':
        partition = [range(rf) for _ in range(c)]
    elif type == 'i':
        partition = generate_partition(rf, c)
    elif type == 'io':
        partition = generate_overlap_partition(rf, c)
    else:
        raise ValueError("no such type!")

    for subspace in partition:
        # random left edge so the CUBE_WIDTH-wide cube stays inside the domain
        location = {feat: (random.uniform(0, 1) * (RADIUS * 2 - 1) - RADIUS, CUBE_WIDTH)
                    for feat in subspace}
        generator.add_cube_parameter(CubeParameters(rows_per_cube, location))
    # one additional background-noise cube without a location
    generator.add_cube_parameter(CubeParameters(rows_per_cube))
    return generator


def produce_all_data_generators():
    """Create the CubesGenerator instances for the full experiment grid.

    Iterates over relevant features (2-30), irrelevant features (0-100),
    cube counts (1-10) and cube types ('c', 'i', 'io'). As a side effect,
    writes the perfect-discretization cut files and the ideal subspaces
    JSON, and sets the module-level ``basedir`` consumed by store().
    Returns the list of generators.
    """
    data_generators = []
    global basedir  # read later by store() when the datasets are persisted

    basedir = BASE + 'new_cubes/'
    perf_disc_dir = BASE + 'ideal_disc/'
    perf_subspaces_file = BASE + 'ideal_subspaces.json'
    # make sure the output directories exist before any file is written
    os.makedirs(basedir, exist_ok=True)
    os.makedirs(perf_disc_dir, exist_ok=True)

    perf_subspaces = dict()
    perf_discs = dict()
    # relevant features 2 - 30
    for rf in range(2, 31):
        # irrelevant features 0 - 100
        for irf in range(101):
            # cubes 1 - 10
            for c in range(1, 11):
                # cube types: complete, incomplete, incomplete overlapping
                for cube_type in ['c', 'i', 'io']:
                    # a single cube can only be 'complete'
                    if c == 1 and cube_type != 'c':
                        continue
                    # incomplete partitions need >= 2 relevant features per cube
                    if rf / c < 2 and cube_type != 'c':
                        continue
                    name = 'cubes_' + '{0:02d}'.format(rf) + '_' \
                           + '{0:03d}'.format(irf) + '_' \
                           + '{0:02d}'.format(c) + '_' \
                           + cube_type + '.csv'
                    dg = produce_data_generator(rf, irf, c, cube_type, name)
                    perf_discs[name] = dg.get_discs()
                    perf_subspaces[name] = dg.get_subspaces()
                    data_generators.append(dg)
    for name in perf_discs:
        write_cut_file(perf_disc_dir + 'cut_' + name.replace('csv', 'txt'), perf_discs[name])
    with open(perf_subspaces_file, 'w') as psf:
        json.dump(perf_subspaces, psf)
    return data_generators


def write_cut_file(name, disc_intervals):
    """Write a cut file: per dimension, a header line, its cut points, a rule.

    ``disc_intervals`` is a sequence of per-dimension cut-point lists; each
    cut point is written with one decimal place.
    """
    with open(name, 'w') as out:
        for dim, points in enumerate(disc_intervals):
            out.write('dimension ' + str(dim) + ' (' + str(len(points)) + ' bins)\n')
            for break_point in points:
                out.write(format(break_point, '.1f') + '\n')
            out.write('-------------------------------------\n')


def store(data):
    """Persist one generated dataset.

    ``data`` is the ``(matrix, file_name)`` pair returned by
    CubesGenerator.build(). Writes a ';'-separated CSV (two decimals,
    no header/index) into the module-level ``basedir``, which must have
    been set by produce_all_data_generators().
    """
    matrix, name = data
    pd.DataFrame(matrix).to_csv(basedir + name, sep=';', header=False,
                                index=False, float_format='%.2f')


if __name__ == '__main__':
    # NOTE(review): this demo still drives the old DataGenerator API
    # (add_cube / build with dict params); DataGenerator is replaced by
    # CubesGenerator in this revision -- confirm/update before running.
    dg = DataGenerator(30, 2)
    rows = 3
    # three located cubes in subspaces {0,1}, {1,2} and {0,1,2}
    dg.add_cube(rows, {0: (-1.7, 1), 1: (-1.7, 1)})
    dg.add_cube(rows, {1: (-0.4, 1), 2: (-1, 1)})
    dg.add_cube(rows, {0: (0, 1), 1: (0, 1), 2: (0, 1)})
    # two background-noise cubes (no location constraint)
    dg.add_cube(rows)
    dg.add_cube(rows)
    print(dg.build())
    # 3d_cube_99.csv

    # 3d_2cubes_99.csv
    # 3d_20cubes_99.csv

    # 3d_2incompletecubes_99.csv
    # 3d_20incompletecubes_99.csv

    # 30d_cube_99.csv

    # 30d_2cubes_99.csv
    # 30d_20cubes_99.csv

    # 30d_2incompletecubes_99.csv
    # 30d_20incompletecubes_99.csv
    # l.plot_data_3d(dg.generated_data)
    print(generate_overlap_partition(7, 3))
2 changes: 1 addition & 1 deletion experiments_logging.py
Expand Up @@ -128,7 +128,7 @@ def write_cut_file(name, disc_intervals):
# rows = 20000
# data = np.concatenate((synthetic_cube_in_cube(rows, 2, 0), np.zeros((rows, 1))), axis=1)
# data = pd.read_csv("synthetic_cases/blobs/3d_3_blobs_aligned.csv", delimiter=";", header=None, na_values='?')
data = pd.read_csv("synthetic_cases/cubes/3d_3_cubes_aligned_xor.csv", delimiter=";", header=None, na_values='?')
data = pd.read_csv("new_cubes/cubes_10_100_03_i.csv", delimiter=";", header=None, na_values='?')
# data = pd.DataFrame(dg.cubes(4000))
plot_data_3d(data)

Expand Down

0 comments on commit d8351be

Please sign in to comment.