import numpy as np
import pandas as pd
import random
import time
import os
import json
import constants as cst
import experiments_logging as l
import socket

# Data lives inside the hypercube [-RADIUS, RADIUS] ^ feature_count.
RADIUS = 2
# Edge length of each generated cluster ("cube") along its relevant features.
CUBE_WIDTH = 1
# Total number of rows per generated dataset (split evenly between cubes and background).
ROWS = 6000
# Probability of copying a feature into another group in generate_overlap_partition().
OVERLAP_PROBABILITY = 0.6


class CubeParameters:
    # Parameters of a single cube: number of rows and a location per relevant feature.
    def __init__(self, rows, loc=None):
        self.rows = rows
        # loc maps feature index -> (lower bound, width); None/empty means background noise
        self.loc = loc
        self.subspaces = []


class CubesGenerator:
    # Generates a dataset of labeled cubes plus uniform background noise, and keeps
    # track of the relevant subspaces and the "perfect" discretization cut points
    # implied by the cube borders.
    def __init__(self, feature_count, radius, file_name):
        self.file_name = file_name
        self.cube_parameters = []
        self.feature_count = feature_count
        self.dim_borders = [[-radius, radius] for d in range(feature_count)]
        self.subspaces = []
        # perfect discretization cut points per feature; the upper border is always a cut
        self.perf_disc = [{d[1]} for d in self.dim_borders]

    def add_cube_parameter(self, cube_param):
        if cube_param.loc is None:
            cube_param.loc = {}
        self.cube_parameters.append(cube_param)
        location_params = cube_param.loc
        s = list(location_params.keys())
        if s and s not in self.subspaces:
            self.subspaces.append(s)
        for feat in range(self.feature_count):
            if feat in cube_param.loc.keys():
                dim_params = location_params[feat]
                # perfect discretization: the cube borders become cut points
                if dim_params[0] != -RADIUS:
                    self.perf_disc[feat].add(dim_params[0])
                self.perf_disc[feat].add(dim_params[0] + dim_params[1])

    def build(self):
        cubes = []
        for cube_parameter in self.cube_parameters:
            location_params = cube_parameter.loc
            points_count = cube_parameter.rows
            # background noise (empty location) gets label 0, cubes get labels 1, 2, ...
            if len(location_params) == 0:
                label = 0
            else:
                label = len(cubes) + 1
            cube = []
            for feat in range(self.feature_count):
                if feat in location_params.keys():
                    dim_params = location_params[feat]
                    if dim_params[0] < self.dim_borders[feat][0] \
                            or dim_params[0] + dim_params[1] > self.dim_borders[feat][1]:
                        raise ValueError("The cube with params " + str(location_params)
                                         + " does not fit in dim " + str(feat) + "!")
                    # relevant feature: uniform inside [lower bound, lower bound + width)
                    column = np.random.uniform(0, dim_params[1], points_count) \
                             + np.ones(points_count) * dim_params[0]
                else:
                    # irrelevant feature: uniform over the whole dimension range
                    column = np.random.uniform(self.dim_borders[feat][0], self.dim_borders[feat][1], points_count)
                cube.append(column)
            class_labels = np.empty(points_count)
            class_labels.fill(label)
            cube.append(class_labels)
            cubes.append(cube)
        # stack the rows of all cubes; the last column holds the class label
        generated_data = np.concatenate([np.array(cube) for cube in cubes], axis=1).transpose()
        return generated_data, self.file_name

    def get_subspaces(self):
        return self.subspaces

    def get_discs(self):
        return [sorted(p) for p in self.perf_disc]
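
# Example usage of CubesGenerator (a minimal sketch; the numbers are illustrative):
#
#   gen = CubesGenerator(feature_count=3, radius=RADIUS, file_name='example.csv')
#   # one cube of 100 points occupying [-1, 0) in feature 0; other features are noise
#   gen.add_cube_parameter(CubeParameters(rows=100, loc={0: (-1.0, CUBE_WIDTH)}))
#   # 100 background points (label 0)
#   gen.add_cube_parameter(CubeParameters(rows=100))
#   data, file_name = gen.build()   # data.shape == (200, 4): 3 features + label column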


# Splits the rf relevant features into c disjoint groups of size >= 2
# (requires rf >= 2 * c).
def generate_partition(rf, c):
    arr = [i for i in range(rf)]
    random.shuffle(arr)
    min_size = 2
    pivot = 0
    partition = []
    for i in range(c - 1):
        # leave at least min_size features for each of the remaining groups
        max_size = rf - pivot - (c - i - 1) * min_size
        t = random.randint(min_size, max_size)
        partition.append(arr[pivot: pivot + t])
        pivot += t
    partition.append(arr[pivot:])
    assert len(partition) == c
    return partition
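
# For example, generate_partition(6, 2) splits the shuffled feature indices 0..5
# into two disjoint groups of size >= 2, e.g. [[3, 0, 5], [1, 4, 2]]
# (one possible random outcome).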


# Like generate_partition, but groups may additionally share some features.
def generate_overlap_partition(rf, c):
    partition = generate_partition(rf, c)
    additions = []
    for p in partition:
        add = []
        # up to half of the group size chances to copy in a feature from another group
        for _ in range(int(len(p) / 2)):
            if random.uniform(0, 1) < OVERLAP_PROBABILITY:
                others = list({i for i in range(rf)} - set(p))
                rand = random.randint(0, rf - len(p) - 1)
                add.append(others[rand])
        additions.append(add)
    for i, p in enumerate(partition):
        for add in additions[i]:
            p.append(add)
    return partition
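
# For example, generate_overlap_partition(7, 3) first builds a disjoint partition
# such as [[0, 3], [5, 1], [6, 2, 4]] and may then copy features between groups,
# e.g. [[0, 3], [5, 1, 2], [6, 2, 4, 0]] (illustrative random outcome).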


# Builds a CubesGenerator for rf relevant and irf irrelevant features, c cubes and
# the given cube type: 'c' = complete, 'i' = incomplete, 'io' = incomplete overlapping.
def produce_data_generator(rf, irf, c, type, name):
    total_f = rf + irf
    dg = CubesGenerator(total_f, RADIUS, name)
    # same number of records for each of the cubes + background
    cube_rows = int(ROWS / (c + 1))
    if type == 'c':
        # complete: every cube spans all relevant features
        partition = [range(rf) for i in range(c)]
    elif type == 'i':
        partition = generate_partition(rf, c)
    elif type == 'io':
        partition = generate_overlap_partition(rf, c)
    else:
        raise ValueError("no such type!")
    for p in partition:
        location = dict()
        for j in p:
            # random lower bound such that a cube of width CUBE_WIDTH stays inside the borders
            location[j] = (random.uniform(0, 1) * (RADIUS * 2 - 1) - RADIUS, CUBE_WIDTH)
        dg.add_cube_parameter(CubeParameters(cube_rows, location))
    # background noise
    dg.add_cube_parameter(CubeParameters(cube_rows))
    return dg
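
# For example (values illustrative), produce_data_generator(4, 2, 2, 'i', 'cubes_04_02_i.csv')
# returns a generator over 6 features (4 relevant, 2 irrelevant) with two disjoint
# cubes of 2000 points each plus 2000 background points.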


def produce_all_data_generators():
    data_generators = []
    global basedir
    basedir = cst.DATA_DIR
    if not os.path.exists(basedir):
        os.mkdir(basedir)
    perf_disc_dir = cst.PERFECT_DISCRETIZATIONS_DIR
    if not os.path.exists(perf_disc_dir):
        os.mkdir(perf_disc_dir)
    perf_subspaces_file = cst.PERFECT_SUBSPACES_JSON
    perf_subspaces = dict()
    perf_discs = dict()

    # reduced parameter range for testing; the full sweep is commented out below
    for rf in range(2, 3):
        for c in range(3, 4):
            for type in ['c']:
                # full sweep: relevant features 2 - 30, cubes 1 - 10,
                # cube types complete ('c'), incomplete ('i'), incomplete overlapping ('io')
                # for rf in range(2, 31):
                #     for c in range(1, 11):
                #         for type in ['c', 'i', 'io']:
                # skip incomplete types when there is only one cube or fewer than
                # 2 relevant features per cube
                if (c == 1 or rf / c < 2) and type != 'c':
                    continue
                name = 'cubes_' + '{0:02d}'.format(rf) + '_' \
                       + '{0:02d}'.format(c) + '_' \
                       + type + '.csv'
                # if os.path.exists(basedir + name) and os.path.exists(
                #         perf_disc_dir + 'cut_' + name.replace('csv', 'txt')):
                #     continue
                dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name)
                perf_discs[name] = dg.get_discs()
                perf_subspaces[name] = dg.get_subspaces()
                data_generators.append(dg)
    for name in perf_discs:
        write_cut_file(perf_disc_dir + 'cut_' + name.replace('csv', 'txt'), perf_discs[name])
    with open(perf_subspaces_file, 'w') as psf:
        json.dump(perf_subspaces, psf)
    return data_generators


# Writes the perfect discretization cut points of every dimension to a text file.
def write_cut_file(name, disc_intervals):
    with open(name, 'w') as out:
        for i in range(len(disc_intervals)):
            out.write('dimension ' + str(i) + ' (' + str(len(disc_intervals[i])) + ' bins)\n')
            for break_point in disc_intervals[i]:
                out.write(format(break_point, '.1f') + '\n')
            out.write('-------------------------------------\n')
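
# A resulting cut file lists, for every dimension, its cut points (one per line,
# one decimal place) followed by a separator, e.g. (illustrative values):
#
#   dimension 0 (3 bins)
#   -0.7
#   0.3
#   2.0
#   -------------------------------------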


# Stores the generated data (feature columns + label column) as a ';'-separated CSV without header.
def store(data):
    global basedir
    name = data[1]
    pd.DataFrame(data[0]).to_csv(basedir + name, sep=';', header=False, index=False, float_format='%.2f')


if __name__ == '__main__':
    # print(generate_overlap_partition(7, 3))
    generators = produce_all_data_generators()
    for g in generators:
        store(g.build())