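"""Generator for synthetic "cube" datasets.

Each dataset has a number of relevant features, in which labelled hypercubes are placed,
plus irrelevant features filled with uniform noise; the last column holds the class label
(0 for background points). Besides the CSV files, the script writes the exact ("perfect")
discretization cut points and the relevant subspaces of every dataset to the locations
configured in the constants module (cst).
"""
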
import numpy as np
import pandas as pd
import random
import util
import time
import os
import json
import constants as cst
import experiments_logging as l
import socket
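# util, constants (cst) and experiments_logging are project-local modules;
# time, socket and experiments_logging are imported here but not used below.
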
RADIUS = 2                 # half-width of the data space: every feature lies in [-RADIUS, RADIUS]
CUBE_WIDTH = 1             # edge length of each generated cube
ROWS = 6000                # total rows per dataset, split evenly over the cubes plus the background
OVERLAP_PROBABILITY = 0.6  # chance per attempt that a subspace borrows a feature from another block

class CubeParameters:
    def __init__(self, rows, loc=None):
        self.rows = rows  # number of points to generate for this cube
        self.loc = loc    # dict: feature index -> (lower corner, edge length); None/{} = background noise
        self.subspaces = []

class CubesGenerator:
    def __init__(self, rel_feature_count, irr_feature_count, radius, file_name):
        self.rel_feature_count = rel_feature_count
        self.file_name = file_name
        self.cube_parameters = []
        self.feature_count = rel_feature_count + irr_feature_count
        self.dim_borders = [[-radius, radius] for d in range(self.feature_count)]
        self.subspaces = []
        # perfect discretization: one set of cut points per relevant feature,
        # initialized with the upper border of the data space
        self.perf_disc = [{d[1]} for d in self.dim_borders[:rel_feature_count]]

    def __repr__(self):
        return 'CubesGenerator(file_name=' + str(self.file_name) \
               + ', rel_feature_count=' + str(self.rel_feature_count) \
               + ', feature_count=' + str(self.feature_count) + ")"

    def add_cube_parameter(self, cube_param):
        if cube_param.loc is None:
            cube_param.loc = {}
        self.cube_parameters.append(cube_param)
        location_params = cube_param.loc
        s = list(location_params.keys())
        if s and s not in self.subspaces:
            self.subspaces.append(s)
        # perfect discretization: record the cube edges of every relevant feature as cut points
        for feat in range(self.rel_feature_count):
            if feat in cube_param.loc.keys():
                dim_params = location_params[feat]
                if dim_params[0] != -RADIUS:
                    self.perf_disc[feat].add(dim_params[0])
                self.perf_disc[feat].add(dim_params[0] + dim_params[1])

    def build(self):
        cubes = []
        for cube_parameter in self.cube_parameters:
            location_params = cube_parameter.loc
            points_count = cube_parameter.rows
            # cubes without a location are background noise and get class label 0
            if len(location_params) == 0:
                label = 0
            else:
                label = len(cubes) + 1
            cube = []
            for feat in range(self.feature_count):
                if feat in location_params.keys():
                    assert feat < self.rel_feature_count
                    dim_params = location_params[feat]
                    if dim_params[0] < self.dim_borders[feat][0] \
                            or dim_params[0] + dim_params[1] > self.dim_borders[feat][1]:
                        raise ValueError("The cube with params " + str(location_params)
                                         + " does not fit in dim " + str(feat) + "!")
                    # relevant feature: uniform inside [lower corner, lower corner + width]
                    column = np.random.uniform(0, dim_params[1], points_count) \
                             + np.ones(points_count) * dim_params[0]
                else:
                    # unconstrained feature: uniform over the whole data space
                    column = np.random.uniform(self.dim_borders[feat][0], self.dim_borders[feat][1],
                                               points_count)
                cube.append(column)
            class_labels = np.empty(points_count)
            class_labels.fill(label)
            cube.append(class_labels)
            cubes.append(cube)
        # each cube is stored feature-wise, so concatenate along axis 1 and
        # transpose to shape (points, features + label)
        generated_data = np.concatenate([np.array(cube) for cube in cubes], axis=1).transpose()
        return generated_data, self.file_name

    def get_subspaces(self):
        return self.subspaces

    def get_discs(self):
        return [sorted(p) for p in self.perf_disc]

def generate_partition(rf, c):
    # split the rf relevant features into c disjoint groups of at least 2 features each
    arr = [i for i in range(rf)]
    random.shuffle(arr)
    min_size = 2
    pivot = 0
    partition = []
    for i in range(c - 1):
        max_size = rf - pivot - (c - i - 1) * min_size
        t = random.randint(min_size, max_size)
        partition.append(arr[pivot: pivot + t])
        pivot += t
    partition.append(arr[pivot:])
    assert len(partition) == c
    return partition

def generate_overlap_partition(rf, c):
    partition = generate_partition(rf, c)
    additions = []
    for p in partition:
        add = []
        # each block gets at most len(p) // 2 chances to borrow a feature from another block
        for _ in range(int(len(p) / 2)):
            if random.uniform(0, 1) < OVERLAP_PROBABILITY:
                others = list({i for i in range(rf)} - set(p))
                rand = random.randint(0, rf - len(p) - 1)
                add.append(others[rand])
        additions.append(add)
    for i, p in enumerate(partition):
        for add in additions[i]:
            p.append(add)
    return partition

def produce_data_generator(rf, irf, c, type, file_name):
    dg = CubesGenerator(rf, irf, RADIUS, file_name)
    # same number of records for each of the c cubes plus the background
    cube_rows = int(ROWS / (c + 1))
    if type == 'c':
        # 'c': every cube spans the full relevant subspace
        partition = [range(rf) for i in range(c)]
    elif type == 'i':
        # 'i': disjoint relevant subspaces
        partition = generate_partition(rf, c)
    elif type == 'io':
        # 'io': relevant subspaces that may overlap
        partition = generate_overlap_partition(rf, c)
    else:
        raise ValueError("no such type!")
    for p in partition:
        location = dict()
        for j in p:
            # random lower corner in [-RADIUS, RADIUS - 1]; with CUBE_WIDTH = 1 the cube stays inside the borders
            location[j] = (random.uniform(0, 1) * (RADIUS * 2 - 1) - RADIUS, CUBE_WIDTH)
        dg.add_cube_parameter(CubeParameters(cube_rows, location))
    # background cube without a location: uniform noise labelled 0
    dg.add_cube_parameter(CubeParameters(cube_rows))
    return dg

def produce_all_data_generators():
    data_generators = []
    global basedir
    basedir = cst.DATA_DIR
    if not os.path.exists(basedir):
        os.mkdir(basedir)
    perf_disc_dir = cst.PERFECT_DISCRETIZATIONS_DIR
    if not os.path.exists(perf_disc_dir):
        os.mkdir(perf_disc_dir)
    perf_subspaces_file = cst.PERFECT_SUBSPACES_JSON
    perf_subspaces = dict()
    perf_discs = dict()

    def produce_dg(name, rf, c, type):
        # if os.path.exists(basedir + name) and os.path.exists(
        #         perf_disc_dir + 'cut_' + name.replace('csv', 'txt')):
        #     continue
        dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name + ".csv")
        perf_discs[name] = dg.get_discs()
        perf_subspaces[name] = dg.get_subspaces()
        data_generators.append(dg)

    # util.collect_params presumably invokes produce_dg once per configured (name, rf, c, type) combination
    util.collect_params(produce_dg)
    for name in perf_discs:
        write_cut_file(perf_disc_dir + 'cut_' + name + ".txt", perf_discs[name])
    with open(perf_subspaces_file, 'w') as psf:
        json.dump(perf_subspaces, psf)
    return data_generators

def write_cut_file(name, disc_intervals):
    with open(name, 'w') as out:
        for i in range(len(disc_intervals)):
            out.write('dimension ' + str(i) + ' (' + str(len(disc_intervals[i])) + ' bins)\n')
            for break_point in disc_intervals[i]:
                out.write(format(break_point, '.1f') + '\n')
            out.write('-------------------------------------\n')

def store(data):
    global basedir
    name = data[1]
    pd.DataFrame(data[0]).to_csv(basedir + name, sep=';', header=False, index=False, float_format='%.2f')

if __name__ == '__main__':
    # print(generate_overlap_partition(7, 3))
    generators = produce_all_data_generators()
    for g in generators:
        store(g.build())