Skip to content
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
Cannot retrieve contributors at this time
214 lines (182 sloc) 7.38 KB
import numpy as np
import pandas as pd
import random
import time
import os
import json
import experiments_logging as l
# NOTE(review): RADIUS, CUBE_WIDTH and OVERLAP_PROBABILITY are referenced
# further down in this module but are not defined in the visible source --
# confirm they are declared alongside these constants.

# Row budget of one data set; produce_data_generator divides it by (c + 1).
ROWS = 6000
# BASE = '/Users/tatyanadembelova/Documents/study/thesis/ipd_extended/'
# Root directory under which data sets, cut files and the subspace index go.
BASE = '/local/tmp/ipd_extended_experiments/'
class CubeParameters:
    """Description of one cube (or of the background) to be generated.

    Attributes:
        rows: number of points to sample for this cube.
        loc: mapping ``feature index -> (lower edge, width)`` for the
            features the cube restricts, or ``None`` when nothing is
            restricted.
        subspaces: list created empty for every instance.
    """

    def __init__(self, rows, loc=None):
        self.rows = rows
        self.loc = loc
        self.subspaces = []
class CubesGenerator:
    """Generate labelled synthetic data made of axis-aligned hyper-cubes.

    Every feature spans ``[-radius, radius]``. A registered cube restricts
    some features to ``[low, low + width]``; features it does not restrict
    are sampled uniformly over the whole range. While cubes are registered
    the generator also records each cube's set of restricted features (its
    subspace) and, per feature, the interval edges of all cubes (the
    "perfect" discretization).
    """

    def __init__(self, feature_count, radius, file_name):
        """Create a generator for ``feature_count`` features.

        Args:
            feature_count: number of feature columns in the generated data.
            radius: every feature ranges over ``[-radius, radius]``.
            file_name: name returned together with the data by ``build``.
        """
        self.file_name = file_name
        self.cube_parameters = []
        self.feature_count = feature_count
        self.dim_borders = [[-radius, radius] for _ in range(feature_count)]
        self.subspaces = []
        # The upper border of a feature is always the last cut point.
        self.perf_disc = [{upper} for _, upper in self.dim_borders]

    def add_cube_parameter(self, cube_param):
        """Register a cube and update subspaces and perfect cut points.

        Args:
            cube_param: object with ``rows`` and ``loc`` attributes; a
                ``loc`` of ``None`` is replaced by an empty dict (no
                restricted feature, i.e. background).
        """
        if cube_param.loc is None:
            cube_param.loc = {}
        # Without this registration build() would have nothing to generate.
        self.cube_parameters.append(cube_param)

        location_params = cube_param.loc
        subspace = list(location_params)
        if subspace and subspace not in self.subspaces:
            self.subspaces.append(subspace)

        for feat in range(self.feature_count):
            if feat not in location_params:
                continue
            dim_params = location_params[feat]
            low, width = dim_params[0], dim_params[1]
            # perfect discretization: the lower edge is a cut point unless it
            # coincides with the lower border of this generator's range
            # (compared against the instance border, not a module global).
            if low != self.dim_borders[feat][0]:
                self.perf_disc[feat].add(low)
            self.perf_disc[feat].add(low + width)

    def build(self):
        """Sample all registered cubes.

        Returns:
            Tuple ``(data, file_name)`` where ``data`` has one row per point
            and ``feature_count + 1`` columns, the last one being the label:
            0 for a cube without restricted features, otherwise the 1-based
            position of the cube among the registered ones.

        Raises:
            ValueError: if no cube was registered or a cube does not fit
                inside the borders of a feature it restricts.
        """
        if not self.cube_parameters:
            raise ValueError("no cube parameters were added")

        cubes = []
        for cube_parameter in self.cube_parameters:
            location_params = cube_parameter.loc
            points_count = cube_parameter.rows
            label = len(cubes) + 1 if location_params else 0

            cube = []
            for feat in range(self.feature_count):
                border_low, border_high = self.dim_borders[feat]
                if feat in location_params:
                    dim_params = location_params[feat]
                    low, width = dim_params[0], dim_params[1]
                    if low < border_low or low + width > border_high:
                        raise ValueError(
                            "The cube with params " + str(location_params)
                            + " does not fit in dim " + str(feat) + "!")
                    # Sample inside the cube: [low, low + width).
                    column = np.random.uniform(0, width, points_count) + low
                else:
                    column = np.random.uniform(border_low, border_high, points_count)
                cube.append(column)

            cube.append(np.full(points_count, label, dtype=float))
            cubes.append(cube)

        # Each cube is (feature_count + 1, rows); stack the cubes side by
        # side and transpose so that rows are points.
        generated_data = np.concatenate([np.array(cube) for cube in cubes], axis=1).transpose()
        return generated_data, self.file_name

    def get_subspaces(self):
        """Return the distinct non-empty subspaces registered so far."""
        return self.subspaces

    def get_discs(self):
        """Return, per feature, the sorted list of perfect cut points."""
        return [sorted(p) for p in self.perf_disc]
def generate_partition(rf, c):
    """Split the feature indices ``0 .. rf - 1`` into ``c`` disjoint parts.

    The first ``c - 1`` parts get a random size of at least two, chosen so
    that at least two indices remain for every later part; the last part
    takes whatever is left.

    Args:
        rf: number of feature indices to distribute.
        c: number of parts.

    Returns:
        List of ``c`` lists of feature indices covering ``range(rf)``.

    Raises:
        ValueError: if ``c < 1``, or if ``c > 1`` and fewer than two indices
            per part are available.
    """
    min_size = 2
    if c < 1:
        raise ValueError("the number of parts must be positive")
    if c > 1 and rf < min_size * c:
        raise ValueError("not enough features for parts of size >= " + str(min_size))

    arr = list(range(rf))
    pivot = 0
    partition = []
    for i in range(c - 1):
        # Leave at least min_size indices for each of the remaining parts.
        max_size = rf - pivot - (c - i - 1) * min_size
        t = random.randint(min_size, max_size)
        partition.append(arr[pivot: pivot + t])
        pivot += t
    # The last part is the remainder; without it only c - 1 parts exist.
    partition.append(arr[pivot:])
    assert len(partition) == c
    return partition
def generate_overlap_partition(rf, c):
    """Build a partition of the features whose parts may overlap.

    Starts from ``generate_partition(rf, c)``. For every part, up to half
    its size many trials are made; each succeeds with probability
    ``OVERLAP_PROBABILITY`` and then picks a random feature that belongs to
    another part. The picked features are appended only after all parts
    have been processed, so the choice for one part is not influenced by
    additions made to another.

    Args:
        rf: number of feature indices to distribute.
        c: number of parts.

    Returns:
        List of ``c`` lists of feature indices; a feature is listed at most
        once per part.
    """
    partition = generate_partition(rf, c)
    all_features = set(range(rf))
    additions = []
    for part in partition:
        # Features outside this part; empty when the part covers everything.
        others = sorted(all_features - set(part))
        extra = []
        # at most a half of the partition times of possibility of overlap
        for _ in range(len(part) // 2):
            if random.uniform(0, 1) < OVERLAP_PROBABILITY and others:
                candidate = random.choice(others)
                if candidate not in extra:
                    extra.append(candidate)
        additions.append(extra)

    for part, extra in zip(partition, additions):
        part.extend(extra)
    return partition
def produce_data_generator(rf, irf, c, type, name):
    """Configure a ``CubesGenerator`` for one data set.

    Args:
        rf: number of relevant features (those cubes may restrict).
        irf: number of irrelevant features (never restricted).
        c: number of cubes.
        type: ``'c'`` - every cube restricts all relevant features;
            ``'i'`` - cubes restrict disjoint feature subsets;
            ``'io'`` - cubes restrict possibly overlapping subsets.
        name: file name attached to the generator.

    Returns:
        The configured ``CubesGenerator``.

    Raises:
        ValueError: if ``type`` is not one of the values above.
    """
    total_f = rf + irf
    dg = CubesGenerator(total_f, RADIUS, name)
    # same number of records for each of the cubes + background
    cube_rows = int(ROWS / (c + 1))
    if type == 'c':
        partition = [range(rf) for _ in range(c)]
    elif type == 'i':
        partition = generate_partition(rf, c)
    elif type == 'io':
        partition = generate_overlap_partition(rf, c)
    else:
        raise ValueError("no such type!")

    for p in partition:
        # Lower edge drawn so that [edge, edge + CUBE_WIDTH] stays inside
        # [-RADIUS, RADIUS].
        location = {
            j: (random.uniform(0, 1) * (RADIUS * 2 - CUBE_WIDTH) - RADIUS, CUBE_WIDTH)
            for j in p
        }
        dg.add_cube_parameter(CubeParameters(cube_rows, location))

    # NOTE(review): the rows are split into c + 1 equal shares, so the last
    # share is assigned to an unrestricted (background) cube -- confirm.
    dg.add_cube_parameter(CubeParameters(cube_rows))
    return dg
def produce_all_data_generators():
    """Configure generators for the whole experiment grid.

    The grid covers 2-30 relevant features, 0-100 irrelevant features, 1-10
    cubes and the cube types ``'c'``, ``'i'`` and ``'io'``. Combinations
    with a single cube, or with fewer than two relevant features per cube,
    are produced for type ``'c'`` only.

    Side effects: sets the module-level ``basedir``, writes one cut file per
    data set into ``BASE + 'ideal_disc/'`` and the subspaces of all data
    sets into ``BASE + 'ideal_subspaces.json'``.

    Returns:
        List of the configured generators.
    """
    data_generators = []
    global basedir
    basedir = BASE + 'new_cubes/'
    perf_disc_dir = BASE + 'ideal_disc/'
    perf_subspaces_file = BASE + 'ideal_subspaces.json'
    perf_subspaces = dict()
    perf_discs = dict()

    # relevant features 2 - 30
    for rf in range(2, 31):
        # irrelevant features 0 - 100:
        for irf in range(101):
            # cubes 1 - 10
            for c in range(1, 11):
                # cube types complete, incomplete, incomplete overlapping
                for cube_type in ['c', 'i', 'io']:
                    # A single cube cannot be split over feature subsets.
                    if c == 1 and cube_type != 'c':
                        continue
                    # Incomplete cubes need at least two features each.
                    if rf / c < 2 and cube_type != 'c':
                        continue
                    name = 'cubes_{0:02d}_{1:03d}_{2:02d}_{3}.csv'.format(rf, irf, c, cube_type)
                    # To resume an interrupted run, skip names whose data
                    # file and cut file already exist.
                    dg = produce_data_generator(rf, irf, c, cube_type, name)
                    perf_discs[name] = dg.get_discs()
                    perf_subspaces[name] = dg.get_subspaces()
                    # NOTE(review): nothing was ever added to the returned
                    # list in the visible source; the generator itself is
                    # collected here -- confirm what callers expect.
                    data_generators.append(dg)

    # Make sure the output location exists before writing into it.
    os.makedirs(perf_disc_dir, exist_ok=True)
    for name, discs in perf_discs.items():
        write_cut_file(perf_disc_dir + 'cut_' + name.replace('csv', 'txt'), discs)
    with open(perf_subspaces_file, 'w') as psf:
        json.dump(perf_subspaces, psf)
    return data_generators
def write_cut_file(name, disc_intervals):
    """Write the cut points of every dimension to a text file.

    For each dimension a header line ``dimension <i> (<n> bins)`` is written,
    followed by one cut point per line formatted with one decimal place.

    Args:
        name: path of the file to (over)write.
        disc_intervals: one sequence of cut points per dimension.
    """
    with open(name, 'w') as out:
        for dim, break_points in enumerate(disc_intervals):
            out.write('dimension ' + str(dim) + ' (' + str(len(break_points)) + ' bins)\n')
            for break_point in break_points:
                out.write(format(break_point, '.1f') + '\n')
def store(data):
    """Write one generated data set as CSV below the module-level ``basedir``.

    Args:
        data: pair ``(values, file_name)``; ``values`` is written without
            header or index, ``;``-separated, floats with two decimals.
    """
    # basedir is only read here, so no ``global`` declaration is required;
    # it must have been set before this function is called.
    values, name = data[0], data[1]
    pd.DataFrame(values).to_csv(basedir + name, sep=';', header=False, index=False, float_format='%.2f')
# Script entry point: print one sample overlapping partition of 7 features
# into 3 parts.
if __name__ == '__main__':
    print(generate_overlap_partition(7, 3))