# data_generator.py

import numpy as np
import pandas as pd
import random
import util
import time
import os
import json
import constants as cst
import experiments_logging as l
import socket

# Half-width of the feature space: produce_data_generator passes it as the
# generator radius, so every feature ranges over [-RADIUS, RADIUS].
RADIUS = 2
# Width of every generated cube along each dimension that restricts it.
CUBE_WIDTH = 1
# Row budget per data set; split evenly between the cubes and the background.
ROWS = 6000
# Probability that a single overlap trial adds a foreign feature to a group.
OVERLAP_PROBABILITY = 0.6


class CubeParameters:
    """Plain description of one cube of points to be generated.

    Attributes:
        rows: number of points to draw for this cube.
        loc: mapping ``feature index -> (lower bound, width)`` restricting
            the cube in those features, or ``None`` when no location is given.
        subspaces: starts as an empty list; nothing in this class fills it.
    """

    def __init__(self, rows, loc=None):
        self.rows, self.loc = rows, loc
        self.subspaces = []


class CubesGenerator:
    """Generate labelled synthetic data made of axis-aligned cubes.

    Every feature takes values in ``[-radius, radius]``.  The first
    ``rel_feature_count`` features are the relevant ones: a cube may be
    confined to a sub-interval of them.  The remaining features are always
    drawn uniformly over the whole range.
    """

    def __init__(self, rel_feature_count, irr_feature_count, radius, file_name):
        """Set up an empty generator.

        :param rel_feature_count: number of relevant features (indices
            ``0 .. rel_feature_count - 1``)
        :param irr_feature_count: number of additional irrelevant features
        :param radius: half-width of the value range of every feature
        :param file_name: name returned together with the data by ``build``
        """
        self.rel_feature_count = rel_feature_count
        self.file_name = file_name
        self.cube_parameters = []
        self.feature_count = rel_feature_count + irr_feature_count
        self.dim_borders = [[-radius, radius] for _ in range(self.feature_count)]
        self.subspaces = []
        # One set of cut points per relevant feature; the upper border of
        # the range is always part of it.
        self.perf_disc = [{upper} for _, upper in self.dim_borders[:rel_feature_count]]

    def __repr__(self):
        return 'CubesGenerator(file_name={}, rel_feature_count={}, feature_count={})'.format(
            self.file_name, self.rel_feature_count, self.feature_count)

    def add_cube_parameter(self, cube_param):
        """Register a cube and update the subspace and cut-point bookkeeping.

        A ``loc`` of ``None`` is replaced by an empty dict on ``cube_param``
        itself, because ``build`` later reads ``cube_param.loc`` as a mapping.

        :param cube_param: object with ``rows`` and ``loc`` attributes, where
            ``loc`` maps a feature index to ``(lower bound, width)``
        """
        if cube_param.loc is None:
            cube_param.loc = {}
        self.cube_parameters.append(cube_param)
        location_params = cube_param.loc

        subspace = list(location_params)
        if subspace and subspace not in self.subspaces:
            self.subspaces.append(subspace)

        # Perfect discretization: both ends of every restricted interval are
        # cut points, except an end sitting on the lower border of the range.
        for feat in range(self.rel_feature_count):
            if feat not in location_params:
                continue
            dim_params = location_params[feat]
            lower, width = dim_params[0], dim_params[1]
            # Compare with this generator's own border rather than the module
            # constant, so generators built with another radius stay correct.
            if lower != self.dim_borders[feat][0]:
                self.perf_disc[feat].add(lower)
            self.perf_disc[feat].add(lower + width)

    def build(self):
        """Draw the points of all registered cubes.

        Cubes without any location get class label 0; every other cube gets
        ``1 +`` the number of cubes processed before it.

        :return: tuple ``(data, file_name)``; ``data`` has one row per point,
            one column per feature and the class label as last column
        :raises ValueError: if a cube does not fit into the feature range
        """
        cubes = []
        for cube_parameter in self.cube_parameters:
            location_params = cube_parameter.loc
            points_count = cube_parameter.rows
            label = len(cubes) + 1 if location_params else 0

            cube = []
            for feat in range(self.feature_count):
                lower_border, upper_border = self.dim_borders[feat]
                if feat in location_params:
                    assert feat < self.rel_feature_count
                    dim_params = location_params[feat]
                    start, width = dim_params[0], dim_params[1]
                    if start < lower_border or start + width > upper_border:
                        raise ValueError(
                            "The cube with params " + str(location_params)
                            + " does not fit in dim " + str(feat) + "!")
                    # Uniform over [start, start + width).
                    column = np.random.uniform(0, width, points_count) + start
                else:
                    column = np.random.uniform(lower_border, upper_border, points_count)
                cube.append(column)
            cube.append(np.full(points_count, label, dtype=float))
            cubes.append(cube)

        # Each cube is (columns x points); join along the points axis, then
        # flip so that rows are points.
        generated_data = np.concatenate([np.array(cube) for cube in cubes], axis=1).transpose()
        return generated_data, self.file_name

    def get_subspaces(self):
        """Return the distinct non-empty feature subsets used by the cubes."""
        return self.subspaces

    def get_discs(self):
        """Return the sorted cut points of every relevant feature."""
        return [sorted(cuts) for cuts in self.perf_disc]


def generate_partition(rf, c):
    """Randomly split the features ``0 .. rf - 1`` into ``c`` disjoint groups.

    The feature indices are shuffled and cut into ``c`` consecutive slices.
    Each of the first ``c - 1`` slices gets a random size of at least two,
    bounded so that two features remain for every later slice; the last
    slice takes the rest.

    :param rf: number of features to distribute
    :param c: number of groups
    :return: list of ``c`` lists of feature indices
    :raises ValueError: if ``c > 1`` and there are fewer than two features
        per group
    """
    min_size = 2
    # Fail early with a readable message instead of letting randint complain
    # about an empty range.
    if c > 1 and rf < min_size * c:
        raise ValueError(
            "cannot split " + str(rf) + " features into " + str(c)
            + " groups of at least " + str(min_size) + " features each")

    features = list(range(rf))
    random.shuffle(features)
    pivot = 0
    partition = []
    for i in range(c - 1):
        # Leave min_size features for each of the groups still to come.
        max_size = rf - pivot - (c - i - 1) * min_size
        size = random.randint(min_size, max_size)
        partition.append(features[pivot:pivot + size])
        pivot += size
    partition.append(features[pivot:])
    assert len(partition) == c
    return partition


def generate_overlap_partition(rf, c):
    """Build a random partition of the features and let the groups overlap.

    Starting from ``generate_partition(rf, c)``, every group runs
    ``len(group) // 2`` trials; each trial succeeds with probability
    ``OVERLAP_PROBABILITY`` and then appends one randomly chosen feature that
    is not in the original group.  The same feature can be drawn more than
    once, so a group may contain repeated indices.

    :param rf: number of features to distribute
    :param c: number of groups
    :return: list of ``c`` lists of feature indices
    """
    partition = generate_partition(rf, c)
    all_features = set(range(rf))

    # Collect all additions first so that the candidates of every group are
    # computed against the original, non-overlapping partition.
    additions = []
    for group in partition:
        others = list(all_features - set(group))
        extra = []
        # A group that already holds every feature has nothing to borrow;
        # drawing from an empty candidate list would raise.
        if others:
            for _ in range(len(group) // 2):
                if random.uniform(0, 1) < OVERLAP_PROBABILITY:
                    extra.append(others[random.randint(0, len(others) - 1)])
        additions.append(extra)

    for group, extra in zip(partition, additions):
        group.extend(extra)
    return partition


def produce_data_generator(rf, irf, c, type, file_name):
    """Create a ``CubesGenerator`` with ``c`` random cubes plus background.

    :param rf: number of relevant features
    :param irf: number of irrelevant features
    :param c: number of cubes
    :param type: how the cubes use the relevant features: ``'c'`` (every cube
        is restricted in all of them), ``'i'`` (disjoint feature groups) or
        ``'io'`` (feature groups that may overlap)
    :param file_name: file name handed on to the generator
    :return: the configured ``CubesGenerator``
    :raises ValueError: if ``type`` is not one of the values above
    """
    dg = CubesGenerator(rf, irf, RADIUS, file_name)
    # same number of records for each of the cubes + background
    cube_rows = int(ROWS / (c + 1))
    if type == 'c':
        partition = [range(rf) for _ in range(c)]
    elif type == 'i':
        partition = generate_partition(rf, c)
    elif type == 'io':
        partition = generate_overlap_partition(rf, c)
    else:
        raise ValueError("no such type!")

    # Lower bounds are drawn from [-RADIUS, RADIUS - CUBE_WIDTH] so that a
    # cube of width CUBE_WIDTH always fits into the feature range.
    span = RADIUS * 2 - CUBE_WIDTH
    for features in partition:
        location = {
            feat: (random.uniform(0, 1) * span - RADIUS, CUBE_WIDTH)
            for feat in features
        }
        dg.add_cube_parameter(CubeParameters(cube_rows, location))
    # A cube without location: points spread over the whole space.
    dg.add_cube_parameter(CubeParameters(cube_rows))
    return dg


def produce_all_data_generators():
    """Create all data generators and write their ground-truth files.

    Side effects: sets the module-level ``basedir`` to ``cst.DATA_DIR``,
    creates the data and perfect-discretization directories if missing,
    writes one ``cut_<name>.txt`` file per generator and dumps all subspaces
    as JSON to ``cst.PERFECT_SUBSPACES_JSON``.

    :return: list of the created ``CubesGenerator`` objects
    """
    global basedir
    basedir = cst.DATA_DIR
    perf_disc_dir = cst.PERFECT_DISCRETIZATIONS_DIR
    # makedirs also creates missing parents and does not race with a separate
    # existence check.
    for directory in (basedir, perf_disc_dir):
        os.makedirs(directory, exist_ok=True)
    perf_subspaces_file = cst.PERFECT_SUBSPACES_JSON

    data_generators = []
    perf_subspaces = {}
    perf_discs = {}

    def produce_dg(name, rf, c, type):
        # NOTE: existing output files are not skipped; everything is
        # regenerated on each run.
        dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name + ".csv")
        perf_discs[name] = dg.get_discs()
        perf_subspaces[name] = dg.get_subspaces()
        data_generators.append(dg)

    # presumably invokes produce_dg once per configured data set -- verify
    # against util.collect_params
    util.collect_params(produce_dg)

    for name, discs in perf_discs.items():
        write_cut_file(perf_disc_dir + 'cut_' + name + ".txt", discs)
    with open(perf_subspaces_file, 'w') as psf:
        json.dump(perf_subspaces, psf)
    return data_generators


def write_cut_file(name, disc_intervals):
    """Write the cut points of every dimension to a text file.

    For each dimension the file gets a header line with the dimension index
    and the number of cut points, one line per cut point (one decimal place)
    and a closing line of dashes.

    :param name: path of the file to (over)write
    :param disc_intervals: one sequence of cut points per dimension
    """
    separator = '-------------------------------------\n'
    with open(name, 'w') as out:
        for dim, cut_points in enumerate(disc_intervals):
            out.write('dimension {} ({} bins)\n'.format(dim, len(cut_points)))
            out.writelines(format(point, '.1f') + '\n' for point in cut_points)
            out.write(separator)


def store(data):
    """Write one generated data set as a CSV file below ``basedir``.

    The file has no header and no index column, uses ``;`` as separator and
    formats floats with two decimals.  The module-level ``basedir`` must have
    been set before (``produce_all_data_generators`` does that).

    :param data: pair ``(matrix, file_name)`` as returned by
        ``CubesGenerator.build``
    """
    matrix, name = data[0], data[1]
    frame = pd.DataFrame(matrix)
    frame.to_csv(basedir + name, sep=';', header=False, index=False, float_format='%.2f')


if __name__ == '__main__':
    # Build every configured data set and write each one to its CSV file.
    for generator in produce_all_data_generators():
        store(generator.build())
	import numpy as np
	import pandas as pd
	import random
	import util
	import time
	import os
	import json
	import constants as cst
	import experiments_logging as l
	import socket

	# Half-width of the feature space: produce_data_generator passes it as the
	# generator radius, so every feature ranges over [-RADIUS, RADIUS].
	RADIUS = 2
	# Width of every generated cube along each dimension that restricts it.
	CUBE_WIDTH = 1
	# Row budget per data set; split evenly between the cubes and the background.
	ROWS = 6000
	# Probability that a single overlap trial adds a foreign feature to a group.
	OVERLAP_PROBABILITY = 0.6


	class CubeParameters:
	    """Plain description of one cube of points to be generated.

	    Attributes:
	        rows: number of points to draw for this cube.
	        loc: mapping ``feature index -> (lower bound, width)`` restricting
	            the cube in those features, or ``None`` when no location is given.
	        subspaces: starts as an empty list; nothing in this class fills it.
	    """

	    def __init__(self, rows, loc=None):
	        self.rows, self.loc = rows, loc
	        self.subspaces = []


	class CubesGenerator:
	    """Generate labelled synthetic data made of axis-aligned cubes.

	    Every feature takes values in ``[-radius, radius]``.  The first
	    ``rel_feature_count`` features are the relevant ones: a cube may be
	    confined to a sub-interval of them.  The remaining features are always
	    drawn uniformly over the whole range.
	    """

	    def __init__(self, rel_feature_count, irr_feature_count, radius, file_name):
	        """Set up an empty generator.

	        :param rel_feature_count: number of relevant features (indices
	            ``0 .. rel_feature_count - 1``)
	        :param irr_feature_count: number of additional irrelevant features
	        :param radius: half-width of the value range of every feature
	        :param file_name: name returned together with the data by ``build``
	        """
	        self.rel_feature_count = rel_feature_count
	        self.file_name = file_name
	        self.cube_parameters = []
	        self.feature_count = rel_feature_count + irr_feature_count
	        self.dim_borders = [[-radius, radius] for _ in range(self.feature_count)]
	        self.subspaces = []
	        # One set of cut points per relevant feature; the upper border of
	        # the range is always part of it.
	        self.perf_disc = [{upper} for _, upper in self.dim_borders[:rel_feature_count]]

	    def __repr__(self):
	        return 'CubesGenerator(file_name={}, rel_feature_count={}, feature_count={})'.format(
	            self.file_name, self.rel_feature_count, self.feature_count)

	    def add_cube_parameter(self, cube_param):
	        """Register a cube and update the subspace and cut-point bookkeeping.

	        A ``loc`` of ``None`` is replaced by an empty dict on ``cube_param``
	        itself, because ``build`` later reads ``cube_param.loc`` as a mapping.

	        :param cube_param: object with ``rows`` and ``loc`` attributes, where
	            ``loc`` maps a feature index to ``(lower bound, width)``
	        """
	        if cube_param.loc is None:
	            cube_param.loc = {}
	        self.cube_parameters.append(cube_param)
	        location_params = cube_param.loc

	        subspace = list(location_params)
	        if subspace and subspace not in self.subspaces:
	            self.subspaces.append(subspace)

	        # Perfect discretization: both ends of every restricted interval are
	        # cut points, except an end sitting on the lower border of the range.
	        for feat in range(self.rel_feature_count):
	            if feat not in location_params:
	                continue
	            dim_params = location_params[feat]
	            lower, width = dim_params[0], dim_params[1]
	            # Compare with this generator's own border rather than the module
	            # constant, so generators built with another radius stay correct.
	            if lower != self.dim_borders[feat][0]:
	                self.perf_disc[feat].add(lower)
	            self.perf_disc[feat].add(lower + width)

	    def build(self):
	        """Draw the points of all registered cubes.

	        Cubes without any location get class label 0; every other cube gets
	        ``1 +`` the number of cubes processed before it.

	        :return: tuple ``(data, file_name)``; ``data`` has one row per point,
	            one column per feature and the class label as last column
	        :raises ValueError: if a cube does not fit into the feature range
	        """
	        cubes = []
	        for cube_parameter in self.cube_parameters:
	            location_params = cube_parameter.loc
	            points_count = cube_parameter.rows
	            label = len(cubes) + 1 if location_params else 0

	            cube = []
	            for feat in range(self.feature_count):
	                lower_border, upper_border = self.dim_borders[feat]
	                if feat in location_params:
	                    assert feat < self.rel_feature_count
	                    dim_params = location_params[feat]
	                    start, width = dim_params[0], dim_params[1]
	                    if start < lower_border or start + width > upper_border:
	                        raise ValueError(
	                            "The cube with params " + str(location_params)
	                            + " does not fit in dim " + str(feat) + "!")
	                    # Uniform over [start, start + width).
	                    column = np.random.uniform(0, width, points_count) + start
	                else:
	                    column = np.random.uniform(lower_border, upper_border, points_count)
	                cube.append(column)
	            cube.append(np.full(points_count, label, dtype=float))
	            cubes.append(cube)

	        # Each cube is (columns x points); join along the points axis, then
	        # flip so that rows are points.
	        generated_data = np.concatenate([np.array(cube) for cube in cubes], axis=1).transpose()
	        return generated_data, self.file_name

	    def get_subspaces(self):
	        """Return the distinct non-empty feature subsets used by the cubes."""
	        return self.subspaces

	    def get_discs(self):
	        """Return the sorted cut points of every relevant feature."""
	        return [sorted(cuts) for cuts in self.perf_disc]


	def generate_partition(rf, c):
	    """Randomly split the features ``0 .. rf - 1`` into ``c`` disjoint groups.

	    The feature indices are shuffled and cut into ``c`` consecutive slices.
	    Each of the first ``c - 1`` slices gets a random size of at least two,
	    bounded so that two features remain for every later slice; the last
	    slice takes the rest.

	    :param rf: number of features to distribute
	    :param c: number of groups
	    :return: list of ``c`` lists of feature indices
	    :raises ValueError: if ``c > 1`` and there are fewer than two features
	        per group
	    """
	    min_size = 2
	    # Fail early with a readable message instead of letting randint complain
	    # about an empty range.
	    if c > 1 and rf < min_size * c:
	        raise ValueError(
	            "cannot split " + str(rf) + " features into " + str(c)
	            + " groups of at least " + str(min_size) + " features each")

	    features = list(range(rf))
	    random.shuffle(features)
	    pivot = 0
	    partition = []
	    for i in range(c - 1):
	        # Leave min_size features for each of the groups still to come.
	        max_size = rf - pivot - (c - i - 1) * min_size
	        size = random.randint(min_size, max_size)
	        partition.append(features[pivot:pivot + size])
	        pivot += size
	    partition.append(features[pivot:])
	    assert len(partition) == c
	    return partition


	def generate_overlap_partition(rf, c):
	    """Build a random partition of the features and let the groups overlap.

	    Starting from ``generate_partition(rf, c)``, every group runs
	    ``len(group) // 2`` trials; each trial succeeds with probability
	    ``OVERLAP_PROBABILITY`` and then appends one randomly chosen feature that
	    is not in the original group.  The same feature can be drawn more than
	    once, so a group may contain repeated indices.

	    :param rf: number of features to distribute
	    :param c: number of groups
	    :return: list of ``c`` lists of feature indices
	    """
	    partition = generate_partition(rf, c)
	    all_features = set(range(rf))

	    # Collect all additions first so that the candidates of every group are
	    # computed against the original, non-overlapping partition.
	    additions = []
	    for group in partition:
	        others = list(all_features - set(group))
	        extra = []
	        # A group that already holds every feature has nothing to borrow;
	        # drawing from an empty candidate list would raise.
	        if others:
	            for _ in range(len(group) // 2):
	                if random.uniform(0, 1) < OVERLAP_PROBABILITY:
	                    extra.append(others[random.randint(0, len(others) - 1)])
	        additions.append(extra)

	    for group, extra in zip(partition, additions):
	        group.extend(extra)
	    return partition


	def produce_data_generator(rf, irf, c, type, file_name):
	    """Create a ``CubesGenerator`` with ``c`` random cubes plus background.

	    :param rf: number of relevant features
	    :param irf: number of irrelevant features
	    :param c: number of cubes
	    :param type: how the cubes use the relevant features: ``'c'`` (every cube
	        is restricted in all of them), ``'i'`` (disjoint feature groups) or
	        ``'io'`` (feature groups that may overlap)
	    :param file_name: file name handed on to the generator
	    :return: the configured ``CubesGenerator``
	    :raises ValueError: if ``type`` is not one of the values above
	    """
	    dg = CubesGenerator(rf, irf, RADIUS, file_name)
	    # same number of records for each of the cubes + background
	    cube_rows = int(ROWS / (c + 1))
	    if type == 'c':
	        partition = [range(rf) for _ in range(c)]
	    elif type == 'i':
	        partition = generate_partition(rf, c)
	    elif type == 'io':
	        partition = generate_overlap_partition(rf, c)
	    else:
	        raise ValueError("no such type!")

	    # Lower bounds are drawn from [-RADIUS, RADIUS - CUBE_WIDTH] so that a
	    # cube of width CUBE_WIDTH always fits into the feature range.
	    span = RADIUS * 2 - CUBE_WIDTH
	    for features in partition:
	        location = {
	            feat: (random.uniform(0, 1) * span - RADIUS, CUBE_WIDTH)
	            for feat in features
	        }
	        dg.add_cube_parameter(CubeParameters(cube_rows, location))
	    # A cube without location: points spread over the whole space.
	    dg.add_cube_parameter(CubeParameters(cube_rows))
	    return dg


	def produce_all_data_generators():
	    """Create all data generators and write their ground-truth files.

	    Side effects: sets the module-level ``basedir`` to ``cst.DATA_DIR``,
	    creates the data and perfect-discretization directories if missing,
	    writes one ``cut_<name>.txt`` file per generator and dumps all subspaces
	    as JSON to ``cst.PERFECT_SUBSPACES_JSON``.

	    :return: list of the created ``CubesGenerator`` objects
	    """
	    global basedir
	    basedir = cst.DATA_DIR
	    perf_disc_dir = cst.PERFECT_DISCRETIZATIONS_DIR
	    # makedirs also creates missing parents and does not race with a separate
	    # existence check.
	    for directory in (basedir, perf_disc_dir):
	        os.makedirs(directory, exist_ok=True)
	    perf_subspaces_file = cst.PERFECT_SUBSPACES_JSON

	    data_generators = []
	    perf_subspaces = {}
	    perf_discs = {}

	    def produce_dg(name, rf, c, type):
	        # NOTE: existing output files are not skipped; everything is
	        # regenerated on each run.
	        dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name + ".csv")
	        perf_discs[name] = dg.get_discs()
	        perf_subspaces[name] = dg.get_subspaces()
	        data_generators.append(dg)

	    # presumably invokes produce_dg once per configured data set -- verify
	    # against util.collect_params
	    util.collect_params(produce_dg)

	    for name, discs in perf_discs.items():
	        write_cut_file(perf_disc_dir + 'cut_' + name + ".txt", discs)
	    with open(perf_subspaces_file, 'w') as psf:
	        json.dump(perf_subspaces, psf)
	    return data_generators


	def write_cut_file(name, disc_intervals):
	    """Write the cut points of every dimension to a text file.

	    For each dimension the file gets a header line with the dimension index
	    and the number of cut points, one line per cut point (one decimal place)
	    and a closing line of dashes.

	    :param name: path of the file to (over)write
	    :param disc_intervals: one sequence of cut points per dimension
	    """
	    separator = '-------------------------------------\n'
	    with open(name, 'w') as out:
	        for dim, cut_points in enumerate(disc_intervals):
	            out.write('dimension {} ({} bins)\n'.format(dim, len(cut_points)))
	            out.writelines(format(point, '.1f') + '\n' for point in cut_points)
	            out.write(separator)


	def store(data):
	    """Write one generated data set as a CSV file below ``basedir``.

	    The file has no header and no index column, uses ``;`` as separator and
	    formats floats with two decimals.  The module-level ``basedir`` must have
	    been set before (``produce_all_data_generators`` does that).

	    :param data: pair ``(matrix, file_name)`` as returned by
	        ``CubesGenerator.build``
	    """
	    matrix, name = data[0], data[1]
	    frame = pd.DataFrame(matrix)
	    frame.to_csv(basedir + name, sep=';', header=False, index=False, float_format='%.2f')


	if __name__ == '__main__':
	    # Build every configured data set and write each one to its CSV file.
	    for generator in produce_all_data_generators():
	        store(generator.build())