# data_generator.py

from abc import abstractmethod

import numpy as np
import pandas as pd
import random
import util
import time
import os
import json
import constants as cst
import experiments_logging as l
import socket

# Half-width of every feature's value range: features span [-RADIUS, RADIUS].
RADIUS = 2
# Edge length of a generated cube in each dimension it is restricted in.
CUBE_WIDTH = 1
# Row budget of one generated data set.
ROWS = 6000
# Chance, per attempt, that generate_overlap_partition adds a foreign feature to a group.
OVERLAP_PROBABILITY = 0.6


class CubeParameters:
    """Describe one cube of points to generate.

    rows is the number of points to draw for the cube.  loc maps a feature
    index to a (start, width) pair; None stands for a cube that is not
    restricted in any dimension.
    """

    def __init__(self, rows, loc=None):
        self.subspaces = []
        self.loc = loc
        self.rows = rows


class DataGenerator:
    """Shared state and interface of the synthetic data generators.

    Every feature ranges over [-radius, radius].  The first
    rel_feature_count features are the relevant ones; the irrelevant
    features follow them.
    """

    def __init__(self, file_name, rel_feature_count, irr_feature_count, radius):
        self.file_name = file_name
        self.radius = radius
        self.irf = irr_feature_count
        self.feature_count = rel_feature_count + irr_feature_count
        # One [lower, upper] border pair per feature.
        self.dim_borders = [[-radius, radius] for _ in range(self.feature_count)]
        self.subspaces = []
        # Cut points of every relevant feature, seeded with its upper border.
        self.perf_disc = [{upper} for _lower, upper in self.dim_borders[:rel_feature_count]]

    def __repr__(self):
        return '(file_name={!s})'.format(self.file_name)

    @abstractmethod
    def build(self):
        """Generate the data set; provided by the subclasses."""
        ...

    @abstractmethod
    def get_discs(self):
        """Return the cut points of the relevant features; provided by the subclasses."""
        ...

    @abstractmethod
    def get_subspaces(self):
        """Return the groups of interacting features; provided by the subclasses."""
        ...


class XorGenerator(DataGenerator):
    """Generate a data set whose relevant features interact like an XOR.

    The first rf - 1 relevant features are drawn uniformly from
    [-radius, radius].  The last relevant feature is drawn from [0, radius]
    and gets a negative sign when an odd number of the other relevant
    features is positive.  The class label encodes the sign pattern of the
    first rf - 1 features.
    """

    def __init__(self, rf, irf, radius, rows, sigma, file_name):
        super().__init__(file_name, rf, irf, radius)
        self.rows = rows
        # Relevant features that are drawn freely; the remaining one is the parity feature.
        self.slave_features = rf - 1
        self.sigma = sigma
        self.perf_disc = [[0, radius] for _ in self.dim_borders[:rf]]
        self.subspaces = [list(range(rf))]

    def get_discs(self):
        """Return the cut points [0, radius] of every relevant feature."""
        return self.perf_disc

    def get_subspaces(self):
        """Return the single subspace made of all relevant features."""
        return self.subspaces

    def build(self):
        """Draw the data set.

        Returns:
            A tuple (data, file_name).  data has one row per point and the
            columns: free relevant features, parity feature, irrelevant
            features, class label.  Gaussian noise with standard deviation
            sigma is added to every column but the label when sigma is
            non-zero.
        """
        # A zero-column draw is valid, so rf == 1 needs no special case.  The
        # former special case gave the parity column zero width, which made
        # the noise addition fail on a shape mismatch.
        free_dims = np.random.uniform(-self.radius, self.radius, (self.rows, self.slave_features))
        positive = free_dims > 0

        # +1 for an even number of positive free features, -1 for an odd number.
        sign = -(np.sum(positive, axis=1) % 2 * 2 - 1).reshape(self.rows, 1)
        parity_dim = sign * np.random.uniform(0, self.radius, (self.rows, 1))

        irr_dims = np.random.uniform(-self.radius, self.radius, (self.rows, self.irf))

        # The label is 1 + the sign pattern read as a binary number, first
        # feature most significant.  This equals numbering all patterns in
        # lexicographic order starting at 1.
        weights = 2 ** np.arange(self.slave_features - 1, -1, -1)
        class_labels = (np.dot(positive.astype(int), weights) + 1).reshape(self.rows, 1)

        data = np.concatenate((free_dims, parity_dim, irr_dims, class_labels), axis=1)
        if self.sigma:
            noise = np.random.normal(0, self.sigma, (self.rows, self.slave_features + self.irf + 1))
            # The label column stays free of noise.
            data = data + np.concatenate((noise, np.zeros((self.rows, 1))), axis=1)
        return data, self.file_name

class CubesGenerator(DataGenerator):
    """Generate labelled cubes of points inside the feature space.

    Every registered CubeParameters contributes its rows points.  A feature
    listed in the cube's loc as (start, width) is drawn uniformly from
    [start, start + width]; every other feature is drawn uniformly over its
    full range.
    """

    def __init__(self, rel_feature_count, irr_feature_count, radius, file_name):
        super().__init__(file_name, rel_feature_count, irr_feature_count, radius)
        self.rel_feature_count = rel_feature_count
        self.cube_parameters = []

    def add_cube_parameter(self, cube_param):
        """Register a cube and record its subspace and cut points.

        A loc of None is replaced by an empty dict on cube_param itself.
        """
        if cube_param.loc is None:
            cube_param.loc = {}
        self.cube_parameters.append(cube_param)
        location_params = cube_param.loc
        subspace = list(location_params)
        if subspace and subspace not in self.subspaces:
            self.subspaces.append(subspace)

        # perfect discretization
        for feat in range(self.rel_feature_count):
            if feat not in location_params:
                continue
            dim_params = location_params[feat]
            # The lower border of the dimension is not a cut point.  Compare
            # against this generator's own border, not the module default.
            if dim_params[0] != self.dim_borders[feat][0]:
                self.perf_disc[feat].add(dim_params[0])
            self.perf_disc[feat].add(dim_params[0] + dim_params[1])

    def build(self):
        """Draw the points of all registered cubes.

        Returns:
            A tuple (data, file_name).  data has one row per point, one
            column per feature and a final label column.  The label is 0 for
            a cube without location and the cube's 1-based registration
            position otherwise.  Without registered cubes data has no rows.

        Raises:
            ValueError: if a cube does not fit into the range of a feature.
        """
        if not self.cube_parameters:
            return np.empty((0, self.feature_count + 1)), self.file_name

        cubes = []
        for cube_parameter in self.cube_parameters:
            location_params = cube_parameter.loc
            points_count = cube_parameter.rows
            label = len(cubes) + 1 if len(location_params) != 0 else 0

            columns = []
            for feat in range(self.feature_count):
                lower, upper = self.dim_borders[feat]
                if feat in location_params:
                    assert feat < self.rel_feature_count
                    dim_params = location_params[feat]
                    if dim_params[0] < lower or dim_params[0] + dim_params[1] > upper:
                        raise ValueError(
                            "The cube with params " + str(location_params) + " does not fit in dim " + str(
                                feat) + "!")
                    column = np.random.uniform(0, dim_params[1], points_count) + dim_params[0]
                else:
                    column = np.random.uniform(lower, upper, points_count)
                columns.append(column)
            columns.append(np.full(points_count, label, dtype=float))
            cubes.append(columns)

        # Each cube is (columns x points); join the points of all cubes, then
        # turn the result into (points x columns).
        generated_data = np.concatenate([np.array(cube) for cube in cubes], axis=1).transpose()
        return generated_data, self.file_name

    def get_subspaces(self):
        """Return the distinct feature groups of the registered cubes."""
        return self.subspaces

    def get_discs(self):
        """Return the sorted cut points of every relevant feature."""
        return [sorted(p) for p in self.perf_disc]


def generate_partition(rf, c):
    """Randomly split the features 0 .. rf - 1 into c disjoint groups.

    With more than one group every group holds at least two features.  A
    single group simply holds all features.

    Args:
        rf: number of features to distribute.
        c: number of groups.

    Returns:
        A list of c lists of feature indices that together contain every
        feature exactly once.

    Raises:
        ValueError: if c is smaller than 1, or if rf is too small to give
            each of several groups two features.
    """
    min_size = 2
    if c < 1:
        raise ValueError("the number of groups must be at least 1, got " + str(c))
    if c > 1 and rf < c * min_size:
        raise ValueError(
            "cannot split " + str(rf) + " features into " + str(c)
            + " groups of at least " + str(min_size))

    features = list(range(rf))
    random.shuffle(features)
    pivot = 0
    partition = []
    for i in range(c - 1):
        # Leave at least min_size features for each of the groups still to come.
        max_size = rf - pivot - (c - i - 1) * min_size
        size = random.randint(min_size, max_size)
        partition.append(features[pivot: pivot + size])
        pivot += size
    partition.append(features[pivot:])
    return partition


def generate_overlap_partition(rf, c):
    """Build a partition as generate_partition does and let its groups overlap.

    For every group up to half its size attempts are made; each attempt
    succeeds with probability OVERLAP_PROBABILITY and then appends one
    randomly chosen feature of the other groups.  The same feature may be
    chosen more than once.

    Args:
        rf: number of features to distribute.
        c: number of groups.

    Returns:
        The list of c groups, each extended by its additions.
    """
    partition = generate_partition(rf, c)
    all_features = set(range(rf))
    additions = []
    for part in partition:
        extra = []
        others = list(all_features - set(part))
        # A group that already holds every feature has nothing to borrow;
        # drawing from the empty list used to crash in randint.
        if others:
            # at most a half of the partition times of possibility of overlap
            for _ in range(len(part) // 2):
                if random.uniform(0, 1) < OVERLAP_PROBABILITY:
                    extra.append(others[random.randint(0, len(others) - 1)])
        additions.append(extra)

    # Extend the groups only now, so the additions of one group never
    # influence the candidates of another.
    for part, extra in zip(partition, additions):
        part.extend(extra)
    return partition


def produce_cube_generator(rf, irf, interactions, type, cubes, file_name):
    """Create a CubesGenerator with randomly placed cubes plus a background.

    Args:
        rf: number of relevant features.
        irf: number of irrelevant features.
        interactions: number of feature groups.
        type: 'c' to let every group span all relevant features, 'i' for
            disjoint groups, 'io' for overlapping groups.
        cubes: number of cubes per feature group.
        file_name: file name stored on the generator.

    Returns:
        The configured CubesGenerator.

    Raises:
        ValueError: if type is none of 'c', 'i' and 'io'.
    """
    dg = CubesGenerator(rf, irf, RADIUS, file_name)
    # same number of records for each of the interactions * cubes + background
    cube_rows = ROWS // (interactions * cubes + 1)
    if type == 'c':
        partition = [range(rf) for _ in range(interactions)]
    elif type == 'i':
        partition = generate_partition(rf, interactions)
    elif type == 'io':
        partition = generate_overlap_partition(rf, interactions)
    else:
        raise ValueError("no such type!")

    # A cube of width CUBE_WIDTH fits into [-RADIUS, RADIUS] when it starts
    # within the first 2 * RADIUS - CUBE_WIDTH units of the range.
    start_span = RADIUS * 2 - CUBE_WIDTH
    for features in partition:
        for _ in range(cubes):
            location = dict()
            for feat in features:
                location[feat] = (random.uniform(0, 1) * start_span - RADIUS, CUBE_WIDTH)
            dg.add_cube_parameter(CubeParameters(cube_rows, location))
    # The background: points spread over the whole feature space.
    dg.add_cube_parameter(CubeParameters(cube_rows))
    return dg


def produce_xor_generator(rf, irf, file_name):
    """Create an XorGenerator using RADIUS, ROWS and a noise sigma of 0.1."""
    noise_sigma = 0.1
    return XorGenerator(rf=rf, irf=irf, radius=RADIUS, rows=ROWS,
                        sigma=noise_sigma, file_name=file_name)


def produce_all_data_generators():
    """Create the generators of all data sets that do not exist yet.

    Sets the module-level basedir to cst.DATA_DIR, makes sure the data and
    cut-file directories exist, writes one cut file per new data set and
    merges the new subspaces into the subspaces JSON file.

    Returns:
        The list of generators whose data still has to be built and stored.
    """
    data_generators = []
    global basedir
    basedir = cst.DATA_DIR
    perf_disc_dir = cst.PERFECT_DISCRETIZATIONS_DIR
    # makedirs tolerates an existing directory and creates missing parents.
    for directory in (basedir, perf_disc_dir):
        os.makedirs(directory, exist_ok=True)

    perf_subspaces = dict()
    perf_discs = dict()

    def produce_dg(name, interaction_type, rf, i, type, cubes):
        # Handed to util.collect_params below, presumably once per parameter
        # combination -- verify against util.
        # Nothing to do when both the data and its cut file are present.
        if os.path.exists(basedir + name + ".csv") and os.path.exists(perf_disc_dir + 'cut_' + name + ".txt"):
            return

        if interaction_type == cst.InteractionType.CUBES:
            dg = produce_cube_generator(rf, cst.IRRELEVANT_FEATURES, i, type, cubes, name + ".csv")
        elif interaction_type == cst.InteractionType.XOR:
            dg = produce_xor_generator(rf, cst.IRRELEVANT_FEATURES, name + ".csv")
        else:
            raise ValueError("no implementation of data generator for " + str(interaction_type.name))
        perf_discs[name] = dg.get_discs()
        perf_subspaces[name] = dg.get_subspaces()
        data_generators.append(dg)

    util.collect_params(produce_dg)
    for name, discs in perf_discs.items():
        write_cut_file(perf_disc_dir + 'cut_' + name + ".txt", discs)

    write_perf_subspaces(perf_subspaces)
    return data_generators


def write_perf_subspaces(perf_subspaces):
    """Merge perf_subspaces into the JSON file cst.PERFECT_SUBSPACES_JSON.

    Entries already stored in the file are kept unless perf_subspaces holds
    an entry of the same name.  perf_subspaces itself is left unchanged.

    Args:
        perf_subspaces: dict mapping a data set name to its subspaces.
    """
    all_perf_subspaces = {}
    if os.path.exists(cst.PERFECT_SUBSPACES_JSON):
        with open(cst.PERFECT_SUBSPACES_JSON, 'r') as psf:
            all_perf_subspaces.update(json.load(psf))
    # Apply the new entries last: a regenerated data set must not keep the
    # subspaces of the data it replaces.
    all_perf_subspaces.update(perf_subspaces)

    with open(cst.PERFECT_SUBSPACES_JSON, 'w') as psf:
        json.dump(all_perf_subspaces, psf)


def write_cut_file(name, disc_intervals):
    """Write the cut points of every dimension to the text file name.

    Each dimension gets a header line with its index and number of cut
    points, one line per cut point with one decimal place, and a closing
    separator line.
    """
    separator = '-------------------------------------\n'
    with open(name, 'w') as out:
        for dim, cut_points in enumerate(disc_intervals):
            out.write('dimension {} ({} bins)\n'.format(dim, len(cut_points)))
            for cut_point in cut_points:
                out.write('{:.1f}\n'.format(cut_point))
            out.write(separator)

def store(data):
    """Write a built data set to a ';'-separated CSV file below basedir.

    data is a (values, file_name) pair as returned by the generators' build
    methods.  The module-level basedir must have been set beforehand, which
    produce_all_data_generators does.
    """
    name = data[1]
    frame = pd.DataFrame(data[0])
    # Neither header nor index is written; floats get two decimal places.
    frame.to_csv(basedir + name, sep=';', header=False, index=False, float_format='%.2f')


if __name__ == '__main__':
    # l.plot_data_3d(produce_xor_generator(3, 0, 'bla').build()[0])
    # print(generate_overlap_partition(7, 3))

    # Build and store every data set that is still missing.
    for generator in produce_all_data_generators():
        store(generator.build())
	from abc import abstractmethod

	import numpy as np
	import pandas as pd
	import random
	import util
	import time
	import os
	import json
	import constants as cst
	import experiments_logging as l
	import socket

	# Half-width of every feature's value range: features span [-RADIUS, RADIUS].
	RADIUS = 2
	# Edge length of a generated cube in each dimension it is restricted in.
	CUBE_WIDTH = 1
	# Row budget of one generated data set.
	ROWS = 6000
	# Chance, per attempt, that generate_overlap_partition adds a foreign feature to a group.
	OVERLAP_PROBABILITY = 0.6


	class CubeParameters:
	    """Describe one cube of points to generate.

	    rows is the number of points to draw for the cube.  loc maps a feature
	    index to a (start, width) pair; None stands for a cube that is not
	    restricted in any dimension.
	    """

	    def __init__(self, rows, loc=None):
	        self.rows = rows
	        self.loc = loc
	        self.subspaces = []


	class DataGenerator:
	    """Shared state and interface of the synthetic data generators.

	    Every feature ranges over [-radius, radius].  The first
	    rel_feature_count features are the relevant ones; the irrelevant
	    features follow them.
	    """

	    def __init__(self, file_name, rel_feature_count, irr_feature_count, radius):
	        self.radius = radius
	        self.feature_count = rel_feature_count + irr_feature_count
	        self.irf = irr_feature_count
	        self.file_name = file_name
	        # One [lower, upper] border pair per feature.
	        self.dim_borders = [[-radius, radius] for _ in range(self.feature_count)]
	        self.subspaces = []
	        # Cut points of every relevant feature, seeded with its upper border.
	        self.perf_disc = [{d[1]} for d in self.dim_borders[:rel_feature_count]]

	    def __repr__(self):
	        return '(file_name=' + str(self.file_name) + ")"

	    @abstractmethod
	    def build(self):
	        """Generate the data set; provided by the subclasses."""
	        ...

	    @abstractmethod
	    def get_discs(self):
	        """Return the cut points of the relevant features; provided by the subclasses."""
	        ...

	    @abstractmethod
	    def get_subspaces(self):
	        """Return the groups of interacting features; provided by the subclasses."""
	        ...


	class XorGenerator(DataGenerator):
	    """Generate a data set whose relevant features interact like an XOR.

	    The first rf - 1 relevant features are drawn uniformly from
	    [-radius, radius].  The last relevant feature is drawn from [0, radius]
	    and gets a negative sign when an odd number of the other relevant
	    features is positive.  The class label encodes the sign pattern of the
	    first rf - 1 features.
	    """

	    def __init__(self, rf, irf, radius, rows, sigma, file_name):
	        super().__init__(file_name, rf, irf, radius)
	        self.rows = rows
	        # Relevant features that are drawn freely; the remaining one is the parity feature.
	        self.slave_features = rf - 1
	        self.sigma = sigma
	        self.perf_disc = [[0, radius] for _ in self.dim_borders[:rf]]
	        self.subspaces = [list(range(rf))]

	    def get_discs(self):
	        """Return the cut points [0, radius] of every relevant feature."""
	        return self.perf_disc

	    def get_subspaces(self):
	        """Return the single subspace made of all relevant features."""
	        return self.subspaces

	    def build(self):
	        """Draw the data set.

	        Returns:
	            A tuple (data, file_name).  data has one row per point and the
	            columns: free relevant features, parity feature, irrelevant
	            features, class label.  Gaussian noise with standard deviation
	            sigma is added to every column but the label when sigma is
	            non-zero.
	        """
	        # A zero-column draw is valid, so rf == 1 needs no special case.  The
	        # former special case gave the parity column zero width, which made
	        # the noise addition fail on a shape mismatch.
	        free_dims = np.random.uniform(-self.radius, self.radius, (self.rows, self.slave_features))
	        positive = free_dims > 0

	        # +1 for an even number of positive free features, -1 for an odd number.
	        sign = -(np.sum(positive, axis=1) % 2 * 2 - 1).reshape(self.rows, 1)
	        parity_dim = sign * np.random.uniform(0, self.radius, (self.rows, 1))

	        irr_dims = np.random.uniform(-self.radius, self.radius, (self.rows, self.irf))

	        # The label is 1 + the sign pattern read as a binary number, first
	        # feature most significant.  This equals numbering all patterns in
	        # lexicographic order starting at 1.
	        weights = 2 ** np.arange(self.slave_features - 1, -1, -1)
	        class_labels = (np.dot(positive.astype(int), weights) + 1).reshape(self.rows, 1)

	        data = np.concatenate((free_dims, parity_dim, irr_dims, class_labels), axis=1)
	        if self.sigma:
	            noise = np.random.normal(0, self.sigma, (self.rows, self.slave_features + self.irf + 1))
	            # The label column stays free of noise.
	            data = data + np.concatenate((noise, np.zeros((self.rows, 1))), axis=1)
	        return data, self.file_name

	class CubesGenerator(DataGenerator):
	    """Generate labelled cubes of points inside the feature space.

	    Every registered CubeParameters contributes its rows points.  A feature
	    listed in the cube's loc as (start, width) is drawn uniformly from
	    [start, start + width]; every other feature is drawn uniformly over its
	    full range.
	    """

	    def __init__(self, rel_feature_count, irr_feature_count, radius, file_name):
	        super().__init__(file_name, rel_feature_count, irr_feature_count, radius)
	        self.rel_feature_count = rel_feature_count
	        self.cube_parameters = []

	    def add_cube_parameter(self, cube_param):
	        """Register a cube and record its subspace and cut points.

	        A loc of None is replaced by an empty dict on cube_param itself.
	        """
	        if cube_param.loc is None:
	            cube_param.loc = {}
	        self.cube_parameters.append(cube_param)
	        location_params = cube_param.loc
	        subspace = list(location_params)
	        if subspace and subspace not in self.subspaces:
	            self.subspaces.append(subspace)

	        # perfect discretization
	        for feat in range(self.rel_feature_count):
	            if feat not in location_params:
	                continue
	            dim_params = location_params[feat]
	            # The lower border of the dimension is not a cut point.  Compare
	            # against this generator's own border, not the module default.
	            if dim_params[0] != self.dim_borders[feat][0]:
	                self.perf_disc[feat].add(dim_params[0])
	            self.perf_disc[feat].add(dim_params[0] + dim_params[1])

	    def build(self):
	        """Draw the points of all registered cubes.

	        Returns:
	            A tuple (data, file_name).  data has one row per point, one
	            column per feature and a final label column.  The label is 0 for
	            a cube without location and the cube's 1-based registration
	            position otherwise.  Without registered cubes data has no rows.

	        Raises:
	            ValueError: if a cube does not fit into the range of a feature.
	        """
	        if not self.cube_parameters:
	            return np.empty((0, self.feature_count + 1)), self.file_name

	        cubes = []
	        for cube_parameter in self.cube_parameters:
	            location_params = cube_parameter.loc
	            points_count = cube_parameter.rows
	            label = len(cubes) + 1 if len(location_params) != 0 else 0

	            columns = []
	            for feat in range(self.feature_count):
	                lower, upper = self.dim_borders[feat]
	                if feat in location_params:
	                    assert feat < self.rel_feature_count
	                    dim_params = location_params[feat]
	                    if dim_params[0] < lower or dim_params[0] + dim_params[1] > upper:
	                        raise ValueError(
	                            "The cube with params " + str(location_params) + " does not fit in dim " + str(
	                                feat) + "!")
	                    column = np.random.uniform(0, dim_params[1], points_count) + dim_params[0]
	                else:
	                    column = np.random.uniform(lower, upper, points_count)
	                columns.append(column)
	            columns.append(np.full(points_count, label, dtype=float))
	            cubes.append(columns)

	        # Each cube is (columns x points); join the points of all cubes, then
	        # turn the result into (points x columns).
	        generated_data = np.concatenate([np.array(cube) for cube in cubes], axis=1).transpose()
	        return generated_data, self.file_name

	    def get_subspaces(self):
	        """Return the distinct feature groups of the registered cubes."""
	        return self.subspaces

	    def get_discs(self):
	        """Return the sorted cut points of every relevant feature."""
	        return [sorted(p) for p in self.perf_disc]


	def generate_partition(rf, c):
	    """Randomly split the features 0 .. rf - 1 into c disjoint groups.

	    With more than one group every group holds at least two features.  A
	    single group simply holds all features.

	    Args:
	        rf: number of features to distribute.
	        c: number of groups.

	    Returns:
	        A list of c lists of feature indices that together contain every
	        feature exactly once.

	    Raises:
	        ValueError: if c is smaller than 1, or if rf is too small to give
	            each of several groups two features.
	    """
	    min_size = 2
	    if c < 1:
	        raise ValueError("the number of groups must be at least 1, got " + str(c))
	    if c > 1 and rf < c * min_size:
	        raise ValueError(
	            "cannot split " + str(rf) + " features into " + str(c)
	            + " groups of at least " + str(min_size))

	    features = list(range(rf))
	    random.shuffle(features)
	    pivot = 0
	    partition = []
	    for i in range(c - 1):
	        # Leave at least min_size features for each of the groups still to come.
	        max_size = rf - pivot - (c - i - 1) * min_size
	        size = random.randint(min_size, max_size)
	        partition.append(features[pivot: pivot + size])
	        pivot += size
	    partition.append(features[pivot:])
	    return partition


	def generate_overlap_partition(rf, c):
	    """Build a partition as generate_partition does and let its groups overlap.

	    For every group up to half its size attempts are made; each attempt
	    succeeds with probability OVERLAP_PROBABILITY and then appends one
	    randomly chosen feature of the other groups.  The same feature may be
	    chosen more than once.

	    Args:
	        rf: number of features to distribute.
	        c: number of groups.

	    Returns:
	        The list of c groups, each extended by its additions.
	    """
	    partition = generate_partition(rf, c)
	    all_features = set(range(rf))
	    additions = []
	    for part in partition:
	        extra = []
	        others = list(all_features - set(part))
	        # A group that already holds every feature has nothing to borrow;
	        # drawing from the empty list used to crash in randint.
	        if others:
	            # at most a half of the partition times of possibility of overlap
	            for _ in range(len(part) // 2):
	                if random.uniform(0, 1) < OVERLAP_PROBABILITY:
	                    extra.append(others[random.randint(0, len(others) - 1)])
	        additions.append(extra)

	    # Extend the groups only now, so the additions of one group never
	    # influence the candidates of another.
	    for part, extra in zip(partition, additions):
	        part.extend(extra)
	    return partition


	def produce_cube_generator(rf, irf, interactions, type, cubes, file_name):
	    """Create a CubesGenerator with randomly placed cubes plus a background.

	    Args:
	        rf: number of relevant features.
	        irf: number of irrelevant features.
	        interactions: number of feature groups.
	        type: 'c' to let every group span all relevant features, 'i' for
	            disjoint groups, 'io' for overlapping groups.
	        cubes: number of cubes per feature group.
	        file_name: file name stored on the generator.

	    Returns:
	        The configured CubesGenerator.

	    Raises:
	        ValueError: if type is none of 'c', 'i' and 'io'.
	    """
	    dg = CubesGenerator(rf, irf, RADIUS, file_name)
	    # same number of records for each of the interactions * cubes + background
	    cube_rows = ROWS // (interactions * cubes + 1)
	    if type == 'c':
	        partition = [range(rf) for _ in range(interactions)]
	    elif type == 'i':
	        partition = generate_partition(rf, interactions)
	    elif type == 'io':
	        partition = generate_overlap_partition(rf, interactions)
	    else:
	        raise ValueError("no such type!")

	    # A cube of width CUBE_WIDTH fits into [-RADIUS, RADIUS] when it starts
	    # within the first 2 * RADIUS - CUBE_WIDTH units of the range.
	    start_span = RADIUS * 2 - CUBE_WIDTH
	    for features in partition:
	        for _ in range(cubes):
	            location = dict()
	            for feat in features:
	                location[feat] = (random.uniform(0, 1) * start_span - RADIUS, CUBE_WIDTH)
	            dg.add_cube_parameter(CubeParameters(cube_rows, location))
	    # The background: points spread over the whole feature space.
	    dg.add_cube_parameter(CubeParameters(cube_rows))
	    return dg


	def produce_xor_generator(rf, irf, file_name):
	    """Create an XorGenerator using RADIUS, ROWS and a noise sigma of 0.1."""
	    return XorGenerator(rf, irf, RADIUS, ROWS, 0.1, file_name)


	def produce_all_data_generators():
	    """Create the generators of all data sets that do not exist yet.

	    Sets the module-level basedir to cst.DATA_DIR, makes sure the data and
	    cut-file directories exist, writes one cut file per new data set and
	    merges the new subspaces into the subspaces JSON file.

	    Returns:
	        The list of generators whose data still has to be built and stored.
	    """
	    data_generators = []
	    global basedir
	    basedir = cst.DATA_DIR
	    perf_disc_dir = cst.PERFECT_DISCRETIZATIONS_DIR
	    # makedirs tolerates an existing directory and creates missing parents.
	    for directory in (basedir, perf_disc_dir):
	        os.makedirs(directory, exist_ok=True)

	    perf_subspaces = dict()
	    perf_discs = dict()

	    def produce_dg(name, interaction_type, rf, i, type, cubes):
	        # Handed to util.collect_params below, presumably once per parameter
	        # combination -- verify against util.
	        # Nothing to do when both the data and its cut file are present.
	        if os.path.exists(basedir + name + ".csv") and os.path.exists(perf_disc_dir + 'cut_' + name + ".txt"):
	            return

	        if interaction_type == cst.InteractionType.CUBES:
	            dg = produce_cube_generator(rf, cst.IRRELEVANT_FEATURES, i, type, cubes, name + ".csv")
	        elif interaction_type == cst.InteractionType.XOR:
	            dg = produce_xor_generator(rf, cst.IRRELEVANT_FEATURES, name + ".csv")
	        else:
	            raise ValueError("no implementation of data generator for " + str(interaction_type.name))
	        perf_discs[name] = dg.get_discs()
	        perf_subspaces[name] = dg.get_subspaces()
	        data_generators.append(dg)

	    util.collect_params(produce_dg)
	    for name, discs in perf_discs.items():
	        write_cut_file(perf_disc_dir + 'cut_' + name + ".txt", discs)

	    write_perf_subspaces(perf_subspaces)
	    return data_generators


	def write_perf_subspaces(perf_subspaces):
	    """Merge perf_subspaces into the JSON file cst.PERFECT_SUBSPACES_JSON.

	    Entries already stored in the file are kept unless perf_subspaces holds
	    an entry of the same name.  perf_subspaces itself is left unchanged.

	    Args:
	        perf_subspaces: dict mapping a data set name to its subspaces.
	    """
	    all_perf_subspaces = {}
	    if os.path.exists(cst.PERFECT_SUBSPACES_JSON):
	        with open(cst.PERFECT_SUBSPACES_JSON, 'r') as psf:
	            all_perf_subspaces.update(json.load(psf))
	    # Apply the new entries last: a regenerated data set must not keep the
	    # subspaces of the data it replaces.
	    all_perf_subspaces.update(perf_subspaces)

	    with open(cst.PERFECT_SUBSPACES_JSON, 'w') as psf:
	        json.dump(all_perf_subspaces, psf)


	def write_cut_file(name, disc_intervals):
	    """Write the cut points of every dimension to the text file name.

	    Each dimension gets a header line with its index and number of cut
	    points, one line per cut point with one decimal place, and a closing
	    separator line.
	    """
	    separator = '-------------------------------------\n'
	    with open(name, 'w') as out:
	        for dim, cut_points in enumerate(disc_intervals):
	            out.write('dimension ' + str(dim) + ' (' + str(len(cut_points)) + ' bins)\n')
	            for cut_point in cut_points:
	                out.write(format(cut_point, '.1f') + '\n')
	            out.write(separator)


	def store(data):
	    """Write a built data set to a ';'-separated CSV file below basedir.

	    data is a (values, file_name) pair as returned by the generators' build
	    methods.  The module-level basedir must have been set beforehand, which
	    produce_all_data_generators does.
	    """
	    name = data[1]
	    # Neither header nor index is written; floats get two decimal places.
	    pd.DataFrame(data[0]).to_csv(basedir + name, sep=';', header=False, index=False, float_format='%.2f')


	if __name__ == '__main__':
	    # l.plot_data_3d(produce_xor_generator(3, 0, 'bla').build()[0])
	    # print(generate_overlap_partition(7, 3))

	    # Build and store every data set that is still missing.
	    generators = produce_all_data_generators()
	    for g in generators:
	        store(g.build())