data_generation.py

import numpy as np
import pandas as pd
import re
import os
from pathlib import Path
from os import listdir


def artificial():

    # a = [str(i) + ";" + str(int((i / 200) % 2)) + ";0" + "\n" for i in range(20000)]
    # with open("/Users/tatyanadembelova/Documents/study/thesis/ipd_extended/new_cubes/artificial20000.csv", "w") as f:
    #     f.writelines(a)

    a = [str(i) + ";0" + ";0" + "\n" for i in range(10000)]
    size = len(a)
    a.extend([str(i + size) + ";1;0" + "\n" for i in range(10000)])
    with open("/Users/tatyanadembelova/Documents/study/thesis/ipd_extended/new_cubes/artificialtrick10000.csv", "w") as f:
        f.writelines(a)

# synthetic case from uds
def correlated_data(m, n, sigma, f):
    # l1 = int(n / 2)
    # l2 = n - l1
    # Z = np.random.normal(0, 1, (m, l1))
    # A = np.matrix(np.random.uniform(1, 2, (l1, l1)))
    # X1 = Z * A
    # B = np.matrix(np.random.uniform(1, 2, (l1, l2)))
    # W = X1 * B
    # E = np.random.normal(0, sigma, (m, l2))
    # X2 = f(W) + E
    # result = np.append(X1, X2, axis=1)
    # print(result)

    l1 = int(n / 2)
    l2 = n - l1
    Z = np.random.normal(0, 1, (m, l1))
    A = np.matrix(np.random.uniform(1, 2, (l1, l1)))
    X1 = Z * A
    # A = np.matrix(np.random.uniform(1, 2, (m, l1)))
    # X1 = A
    B = np.matrix(np.random.uniform(1, 2, (l1, l2)))
    W = X1 * B
    E = np.random.normal(0, sigma, (m, l2))
    X2 = f(W) + E
    result = np.append(X1, X2, axis=1)
    print(result)

    return result


def generate_uncorrelated_data(m, n):
    return np.random.normal(0, 1, (m, n))


def func1(X):
    return 2 * X + 1


def func2(X):
    return np.log2(np.abs(X) + 1)


def func3(X):
    return np.power(X, 2)


def synthetic_data_uni(m, r, s, sigma=0.1):
    r_dims = np.random.uniform(-0.5, 0.5, (m, r)) if r > 0 else np.empty((m, r))
    parity_dim = -(np.count_nonzero(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(m, 1) * np.random.uniform(0, 0.5,
                                                                                                       (m,
                                                                                                        1)) if r > 0 else np.empty(
        (m, r))
    s_dims = np.random.uniform(-0.5, 0.5, (m, s))
    data = np.concatenate((r_dims, parity_dim, s_dims), axis=1)
    if sigma:
        e = np.random.normal(0, sigma, (m, r + s + 1))
        data = data + e

    return data


def synthetic_data_uni_negative(m, r, s, sigma=0.1):
    r_dims = np.random.uniform(-0.5, 0.5, (m, r)) if r > 0 else np.empty((m, r))
    parity_dim = (np.count_nonzero(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(m, 1) * np.random.uniform(0, 0.5,
                                                                                                      (m,
                                                                                                       1)) if r > 0 else np.empty(
        (m, r))
    s_dims = np.random.uniform(-0.5, 0.5, (m, s))
    data = np.concatenate((r_dims, parity_dim, s_dims), axis=1)
    if sigma:
        e = np.random.normal(0, sigma, (m, r + s + 1))
        data = data + e

    return data


def synthetic_data_gauss(m, r, s, sigma=0.1):
    r_dims = np.random.normal(0, 1, (m, r)) if r > 0 else np.empty((m, r))
    parity_dim = -(np.count_nonzero(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(m, 1) * np.abs(np.random.normal(0, 1,
                                                                                                             (m,
                                                                                                              1))) if r > 0 else np.empty(
        (m, r))
    s_dims = np.random.normal(0, 1, (m, s))
    data = np.concatenate((r_dims, parity_dim, s_dims), axis=1)
    if sigma:
        e = np.random.normal(0, sigma, (m, r + s + 1))
        data = data + e

    return data


def synthetic_with_nearcopies(m, k, l, sigma=0.1):
    k_dims = np.repeat(np.random.uniform(-0.5, 0, (m, 1)), k, axis=1) if k > 0 else np.empty((m, k))
    l_dims = np.repeat(np.random.uniform(0, 0.5, (m, 1)), l, axis=1) if l > 0 else np.empty((m, l))

    data = np.concatenate((k_dims, l_dims), axis=1)
    if sigma:
        e = np.random.normal(0, sigma, (m, k + l))
        data = data + e

    return data


def synthetic_cube_in_cube(m, r, i, side, sigma=0.5):
    if r < 1:
        raise ValueError
    h = int(m * sigma)
    range = [-0.5, 0] if side == 'l' else [-0.25, 0.25] if side == 'm' else [0, 0.5]
    contra_range = [0, 0.5] if side == 'l' else [-0.25, 0.25] if side == 'm' else [-0.5, 0]
    r_dims = np.concatenate((
        # np.concatenate(
        #     (np.random.uniform(range[0], range[1], (h, 1)),
        #      np.random.uniform(contra_range[0], contra_range[1], (h, 1))),
        #     axis=1)
        np.random.uniform(range[0], range[1], (h, r))
        , np.random.uniform(-0.5, 0.5, (m - h, r))), axis=0)
    i_dims = np.random.uniform(-0.5, 0.5, (m, i)) if i > 0 else np.empty((m, i))
    data = np.concatenate((r_dims, i_dims), axis=1)

    return data


def synthetic_cjs():
    return np.concatenate((np.concatenate((np.random.normal(0, 1, (100, 1)), np.random.normal(2, 1, (100, 1))), axis=1),
                           np.concatenate((np.random.normal(4, 1, (100, 1)), np.random.normal(5, 1, (100, 1))),
                                          axis=1)), axis=0)

# def blobs(rows):
#     blobs_number = 4
#     dims = 4
#     l = int(rows/blobs_number)
#     blob1 = np.random.normal(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * -3, np.ones((l, 1)) * -3, np.ones((l, 1)) * -3, np.ones((l, 1)) * -3), axis=1)
#     blob2 = np.random.normal(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * 0, np.ones((l, 1)) * 0, np.ones((l, 1)) * 0, np.ones((l, 1)) * 0), axis=1)
#     blob3 = np.random.normal(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * 3, np.ones((l, 1)) * 3, np.ones((l, 1)) * 3, np.ones((l, 1)) * 3), axis=1)
#     blob4 = np.random.normal(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * 6, np.ones((l, 1)) * 6, np.ones((l, 1)) * 6, np.ones((l, 1)) * 6), axis=1)
#
#     return np.concatenate((blob1, blob2, blob3, blob4), axis=0)
#     # return np.concatenate((blob1, blob2, blob3), axis=0)

def cubes(rows):
    cubes_number = 4
    dims = 4
    l = int(rows/cubes_number)
    blob1 = np.random.uniform(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * -1.7, np.ones((l, 1)) * -1.7, np.ones((l, 1)) * -1.7, np.ones((l, 1)) * -1.7), axis=1)
    blob2 = np.random.uniform(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * 0, np.ones((l, 1)) * 0, np.ones((l, 1)) * 0, np.ones((l, 1)) * 0), axis=1)
    blob3 = np.random.uniform(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * 1.5, np.ones((l, 1)) * 1.5, np.ones((l, 1)) * 1.5, np.ones((l, 1)) * 1.5), axis=1)
    blob4 = np.random.uniform(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * 3, np.ones((l, 1)) * 3, np.ones((l, 1)) * 3, np.ones((l, 1)) * 3), axis=1)
    background = np.random.uniform(-2, 4, (l, dims))
    # blob3 = np.random.normal(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * 3, np.ones((l, 1)) * 3, np.ones((l, 1)) * 3, np.ones((l, 1)) * 3), axis=1)
    # blob4 = np.random.normal(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * 6, np.ones((l, 1)) * 6, np.ones((l, 1)) * 6, np.ones((l, 1)) * 6), axis=1)

    # return np.concatenate((blob1, blob2, background), axis=0)
    return np.concatenate((blob1, blob2, blob3, blob4, background), axis=0)
    # return np.concatenate((blob1, blob2, blob3, background), axis=0)


def append_irrelevant_features(file, n):
    if n == 0:
        raise ValueError("# of irrelevant features is 0")
    data = pd.read_csv(file, delimiter=";", header=None, na_values='?')
    rows = data.shape[0]
    last_dim = data.shape[1] - 1
    irrel_data = np.random.uniform(-0.5, 0.5, (rows, n))
    return np.concatenate([data.loc[:, :last_dim - 1], irrel_data, data.loc[:, last_dim].to_frame()], axis=1)


def generate():
    # -------generating dataset
    # data = synthetic_cube_in_cube(rows, rel_features, irrel_features, 'l')
    # data__ = synthetic_cjs()
    # data = correlated_data(rows, rel_features + irrel_features, 1, func1)
    # data = cubes(rows)
    # # add zeroes as default class
    # data = np.concatenate((data, np.zeros((data.shape[0], 1))), axis=1)
    # -------appending irrelevant features to existing dataset
    data = append_irrelevant_features(source, irrel_features)
    # storing to disk
    pd.DataFrame(data).to_csv(file, sep=';', header=False, index=False, float_format='%.2f')

def cube_func(r):
    if -1.5 <= r[0] <= -0.5 and -1.5 <= r[1] <= -0.5 and -1.5 <= r[2] <= -0.5:
        return 1
    if 0 <= r[0] <= 1 and 0 <= r[1] <= 1 and 0 <= r[2] <= 1:
        return 2
    if 1.5 <= r[0] <= 2.5 and 1.5 <= r[1] <= 2.5 and 1.5 <= r[2] <= 2.5:
        return 3
    if 3.0 <= r[0] <= 4.0 and 3.0 <= r[1] <= 4.0 and 3.0 <= r[2] <= 4.0:
        return 3
    # if -1.7 <= r[0] <= -0.7 and -1.7 <= r[1] <= -0.7 and -1.7 <= r[2] <= -0.7:
    #     return 1
    # if 0 <= r[0] <= 1 and 0 <= r[1] <= 1 and 0 <= r[2] <= 1:
    #     return 2
    # if 1.5 <= r[0] <= 2.5 and 1.5 <= r[1] <= 2.5 and 1.5 <= r[2] <= 2.5:
    #     return 3
    # elif 3 <= r[0] <= 4 and 3 <= r[1] <= 4 and 3 <= r[2] <= 4:
    #     return 4
    # elif 0 <= r[0] <= 1 and 0 <= r[1] <= 1:
    #     return 3
    else:
        return 0

def update_class(file, new_file):
    data = pd.read_csv(file, delimiter=";", header=None, na_values='?')
    zeros_dim = data.shape[1] - 1
    classes = data.apply(cube_func, axis=1).to_frame()
    new_data = np.concatenate([data.loc[:, :zeros_dim - 1], classes], axis=1)
    pd.DataFrame(new_data).to_csv(new_file, sep=';', header=False, index=False, float_format='%.2f')

# def update_out_txt(directory):
#     directory = str(directory)
#     if not os.path.exists(directory + "/out.txt"):
#         return
#     search = re.search("(\dd_(\d).*.csv)", directory)
#     run = search.group(1)
#     classes_count = int(search.group(2)) + 1
#     classes = pd.read_csv("synthetic_cases/cubes/" + run, delimiter=";", header=None, na_values='?')
#     classes = classes[classes.shape[1] - 1]
#
#     class_row = 0
#     with open(directory + "/out.txt", "r") as old_out:
#         with open(directory + "/out2.txt", "w") as new_out:
#             for line in old_out:
#                 if "@attribute class {" in line:
#                     line = "@attribute class {" + ",".join(['"' + str(i) + '.0"' for i in range(classes_count)]) + "}"
#                 if ',"0.0"' in line:
#                     line = line.replace(',"0.0"', ',"' + str(classes[class_row]) + '"')
#                     class_row += 1
#                 new_out.write(line)


# def export_out(directory):
#     directory = str(directory)
#     if not os.path.exists(directory + "/out2.txt"):
#         return
#     destination = "logs_test/arff/" + directory.replace(".csv", "").replace("logs_test/", "") + ".arff"
#     search = re.search("_(\d)d_.*", directory)
#     rel_dim_count = int(search.group(1))
#
#     with open(directory + "/out2.txt", "r") as out_txt:
#         with open(destination, "w") as new_out:
#             for line in out_txt:
#                 if line.startswith("@attribute class"):
#                     line = line + "\n"
#                 if "@attribute dim" in line:
#                     if int(re.search("@attribute dim(\d+) ", line).group(1)) >= rel_dim_count:
#                         continue
#
#                 if not line.startswith("@") and line.strip() != "":
#                     split = line.split(",")
#                     line = ",".join(split[:rel_dim_count]) + ',' + split[-1]
#                 new_out.write(line)


def correct_weka_output():
    file = "test_experiment_all.arff"
    file2 = "test_experiment_all2.arff"
    pathlist = listdir("logs_quality/arff")

    with open(file, "r") as old_out:
        with open(file2, "w") as output2:
            path_id = 0
            counter = 0
            for line in old_out:
                if line.startswith("DB"):
                    line = line.replace("DB", str(pathlist[path_id].replace(".arff", "")))
                    counter += 1
                    if counter == 100:
                        path_id += 1
                        counter = 0
                output2.write(line)

if __name__ == '__main__':
    # artificial()
    # exit(1)
#     file = 'synthetic_cases/synthetic_cube_in_cube_10.csv'
    for source_name in ["gas_small.csv"]:
        source = 'data/' + source_name
        file = 'data/' + source_name.replace(".csv", "") + '_r128.csv'

        if os.path.isfile(file):
            raise ValueError(file + " already exists!")

        # parameters
        # rows = 6000
        # rel_features = 10
        irrel_features = 100

        generate()
    # pd.DataFrame(correlated_data(4000, 3, 0.5, func3)).to_csv("synthetic_cases/uds_new.csv", index=False)
    # update_out_txt("logs_test/ID_PERFECT_0.3_4d_4_cubes_xor_9.csv")
    # export_out("logs_test/ID_PERFECT_0.3_4d_4_cubes_xor_9.csv")

    # ------UPDATE THE EXISTING CUBE DATASETS WITH ACTUAL CLASSES
    #
    # problem = "3d_4_cubes_xor"
    # update_class("synthetic_cases/cubes/" + problem + ".csv",
    #              "synthetic_cases/sup_cubes/" + problem + ".csv")
    # for i in range(1, 11):
    #     update_class("synthetic_cases/cubes/" + problem + "_" + str(i) + ".csv",
    #                  "synthetic_cases/sup_cubes/" + problem + "_" + str(i) + ".csv")

    # -----CORRECT WEKA OUTPUT (PUT CORRECT DB NAMES)
    # correct_weka_output()

    # -----UPDATE OUT FILE

    # pathlist = Path("logs_quality").glob('*.csv')
    # for path in pathlist:
    #     export_out(path)

    # ------APPENDING IRRELEVANT FEATURES
    # for source_name in [
    #     "2d_2_cubes_aligned.csv",
    #     "2d_2_cubes_xor.csv",
    #     "2d_3_cubes_aligned_xor.csv",
    #     "3d_2_cubes_aligned.csv",
    #     "3d_2_cubes_xor.csv",
    #     "3d_3_cubes_aligned.csv",
    #     "3d_3_cubes_aligned_xor.csv",
    #     "3d_3_cubes_xor.csv",
    #     "3d_4_cubes_1_aligned_xor.csv",
    #     "3d_4_cubes_2_aligned.csv",
    #     "3d_4_cubes_xor.csv",
    #     "4d_2_cubes_aligned.csv",
    #     "4d_3_cubes_aligned_xor.csv",
    #     "4d_3_cubes_xor.csv",
    #     "4d_4_cubes_2_aligned.csv",
    #     "4d_4_cubes_aligned_xor.csv",
    #     "4d_4_cubes_xor.csv",
    # ]:
    #     for i in [4,5,6,7,8,9,10]:
    #     #     file = 'synthetic_cases/synthetic_cube_in_cube_10.csv'
    #         source = 'synthetic_cases/cubes/' + source_name
    #         file = 'synthetic_cases/cubes/' + source_name.replace(".csv", "") + '_' + str(i) + '.csv'
    #
    #         if os.path.isfile(file):
    #             raise ValueError(file + " already exists!")
    #
    #         # parameters
    #         rows = 4000
    #         # rel_features = 10
    #         irrel_features = i
    #
    #         generate()

    # ----GENERATION----
    # file = 'synthetic_cases/cubes/4d_4_cubes_xor.csv'
    #
    # if os.path.isfile(file):
    #     raise ValueError(file + " already exists!")
    #
    # # parameters
    # rows = 4000
    # rel_features = 2
    # irrel_features = 0
    #
    # generate()
	import numpy as np
	import pandas as pd
	import re
	import os
	from pathlib import Path
	from os import listdir


	def artificial():

	# a = [str(i) + ";" + str(int((i / 200) % 2)) + ";0" + "\n" for i in range(20000)]
	# with open("/Users/tatyanadembelova/Documents/study/thesis/ipd_extended/new_cubes/artificial20000.csv", "w") as f:
	# f.writelines(a)

	a = [str(i) + ";0" + ";0" + "\n" for i in range(10000)]
	size = len(a)
	a.extend([str(i + size) + ";1;0" + "\n" for i in range(10000)])
	with open("/Users/tatyanadembelova/Documents/study/thesis/ipd_extended/new_cubes/artificialtrick10000.csv", "w") as f:
	f.writelines(a)

	# synthetic case from uds
	def correlated_data(m, n, sigma, f):
	# l1 = int(n / 2)
	# l2 = n - l1
	# Z = np.random.normal(0, 1, (m, l1))
	# A = np.matrix(np.random.uniform(1, 2, (l1, l1)))
	# X1 = Z * A
	# B = np.matrix(np.random.uniform(1, 2, (l1, l2)))
	# W = X1 * B
	# E = np.random.normal(0, sigma, (m, l2))
	# X2 = f(W) + E
	# result = np.append(X1, X2, axis=1)
	# print(result)

	l1 = int(n / 2)
	l2 = n - l1
	Z = np.random.normal(0, 1, (m, l1))
	A = np.matrix(np.random.uniform(1, 2, (l1, l1)))
	X1 = Z * A
	# A = np.matrix(np.random.uniform(1, 2, (m, l1)))
	# X1 = A
	B = np.matrix(np.random.uniform(1, 2, (l1, l2)))
	W = X1 * B
	E = np.random.normal(0, sigma, (m, l2))
	X2 = f(W) + E
	result = np.append(X1, X2, axis=1)
	print(result)

	return result


	def generate_uncorrelated_data(m, n):
	return np.random.normal(0, 1, (m, n))


	def func1(X):
	return 2 * X + 1


	def func2(X):
	return np.log2(np.abs(X) + 1)


	def func3(X):
	return np.power(X, 2)


	def synthetic_data_uni(m, r, s, sigma=0.1):
	r_dims = np.random.uniform(-0.5, 0.5, (m, r)) if r > 0 else np.empty((m, r))
	parity_dim = -(np.count_nonzero(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(m, 1) * np.random.uniform(0, 0.5,
	(m,
	1)) if r > 0 else np.empty(
	(m, r))
	s_dims = np.random.uniform(-0.5, 0.5, (m, s))
	data = np.concatenate((r_dims, parity_dim, s_dims), axis=1)
	if sigma:
	e = np.random.normal(0, sigma, (m, r + s + 1))
	data = data + e

	return data


	def synthetic_data_uni_negative(m, r, s, sigma=0.1):
	r_dims = np.random.uniform(-0.5, 0.5, (m, r)) if r > 0 else np.empty((m, r))
	parity_dim = (np.count_nonzero(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(m, 1) * np.random.uniform(0, 0.5,
	(m,
	1)) if r > 0 else np.empty(
	(m, r))
	s_dims = np.random.uniform(-0.5, 0.5, (m, s))
	data = np.concatenate((r_dims, parity_dim, s_dims), axis=1)
	if sigma:
	e = np.random.normal(0, sigma, (m, r + s + 1))
	data = data + e

	return data


	def synthetic_data_gauss(m, r, s, sigma=0.1):
	r_dims = np.random.normal(0, 1, (m, r)) if r > 0 else np.empty((m, r))
	parity_dim = -(np.count_nonzero(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(m, 1) * np.abs(np.random.normal(0, 1,
	(m,
	1))) if r > 0 else np.empty(
	(m, r))
	s_dims = np.random.normal(0, 1, (m, s))
	data = np.concatenate((r_dims, parity_dim, s_dims), axis=1)
	if sigma:
	e = np.random.normal(0, sigma, (m, r + s + 1))
	data = data + e

	return data


	def synthetic_with_nearcopies(m, k, l, sigma=0.1):
	k_dims = np.repeat(np.random.uniform(-0.5, 0, (m, 1)), k, axis=1) if k > 0 else np.empty((m, k))
	l_dims = np.repeat(np.random.uniform(0, 0.5, (m, 1)), l, axis=1) if l > 0 else np.empty((m, l))

	data = np.concatenate((k_dims, l_dims), axis=1)
	if sigma:
	e = np.random.normal(0, sigma, (m, k + l))
	data = data + e

	return data


	def synthetic_cube_in_cube(m, r, i, side, sigma=0.5):
	if r < 1:
	raise ValueError
	h = int(m * sigma)
	range = [-0.5, 0] if side == 'l' else [-0.25, 0.25] if side == 'm' else [0, 0.5]
	contra_range = [0, 0.5] if side == 'l' else [-0.25, 0.25] if side == 'm' else [-0.5, 0]
	r_dims = np.concatenate((
	# np.concatenate(
	# (np.random.uniform(range[0], range[1], (h, 1)),
	# np.random.uniform(contra_range[0], contra_range[1], (h, 1))),
	# axis=1)
	np.random.uniform(range[0], range[1], (h, r))
	, np.random.uniform(-0.5, 0.5, (m - h, r))), axis=0)
	i_dims = np.random.uniform(-0.5, 0.5, (m, i)) if i > 0 else np.empty((m, i))
	data = np.concatenate((r_dims, i_dims), axis=1)

	return data


	def synthetic_cjs():
	return np.concatenate((np.concatenate((np.random.normal(0, 1, (100, 1)), np.random.normal(2, 1, (100, 1))), axis=1),
	np.concatenate((np.random.normal(4, 1, (100, 1)), np.random.normal(5, 1, (100, 1))),
	axis=1)), axis=0)

	# def blobs(rows):
	# blobs_number = 4
	# dims = 4
	# l = int(rows/blobs_number)
	# blob1 = np.random.normal(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * -3, np.ones((l, 1)) * -3, np.ones((l, 1)) * -3, np.ones((l, 1)) * -3), axis=1)
	# blob2 = np.random.normal(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * 0, np.ones((l, 1)) * 0, np.ones((l, 1)) * 0, np.ones((l, 1)) * 0), axis=1)
	# blob3 = np.random.normal(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * 3, np.ones((l, 1)) * 3, np.ones((l, 1)) * 3, np.ones((l, 1)) * 3), axis=1)
	# blob4 = np.random.normal(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * 6, np.ones((l, 1)) * 6, np.ones((l, 1)) * 6, np.ones((l, 1)) * 6), axis=1)
	#
	# return np.concatenate((blob1, blob2, blob3, blob4), axis=0)
	# # return np.concatenate((blob1, blob2, blob3), axis=0)

	def cubes(rows):
	cubes_number = 4
	dims = 4
	l = int(rows/cubes_number)
	blob1 = np.random.uniform(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * -1.7, np.ones((l, 1)) * -1.7, np.ones((l, 1)) * -1.7, np.ones((l, 1)) * -1.7), axis=1)
	blob2 = np.random.uniform(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * 0, np.ones((l, 1)) * 0, np.ones((l, 1)) * 0, np.ones((l, 1)) * 0), axis=1)
	blob3 = np.random.uniform(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * 1.5, np.ones((l, 1)) * 1.5, np.ones((l, 1)) * 1.5, np.ones((l, 1)) * 1.5), axis=1)
	blob4 = np.random.uniform(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * 3, np.ones((l, 1)) * 3, np.ones((l, 1)) * 3, np.ones((l, 1)) * 3), axis=1)
	background = np.random.uniform(-2, 4, (l, dims))
	# blob3 = np.random.normal(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * 3, np.ones((l, 1)) * 3, np.ones((l, 1)) * 3, np.ones((l, 1)) * 3), axis=1)
	# blob4 = np.random.normal(0, 1, (l, dims)) + np.concatenate((np.ones((l, 1)) * 6, np.ones((l, 1)) * 6, np.ones((l, 1)) * 6, np.ones((l, 1)) * 6), axis=1)

	# return np.concatenate((blob1, blob2, background), axis=0)
	return np.concatenate((blob1, blob2, blob3, blob4, background), axis=0)
	# return np.concatenate((blob1, blob2, blob3, background), axis=0)


	def append_irrelevant_features(file, n):
	if n == 0:
	raise ValueError("# of irrelevant features is 0")
	data = pd.read_csv(file, delimiter=";", header=None, na_values='?')
	rows = data.shape[0]
	last_dim = data.shape[1] - 1
	irrel_data = np.random.uniform(-0.5, 0.5, (rows, n))
	return np.concatenate([data.loc[:, :last_dim - 1], irrel_data, data.loc[:, last_dim].to_frame()], axis=1)


	def generate():
	# -------generating dataset
	# data = synthetic_cube_in_cube(rows, rel_features, irrel_features, 'l')
	# data__ = synthetic_cjs()
	# data = correlated_data(rows, rel_features + irrel_features, 1, func1)
	# data = cubes(rows)
	# # add zeroes as default class
	# data = np.concatenate((data, np.zeros((data.shape[0], 1))), axis=1)
	# -------appending irrelevant features to existing dataset
	data = append_irrelevant_features(source, irrel_features)
	# storing to disk
	pd.DataFrame(data).to_csv(file, sep=';', header=False, index=False, float_format='%.2f')

	def cube_func(r):
	if -1.5 <= r[0] <= -0.5 and -1.5 <= r[1] <= -0.5 and -1.5 <= r[2] <= -0.5:
	return 1
	if 0 <= r[0] <= 1 and 0 <= r[1] <= 1 and 0 <= r[2] <= 1:
	return 2
	if 1.5 <= r[0] <= 2.5 and 1.5 <= r[1] <= 2.5 and 1.5 <= r[2] <= 2.5:
	return 3
	if 3.0 <= r[0] <= 4.0 and 3.0 <= r[1] <= 4.0 and 3.0 <= r[2] <= 4.0:
	return 3
	# if -1.7 <= r[0] <= -0.7 and -1.7 <= r[1] <= -0.7 and -1.7 <= r[2] <= -0.7:
	# return 1
	# if 0 <= r[0] <= 1 and 0 <= r[1] <= 1 and 0 <= r[2] <= 1:
	# return 2
	# if 1.5 <= r[0] <= 2.5 and 1.5 <= r[1] <= 2.5 and 1.5 <= r[2] <= 2.5:
	# return 3
	# elif 3 <= r[0] <= 4 and 3 <= r[1] <= 4 and 3 <= r[2] <= 4:
	# return 4
	# elif 0 <= r[0] <= 1 and 0 <= r[1] <= 1:
	# return 3
	else:
	return 0

	def update_class(file, new_file):
	data = pd.read_csv(file, delimiter=";", header=None, na_values='?')
	zeros_dim = data.shape[1] - 1
	classes = data.apply(cube_func, axis=1).to_frame()
	new_data = np.concatenate([data.loc[:, :zeros_dim - 1], classes], axis=1)
	pd.DataFrame(new_data).to_csv(new_file, sep=';', header=False, index=False, float_format='%.2f')

	# def update_out_txt(directory):
	# directory = str(directory)
	# if not os.path.exists(directory + "/out.txt"):
	# return
	# search = re.search("(\dd_(\d).*.csv)", directory)
	# run = search.group(1)
	# classes_count = int(search.group(2)) + 1
	# classes = pd.read_csv("synthetic_cases/cubes/" + run, delimiter=";", header=None, na_values='?')
	# classes = classes[classes.shape[1] - 1]
	#
	# class_row = 0
	# with open(directory + "/out.txt", "r") as old_out:
	# with open(directory + "/out2.txt", "w") as new_out:
	# for line in old_out:
	# if "@attribute class {" in line:
	# line = "@attribute class {" + ",".join(['"' + str(i) + '.0"' for i in range(classes_count)]) + "}"
	# if ',"0.0"' in line:
	# line = line.replace(',"0.0"', ',"' + str(classes[class_row]) + '"')
	# class_row += 1
	# new_out.write(line)


	# def export_out(directory):
	# directory = str(directory)
	# if not os.path.exists(directory + "/out2.txt"):
	# return
	# destination = "logs_test/arff/" + directory.replace(".csv", "").replace("logs_test/", "") + ".arff"
	# search = re.search("_(\d)d_.*", directory)
	# rel_dim_count = int(search.group(1))
	#
	# with open(directory + "/out2.txt", "r") as out_txt:
	# with open(destination, "w") as new_out:
	# for line in out_txt:
	# if line.startswith("@attribute class"):
	# line = line + "\n"
	# if "@attribute dim" in line:
	# if int(re.search("@attribute dim(\d+) ", line).group(1)) >= rel_dim_count:
	# continue
	#
	# if not line.startswith("@") and line.strip() != "":
	# split = line.split(",")
	# line = ",".join(split[:rel_dim_count]) + ',' + split[-1]
	# new_out.write(line)


	def correct_weka_output():
	file = "test_experiment_all.arff"
	file2 = "test_experiment_all2.arff"
	pathlist = listdir("logs_quality/arff")

	with open(file, "r") as old_out:
	with open(file2, "w") as output2:
	path_id = 0
	counter = 0
	for line in old_out:
	if line.startswith("DB"):
	line = line.replace("DB", str(pathlist[path_id].replace(".arff", "")))
	counter += 1
	if counter == 100:
	path_id += 1
	counter = 0
	output2.write(line)

	if __name__ == '__main__':
	# artificial()
	# exit(1)
	# file = 'synthetic_cases/synthetic_cube_in_cube_10.csv'
	for source_name in ["gas_small.csv"]:
	source = 'data/' + source_name
	file = 'data/' + source_name.replace(".csv", "") + '_r128.csv'

	if os.path.isfile(file):
	raise ValueError(file + " already exists!")

	# parameters
	# rows = 6000
	# rel_features = 10
	irrel_features = 100

	generate()
	# pd.DataFrame(correlated_data(4000, 3, 0.5, func3)).to_csv("synthetic_cases/uds_new.csv", index=False)
	# update_out_txt("logs_test/ID_PERFECT_0.3_4d_4_cubes_xor_9.csv")
	# export_out("logs_test/ID_PERFECT_0.3_4d_4_cubes_xor_9.csv")

	# ------UPDATE THE EXISTING CUBE DATASETS WITH ACTUAL CLASSES
	#
	# problem = "3d_4_cubes_xor"
	# update_class("synthetic_cases/cubes/" + problem + ".csv",
	# "synthetic_cases/sup_cubes/" + problem + ".csv")
	# for i in range(1, 11):
	# update_class("synthetic_cases/cubes/" + problem + "_" + str(i) + ".csv",
	# "synthetic_cases/sup_cubes/" + problem + "_" + str(i) + ".csv")

	# -----CORRECT WEKA OUTPUT (PUT CORRECT DB NAMES)
	# correct_weka_output()

	# -----UPDATE OUT FILE

	# pathlist = Path("logs_quality").glob('*.csv')
	# for path in pathlist:
	# export_out(path)

	# ------APPENDING IRRELEVANT FEATURES
	# for source_name in [
	# "2d_2_cubes_aligned.csv",
	# "2d_2_cubes_xor.csv",
	# "2d_3_cubes_aligned_xor.csv",
	# "3d_2_cubes_aligned.csv",
	# "3d_2_cubes_xor.csv",
	# "3d_3_cubes_aligned.csv",
	# "3d_3_cubes_aligned_xor.csv",
	# "3d_3_cubes_xor.csv",
	# "3d_4_cubes_1_aligned_xor.csv",
	# "3d_4_cubes_2_aligned.csv",
	# "3d_4_cubes_xor.csv",
	# "4d_2_cubes_aligned.csv",
	# "4d_3_cubes_aligned_xor.csv",
	# "4d_3_cubes_xor.csv",
	# "4d_4_cubes_2_aligned.csv",
	# "4d_4_cubes_aligned_xor.csv",
	# "4d_4_cubes_xor.csv",
	# ]:
	# for i in [4,5,6,7,8,9,10]:
	# # file = 'synthetic_cases/synthetic_cube_in_cube_10.csv'
	# source = 'synthetic_cases/cubes/' + source_name
	# file = 'synthetic_cases/cubes/' + source_name.replace(".csv", "") + '_' + str(i) + '.csv'
	#
	# if os.path.isfile(file):
	# raise ValueError(file + " already exists!")
	#
	# # parameters
	# rows = 4000
	# # rel_features = 10
	# irrel_features = i
	#
	# generate()

	# ----GENERATION----
	# file = 'synthetic_cases/cubes/4d_4_cubes_xor.csv'
	#
	# if os.path.isfile(file):
	# raise ValueError(file + " already exists!")
	#
	# # parameters
	# rows = 4000
	# rel_features = 2
	# irrel_features = 0
	#
	# generate()