diff --git a/data_generation.py b/data_generation.py index 60abb12..b4c6751 100644 --- a/data_generation.py +++ b/data_generation.py @@ -111,6 +111,16 @@ def synthetic_cjs(): np.concatenate((np.random.normal(4, 1, (100, 1)), np.random.normal(5, 1, (100, 1))), axis=1)), axis=0) +def blobs(): + # 2d + n = 4000 + l = int(n/2) + blob1 = np.random.normal(0, 1, (l, 2)) - np.concatenate((np.ones((l, 1)), np.zeros((l, 1))), axis=1) + + blob2 = np.random.normal(0, 1, (l, 2)) + np.concatenate((np.ones((l, 1)), np.zeros((l, 1))), axis=1) + + return np.concatenate((blob1, blob2), axis=0) + def append_irrelevant_features(file, n): if n == 0: @@ -127,28 +137,40 @@ def generate(): # -------generating dataset # data = synthetic_cube_in_cube(rows, rel_features, irrel_features, 'l') # data__ = synthetic_cjs() + data = correlated_data(rows, rel_features + irrel_features, 1, func1) # # # add zeroes as default class - # data = np.concatenate((data, np.zeros((rows, 1))), axis=1) + data = np.concatenate((data, np.zeros((rows, 1))), axis=1) # -------appending irrelevant features to existing dataset - data = append_irrelevant_features(source, irrel_features) + # data = append_irrelevant_features(source, irrel_features) # storing to disk pd.DataFrame(data).to_csv(file, sep=';', header=False, index=False, float_format='%.2f') if __name__ == '__main__': - for j in [3,4,5,10]: - for i in [1,2,3,4,5,10]: - # file = 'synthetic_cases/synthetic_cube_in_cube_10.csv' - file = 'synthetic_cases/synthetic_cube_in_cube_' + str(j) + '_' + str(i) + '.csv' - source = 'synthetic_cases/synthetic_cube_in_cube_' + str(j) + '.csv' + # for j in [3,4,5,10]: + # for i in [1,2,3,4,5,10]: + # # file = 'synthetic_cases/synthetic_cube_in_cube_10.csv' + # file = 'synthetic_cases/synthetic_cube_in_cube_' + str(j) + '_' + str(i) + '.csv' + # source = 'synthetic_cases/synthetic_cube_in_cube_' + str(j) + '.csv' + # + # if os.path.isfile(file): + # raise ValueError(file + " already exists!") + # + # # parameters + # rows = 20000 + # rel_features = 10 + # irrel_features = i + # + # generate() + file = 'synthetic_cases/uds_test_4.csv' - if os.path.isfile(file): - raise ValueError(file + " already exists!") + if os.path.isfile(file): + raise ValueError(file + " already exists!") - # parameters - rows = 20000 - rel_features = 10 - irrel_features = i + # parameters + rows = 4000 + rel_features = 2 + irrel_features = 2 - generate() \ No newline at end of file + generate() \ No newline at end of file diff --git a/experiments_logging.py b/experiments_logging.py index 9f8121e..b758ac1 100644 --- a/experiments_logging.py +++ b/experiments_logging.py @@ -65,5 +65,5 @@ def write_cut_file(name, disc_intervals): if __name__ == '__main__': # rows = 20000 # data = np.concatenate((synthetic_cube_in_cube(rows, 2, 0), np.zeros((rows, 1))), axis=1) - data = pd.read_csv("synthetic_cases/synthetic_cube_in_cube_5.csv", delimiter=";", header=None, na_values='?') + data = pd.read_csv("synthetic_cases/synthetic_cube_in_sparse_cube_3_0.csv", delimiter=";", header=None, na_values='?') plot_data_3d(data) \ No newline at end of file diff --git a/main.py b/main.py index 3c718fe..36c17d6 100644 --- a/main.py +++ b/main.py @@ -85,6 +85,9 @@ def compute_distances(bin_map, curr, data, dim_maxes, subspace = sm.het_beam_search(data, curr, k, beam_width, delta, cor_measure) else: raise ValueError("there is no such method!") + + # if len(subspace) == 0: + # todo the rest of the methods data = data.copy().loc[:, subspace] dim_maxes = dim_maxes[subspace] diff --git a/subspace_mining.py b/subspace_mining.py index 09af626..841e454 100644 --- a/subspace_mining.py +++ b/subspace_mining.py @@ -74,6 +74,9 @@ def __init__(self, subspace, score): def __lt__(self, other): return self.score < other.score + def __repr__(self): + return '(' + str(self.subspace) + ', ' + str(self.score) + ')' + def best_first(data, curr, k, cor_measure): dims = set(data.columns.tolist()) diff --git a/uds.py b/uds.py index e314b1b..ad1ca61 100644 --- a/uds.py +++ b/uds.py @@ -52,13 +52,12 @@ def dim_optimal_disc(curr, prev, I, data): min_cost = None arg_min = None for j in range(l - 1, i): - temp_cost = (support[i] - support[j]) / support[i] * f[i][j + 1] + support[j] / support[i] * val[j][ - l - 1] + temp_cost = (support[i] - support[j]) / support[i] * f[i][j + 1] \ + + support[j] / support[i] * val[j][l - 1] if not min_cost or temp_cost < min_cost: min_cost = temp_cost arg_min = j - # val[i][l] val[i].append(min_cost) disc = b[arg_min][l - 1].copy() disc.append(merged_bins[i][arg_min + 1]) @@ -125,8 +124,11 @@ def compute_uds(data): opt_score = None for l, score in enumerate(scores): temp_I = extend_I(I, discs[l]) - temp_cost = score / CEs[dim] + entropy(temp_I, len(data)) / ( - math.log(UDS_BETA, 2) + sum([math.log(e + 1, 2) for e in es])) + temp_cost = score / CEs[dim] + (entropy(temp_I, len(data)) / ( + # todo old + math.log(UDS_BETA, 2) + sum([math.log(e + 1, 2) for e in es]))) + # math.log(l + 1, 2) + sum([math.log(e + 1, 2) for e in es])) + # if math.log(l + 1, 2) + sum([math.log(e + 1, 2) for e in es]) != 0 else 0) if not opt_cost or temp_cost < opt_cost: opt_cost = temp_cost opt_score = score @@ -148,7 +150,7 @@ def compute_uds(data): if __name__ == "__main__": - data = pd.read_csv('synthetic_cases/uds_test.csv', delimiter=';', header=None) + data = pd.read_csv('synthetic_cases/synthetic_3d_parity_problem.csv', delimiter=';', usecols=[0,1,2], header=None) #0.0716387590375 # data = pd.DataFrame(correlated_data(4000, 20, 10, dg.func2)) # data = pd.DataFrame(generate_correlated_data(1000, 10, 2, func1)) # data = pd.DataFrame(generate_uncorrelated_data(4000, 20)) @@ -156,6 +158,4 @@ def compute_uds(data): uds = compute_uds(data) print(uds) - # print(es) - - + # print(es) \ No newline at end of file