diff --git a/ID_correlation_measure.py b/ID_sm.py
similarity index 68%
rename from ID_correlation_measure.py
rename to ID_sm.py
index f18d209..020ab79 100644
--- a/ID_correlation_measure.py
+++ b/ID_sm.py
@@ -3,6 +3,7 @@
 import pandas as pd
 import interaction_distance as id
 import data_generator as dg
+import matplotlib.pyplot as plt
 
 
 def evidence_ID():
     # no interaction
@@ -35,29 +36,35 @@ def evidence_ID():
     # log.plot_data_2d(df)
     # log.plot_data_2d(pd.concat([df.loc[:100], df.loc[7900:8000]], axis=0))
 
-cg = dg.produce_cube_generator(7, 0, 2, "i", ".csv")
+cg = dg.produce_cube_generator(7, 0, 2, "i", 1, ".csv")
 data, filname = cg.build()
 print(cg.subspaces)
 print(cg.perf_disc)
 data = pd.DataFrame(data)
 dim_count = data.shape[1]
-for curr in data[:-1]:
+for curr in range(dim_count - 1):
     dims = data.columns.tolist()
     dims.remove(curr)
     dims.remove(dim_count - 1)
-    curr_data = data.sort_values(by=curr).reset_index(drop=True).loc[:, dims]
-    rows = curr_data.shape[0]
+    projected_data = data.sort_values(by=curr).reset_index()
+    curr_index = projected_data['index']
+    projected_data = projected_data.loc[:, dims]
+    rows = projected_data.shape[0]
     print('curr dimension', curr)
     for dim in dims:
        counter = 0
        ids = []
+       dim_x = []
        while(True):
-            if counter + 280 > rows:
+            if counter + 140 > rows:
                 break
-            ids.append(id.compute_ID(curr_data.loc[counter:counter + 140, dim].to_frame(),
-                                     curr_data.loc[counter + 140: counter + 280, dim].to_frame(), [2] * dim_count))
-            counter += 1
+            ids.append(id.compute_ID(projected_data.loc[counter:counter + 70, dim].to_frame(),
+                                     projected_data.loc[counter + 70: counter + 140, dim].to_frame(), [2] * dim_count))
+            dim_x.append(data.loc[curr_index.loc[counter + 70], curr])
+            counter += 140
             # needs data normalization todo
-    print('interaction with', dim, np.average(ids))
+       avg_id = np.average(ids)
+       print('interaction with', dim, avg_id, sum(1 for ID in ids if ID > avg_id))
+       plt.plot(dim_x, ids)
+       plt.show()
     # break
\ No newline at end of file
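Side note on the reworked scan in ID_sm.py: the loop now sorts the data by the scanned dimension and compares adjacent 70-point half-windows of every other dimension, stepping by a full 140-point window instead of stride 1. Two review remarks: pandas `.loc` slicing is end-inclusive, so `counter:counter + 70` and `counter + 70:counter + 140` each span 71 rows and share the boundary row, and the average computed inside the `print` was hoisted above. The sketch below shows the intended half-open windowing in isolation; `window_divergence` is a hypothetical stand-in for `interaction_distance.compute_ID`, whose internals this patch does not show.

```python
import numpy as np
import pandas as pd

def window_divergence(a: pd.Series, b: pd.Series, bins: int = 20) -> float:
    # Hypothetical stand-in: L1 distance between normalized histograms over a
    # shared range. The repo's compute_ID is the actual interaction measure.
    lo, hi = min(a.min(), b.min()), max(a.max(), b.max())
    ha, _ = np.histogram(a, bins=bins, range=(lo, hi), density=True)
    hb, _ = np.histogram(b, bins=bins, range=(lo, hi), density=True)
    return float(np.abs(ha - hb).sum())

def scan_dimension(data: pd.DataFrame, curr: int, dim: int, half: int = 70):
    # Sort by the scanned dimension, then compare adjacent half-open
    # half-windows of another dimension; spikes hint at an interaction.
    ordered = data.sort_values(by=curr).reset_index(drop=True)
    scores = []
    for start in range(0, len(ordered) - 2 * half + 1, 2 * half):
        left = ordered.loc[start:start + half - 1, dim]
        right = ordered.loc[start + half:start + 2 * half - 1, dim]
        scores.append(window_divergence(left, right))
    return scores

rng = np.random.default_rng(0)
x = rng.uniform(-2, 2, 2000)
y = np.sign(x) * rng.uniform(0, 2, 2000)   # y's distribution depends on x
df = pd.DataFrame({0: x, 1: y})
print(np.mean(scan_dimension(df, curr=0, dim=1)))
```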
diff --git a/constants.py b/constants.py
index e70c158..73b837d 100644
--- a/constants.py
+++ b/constants.py
@@ -33,6 +33,11 @@ class DistanceMeasure(Enum):
     CJS = 2
 
 
+class InteractionType(Enum):
+    CUBES = 1
+    XOR = 2
+
+
 ID_THRESHOLD_QUANTILE = 0.3
 
 ID_SLIDING_WINDOW = 40
@@ -72,13 +77,15 @@ class DistanceMeasure(Enum):
 INTERACTIONS_LOWER_BOUND=3 if socket.gethostname() != 'push' else 1
 
 # new settings (more constrained)
+INTERACTION_TYPE_RANGE_LIST=[InteractionType.CUBES, InteractionType.XOR]
 IRRELEVANT_FEATURES_RANGE_LIST = [0, 1, 2, 4, 8, 16, 32, 64, 99, 3, 6, 12, 24, 48, 82] if socket.gethostname() != 'push' else [0, 1, 2, 3]
 RELEVANT_FEATURES_RANGE_LIST = [2, 3, 4, 6, 8, 12, 16, 23, 30] if socket.gethostname() == 'push' else [7]
 INTERACTION_NUMBER_RANGE_LIST = [1, 2, 4, 8, 10] if socket.gethostname() == 'push' else [3]
-INTERACTION_TYPES_RANGE_LIST=["c", 'i', "io"] if socket.gethostname() == 'push' else ['i']
+PARTITION_TYPES_RANGE_LIST=["c", 'i', "io"] if socket.gethostname() == 'push' else ['i']
 CUBES_LOWER_BOUND=1
 CUBES_UPPER_BOUND=3
 NAIVE_CHUNKS_NUMBER_RANGE_LIST = [2, 3, 4, 5, 10, 20, 30]
+XOR_SIGMA=0.1
 
 
 BASE = '/local/tmp/ipd_extended_experiments2/' if socket.gethostname() == 'push' \
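The rename from INTERACTION_TYPES_RANGE_LIST to PARTITION_TYPES_RANGE_LIST frees the term "interaction type" for the new enum: 'c'/'i'/'io' describe how relevant features are partitioned, while InteractionType selects the generator family. A minimal sketch of the intended consumption pattern, assuming nothing beyond what this patch shows (`name.lower()` as the dataset-name prefix, explicit dispatch with a ValueError fallback, as in produce_dg and construct_dataset_name below):

```python
from enum import Enum

# Mirror of the new constants.InteractionType for a self-contained example.
class InteractionType(Enum):
    CUBES = 1
    XOR = 2

def dataset_prefix(it: InteractionType) -> str:
    # name.lower() is what construct_dataset_name uses as the file prefix.
    return it.name.lower()

def make_generator(it: InteractionType):
    if it == InteractionType.CUBES:
        return 'produce_cube_generator(...)'   # placeholder for the real call
    elif it == InteractionType.XOR:
        return 'produce_xor_generator(...)'    # placeholder for the real call
    raise ValueError('no implementation of data generator for ' + it.name)

for it in [InteractionType.CUBES, InteractionType.XOR]:
    print(dataset_prefix(it), '->', make_generator(it))
```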
diff --git a/data_generator.py b/data_generator.py
index d9c6571..f1eaee5 100644
--- a/data_generator.py
+++ b/data_generator.py
@@ -1,3 +1,5 @@
+from abc import abstractmethod
+
 import numpy as np
 import pandas as pd
 import random
@@ -22,20 +24,90 @@ def __init__(self, rows, loc=None):
         self.subspaces = []
 
 
-class CubesGenerator:
-    def __init__(self, rel_feature_count, irr_feature_count, radius, file_name):
-        self.rel_feature_count = rel_feature_count
-        self.file_name = file_name
-        self.cube_parameters = []
+class DataGenerator:
+    def __init__(self, file_name, rel_feature_count, irr_feature_count, radius):
+        self.radius = radius
         self.feature_count = rel_feature_count + irr_feature_count
+        self.irf = irr_feature_count
+        self.file_name = file_name
         self.dim_borders = [[-radius, radius] for d in range(self.feature_count)]
         self.subspaces = []
         self.perf_disc = [{d[1]} for d in self.dim_borders[:rel_feature_count]]
 
     def __repr__(self):
-        return 'CubesGenerator(file_name=' + str(self.file_name) \
-               + ', rel_feature_count=' + str(self.rel_feature_count) \
-               + ', feature_count=' + str(self.feature_count) + ")"
+        return '(file_name=' + str(self.file_name) + ")"
+
+    @abstractmethod
+    def build(self):
+        ...
+
+    @abstractmethod
+    def get_discs(self):
+        ...
+
+    @abstractmethod
+    def get_subspaces(self):
+        ...
+
+
+class XorGenerator(DataGenerator):
+    def __init__(self, rf, irf, radius, rows, sigma, file_name):
+        super().__init__(file_name, rf, irf, radius)
+        self.rows = rows
+        self.slave_features = rf - 1
+        self.sigma = sigma
+        self.perf_disc = [[0, radius] for d in self.dim_borders[:rf]]
+        self.subspaces = [[f for f in range(rf)]]
+
+    def get_discs(self):
+        return self.perf_disc
+
+    def get_subspaces(self):
+        return self.subspaces
+
+    def build(self):
+        r_dims = np.random.uniform(-self.radius, self.radius, (self.rows, self.slave_features)) \
+            if self.slave_features > 0 else np.empty((self.rows, self.slave_features))
+        parity_dim = -(np.sum(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(self.rows, 1) \
+            * np.random.uniform(0, self.radius, (self.rows, 1)) \
+            if self.slave_features > 0 else np.empty((self.rows, 1))
+
+        irr_dims = np.random.uniform(-self.radius, self.radius, (self.rows, self.irf))
+
+        xor_dict = dict()
+        counter = [0]
+        curr = []
+
+        def add_value(r):
+            if r == 0:
+                counter[0] += 1
+                xor_dict["".join([str(i) for i in curr]) + str(sum(curr) % 2)] = counter[0]
+                return
+            for i in [0, 1]:
+                curr.append(i)
+                add_value(r - 1)
+                curr.pop()
+
+        add_value(self.slave_features)
+
+        class_labels = np.apply_along_axis(lambda a: xor_dict["".join([str(d) for d in a])], 1,
+                                           np.concatenate([np.array(r_dims > 0, dtype='int'),
+                                                           (np.sum(r_dims > 0, axis=1) % 2).reshape(self.rows, 1)],
+                                                          axis=1))
+        class_labels = class_labels.reshape([class_labels.shape[0], 1])
+        data = np.concatenate((r_dims, parity_dim, irr_dims, class_labels), axis=1)
+        if self.sigma:
+            e = np.concatenate((np.random.normal(0, self.sigma, (self.rows, self.slave_features + self.irf + 1)),
+                                np.zeros((self.rows, 1))), axis=1)
+            data = data + e
+        return data, self.file_name
+
+
+class CubesGenerator(DataGenerator):
+    def __init__(self, rel_feature_count, irr_feature_count, radius, file_name):
+        super().__init__(file_name, rel_feature_count, irr_feature_count, radius)
+        self.rel_feature_count = rel_feature_count
+        self.cube_parameters = []
 
     def add_cube_parameter(self, cube_param):
         if cube_param.loc is None:
@@ -155,6 +227,10 @@ def produce_cube_generator(rf, irf, interactions, type, cubes, file_name):
     return dg
 
 
+def produce_xor_generator(rf, irf, file_name):
+    return XorGenerator(rf, irf, RADIUS, ROWS, cst.XOR_SIGMA, file_name)
+
+
 def produce_all_data_generators():
     data_generators = []
     global basedir
@@ -168,13 +244,17 @@ def produce_all_data_generators():
     perf_subspaces = dict()
     perf_discs = dict()
 
-    def produce_dg(name, rf, i, type, cubes):
+    def produce_dg(name, interaction_type, rf, i, type, cubes):
-        if os.path.exists(basedir + name) and os.path.exists(
-                perf_disc_dir + 'cut_' + name.replace('csv', 'txt')):
+        if os.path.exists(basedir + name + ".csv") and os.path.exists(perf_disc_dir + 'cut_' + name + ".txt"):
             return
 
-        dg = produce_cube_generator(rf, cst.IRRELEVANT_FEATURES, i, type, cubes, name + ".csv")
+        if interaction_type == cst.InteractionType.CUBES:
+            dg = produce_cube_generator(rf, cst.IRRELEVANT_FEATURES, i, type, cubes, name + ".csv")
+        elif interaction_type == cst.InteractionType.XOR:
+            dg = produce_xor_generator(rf, cst.IRRELEVANT_FEATURES, name + ".csv")
+        else:
+            raise ValueError("no implementation of data generator for " + interaction_type.name)
         perf_discs[name] = dg.get_discs()
         perf_subspaces[name] = dg.get_subspaces()
         data_generators.append(dg)
@@ -214,13 +294,10 @@ def store(data):
 
 
 if __name__ == '__main__':
-    cg = produce_cube_generator(7, 2, 3, 'c', 'bla')
-    print(cg.subspaces)
-    cg = produce_cube_generator(7, 2, 3, 'i', 'bla')
-    print(cg.subspaces)
+    # l.plot_data_3d(produce_xor_generator(3, 0, 'bla').build()[0])
     # print(generate_overlap_partition(7, 3))
-    # generators = produce_all_data_generators()
-    # for g in generators:
-    #
-    #     store(g.build())
+    generators = produce_all_data_generators()
+    for g in generators:
+        store(g.build())
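To make the XorGenerator construction easier to check: each row gets rf - 1 "slave" dimensions drawn uniformly; the last relevant dimension is drawn with a sign chosen so that the count of positive relevant dimensions is always odd (positive iff the slave sign-bit parity is even); and the class label enumerates slave bit patterns. Since the xor_dict recursion assigns labels in lexicographic order of the bit string, the label reduces to the bit pattern read as a binary number, plus one. A simplified, self-contained sketch (irrelevant features and the sigma noise term are omitted; direct binary numbering replaces the recursive dict, which should agree with it up to this simplification):

```python
import numpy as np

def xor_data(rows: int, slave: int, radius: float = 2.0, seed: int = 0):
    """Sketch of XorGenerator.build: slave dims + parity-constrained dim + label."""
    rng = np.random.default_rng(seed)
    r_dims = rng.uniform(-radius, radius, (rows, slave))
    bits = (r_dims > 0).astype(int)            # sign bits of the slave dims
    parity = bits.sum(axis=1) % 2              # XOR of the sign bits
    # Sign of the last relevant dim is forced so the number of positive
    # relevant dims is always odd (positive iff parity is even).
    magnitude = rng.uniform(0, radius, (rows, 1))
    parity_dim = np.where(parity[:, None] == 0, magnitude, -magnitude)
    # xor_dict enumerates bit patterns lexicographically, so its label is
    # just the bit pattern read as a binary number, plus one.
    labels = bits @ (2 ** np.arange(slave - 1, -1, -1)) + 1
    return np.concatenate([r_dims, parity_dim, labels[:, None]], axis=1)

# Three relevant features (two slave dims): labels 1..4, which is what the
# updated plot_data_3d below colors.
print(xor_data(rows=5, slave=2))
```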
diff --git a/discretization_quality_measure.py b/discretization_quality_measure.py
index 7ac065c..2a1a7d5 100644
--- a/discretization_quality_measure.py
+++ b/discretization_quality_measure.py
@@ -166,7 +166,7 @@ def prepare_compression1(experiment_name):
             return False
     return True
 
-def run_compression1(name, rf=None, i=None, type=None, c=None):
+def run_compression1(name, it=None, rf=None, i=None, type=None, c=None):
     # 1. check slim db
     # convert dat-file to db-file if it does not exist
     if not os.path.exists(cst.SLIM_DATA_DIR + name + "/" + name + ".db"):
diff --git a/experiments_logging.py b/experiments_logging.py
index 6e5a877..63a467c 100644
--- a/experiments_logging.py
+++ b/experiments_logging.py
@@ -5,6 +5,7 @@
 from mpl_toolkits.mplot3d import Axes3D
 import data_generation as dg_old
 import util
+import numpy as np
 
 
 def plot_disc(problem, method):
@@ -41,18 +42,17 @@ def plot_data_3d(data):
     ax = fig.add_subplot(111, projection='3d')
 
     # data = data[np.logical_and(data[0] < 0, data[1] > 0)]
-    ## 3d parity problem
-    # color_cond = {'b': np.logical_and(data[0] < 0, np.logical_and(data[1] > 0, data[2] < 0)),
-    #               'k': np.logical_and(data[0] < 0, np.logical_and(data[1] > 0, data[2] > 0)),
-    #               'g': np.logical_and(data[0] > 0, data[1] < 0),
-    #               'r': np.logical_and(data[0] < 0, data[1] < 0),
-    #               'c': np.logical_and(data[0] > 0, data[1] > 0),
-    #               }
-    # for c in color_cond:
-    #     ax.scatter(data[0][color_cond[c]], data[1][color_cond[c]], data[2][color_cond[c]], c=c, s=1)
+    # 3d parity problem
+    color_cond = {'b': data[3] == 1,
+                  'k': data[3] == 2,
+                  'r': data[3] == 3,
+                  'g': data[3] == 4,
+                  }
+    for c in color_cond:
+        ax.scatter(data[0][color_cond[c]], data[1][color_cond[c]], data[2][color_cond[c]], c=c, s=1)
 
     ## without coloring
-    ax.scatter(data[0], data[1], data[2], c='k', s=1)
+    # ax.scatter(data[0], data[1], data[2], c='k', s=1)
 
     ax.set_xlabel('X0')
     ax.set_ylabel('X1')
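The color_cond rewrite above hard-codes four labels, which fits the 3-relevant-feature XOR data (labels 1-4) but nothing larger. A hypothetical, label-count-agnostic variant in the same spirit, in case XOR datasets with more relevant features need inspection:

```python
import matplotlib.pyplot as plt
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers the 3d projection)

def plot_labeled_3d(data: pd.DataFrame, label_col: int = 3):
    # Color the first three dims by whatever labels the given column holds,
    # instead of a fixed four-entry color map.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    cmap = plt.get_cmap('tab10')
    for k, lab in enumerate(sorted(data[label_col].unique())):
        sel = data[label_col] == lab
        ax.scatter(data[0][sel], data[1][sel], data[2][sel],
                   color=cmap(k % 10), s=1, label=str(int(lab)))
    ax.set_xlabel('X0')
    ax.set_ylabel('X1')
    ax.set_zlabel('X2')
    ax.legend()
    plt.show()
```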
diff --git a/main.py b/main.py
index dd4a9e5..f2c3fdd 100644
--- a/main.py
+++ b/main.py
@@ -727,7 +727,7 @@ def prepare(base_dir, data_file, method, time_mark=False, delim=";", columns=Non
 
 
 def collect_experiment_params(base_dir):
-    def collect(name, rf, i, type, c):
+    def collect(name, it, rf, i, type, c):
         params = []
         file_path = cst.DATA_DIR + name + ".csv"
@@ -766,10 +766,13 @@ def collect(name, rf, i, type, c):
 
 
 if __name__ == "__main__":
+
+    # print(compute_predefined_subspace_sets_naive(5))
+    # exit(1)
     # cubes_03_10_c
     # print(compute_predefined_subspace_sets(3, [[0,1,2]]))
     # exit(1)
-    params = collect_experiment_params("logs_test3")
+    params = collect_experiment_params("logs_test")
     # print(params)
     # print(compute_subspace_sets("cubes_10_03_i.csv", cst.Method.PREDEFINED_SUBSPACESETS))
     # exit(1)
diff --git a/runExperiment.py b/runExperiment.py
index 304630d..00994b1 100755
--- a/runExperiment.py
+++ b/runExperiment.py
@@ -94,18 +94,18 @@ def register_ideal_disc(self, name):
 loader = Loader()
 # todo items.put(WHATEVER PARAMETERS OF TASK)
-# params = dg.produce_all_data_generators()
-# for data_generator in params:
-#     items.put(data_generator)
+params = dg.produce_all_data_generators()
+for data_generator in params:
+    items.put(data_generator)
 
-params = main.collect_experiment_params("logs_test")
+# params = main.collect_experiment_params("logs_test")
 if len(params) == 0:
     print("no parameters collected!")
     exit(0)
-for param in params:
-    loader.register_dataset(param.data_file)
-    loader.register_ideal_disc(param.experiment_name)
-    items.put(param)
+# for param in params:
+#     loader.register_dataset(param.data_file)
+#     loader.register_ideal_disc(param.experiment_name)
+#     items.put(param)
 
 if onlyListTasks:
     while not items.empty():
@@ -143,8 +143,8 @@ def worker(worker_id):
             print('Worker ID ', worker_id, 'is executing', para)
             # todo generate data sets
-            # datasets.put(para.build())
-            datasets.put(main.execute(para, loader))
+            datasets.put(para.build())
+            # datasets.put(main.execute(para, loader))
             print('Worker ID ', worker_id, ' execution finished')
             with counterLock:
                 if runningMain:
@@ -161,8 +161,9 @@ def datasetWriter():
     while True:
         try:
             result = datasets.get(block=True, timeout=10)
-            # dg.store(result)
-            main.store(result, loader)
+            # todo store
+            dg.store(result)
+            # main.store(result, loader)
         except queue.Empty:
             break
diff --git a/util.py b/util.py
index e60abd1..4f42e37 100644
--- a/util.py
+++ b/util.py
@@ -23,51 +23,70 @@ def parse_relevant_features(data_file_name):
 
 def collect_params(f):
     params = []
-    # relevant features 2 - 30
-    # for rf in range(cst.RELEVANT_FEATURES_LOWER_BOUND, cst.RELEVANT_FEATURES_UPPER_BOUND):
-    for rf in cst.RELEVANT_FEATURES_RANGE_LIST:
-        # interactions 1 - 10
-        for i in cst.INTERACTION_NUMBER_RANGE_LIST:
-
-            # interaction types
-            # c - 1 interaction out of all relevant features
-            # i - partition of relevant features in i non-overlapping interactions
-            # io - partition of relevant features in i overlapping interactions
-            for t in cst.INTERACTION_TYPES_RANGE_LIST:
-                # cube number in each of the interactions
-                # todo random cube number in the constraints
-                for c in range(cst.CUBES_LOWER_BOUND, cst.CUBES_UPPER_BOUND):
+    for interaction_type in cst.INTERACTION_TYPE_RANGE_LIST:
+        for rf in cst.RELEVANT_FEATURES_RANGE_LIST:
+
+            if interaction_type == cst.InteractionType.XOR:
+                dataset_name = construct_dataset_name(cst.InteractionType.XOR, rf)
+                param = f(dataset_name, interaction_type, rf, None, None, None)
+                if not param:
+                    continue
+                print('collected param:', param)
+                if type(param) == list:
+                    params.extend(param)
+                else:
+                    params.append(param)
+
+            if interaction_type != cst.InteractionType.CUBES:
+                continue
+
+            # interactions 1 - 10
+            for i in cst.INTERACTION_NUMBER_RANGE_LIST:
+
+                # partition types
+                # c - 1 interaction out of all relevant features
+                # i - partition of relevant features in i non-overlapping interactions
+                # io - partition of relevant features in i overlapping interactions
+                for t in cst.PARTITION_TYPES_RANGE_LIST:
                     # only full set of relevant features is possible
                     if t == 'c' and i > 1:
                         continue
                     if (i == 1 or rf / i < 2) and t != 'c':
                         continue
-                    dataset_name = construct_dataset_name(i, rf, t, c)
-                    param = f(dataset_name, rf, i, t, c)
-                    print('collected param:', param)
-                    if not param:
-                        continue
-                    if type(param) == list:
-                        params.extend(param)
-                    else:
-                        params.append(param)
+
+                    # cube number in each of the interactions
+                    # todo random cube number in the constraints
+                    for c in range(cst.CUBES_LOWER_BOUND, cst.CUBES_UPPER_BOUND):
+                        dataset_name = construct_dataset_name(cst.InteractionType.CUBES, rf, i, t, c)
+                        param = f(dataset_name, interaction_type, rf, i, t, c)
+                        if not param:
+                            continue
+                        print('collected param:', param)
+                        if type(param) == list:
+                            params.extend(param)
+                        else:
+                            params.append(param)
     return params
 
 
-def construct_dataset_name(i, rf, t, c):
+def construct_dataset_name(interaction_type, rf, i=None, t=None, c=None):
+    if interaction_type is cst.InteractionType.XOR:
+        return interaction_type.name.lower() + '_' + '{0:02d}'.format(rf)
+
+    assert rf and i and t and c
     if t == 'c':
         # for example, returns cubes_7_3_c.csv where 3 is a number of cubes
-        return 'cubes_' + '{0:02d}'.format(rf) + '_' \
+        return interaction_type.name.lower() + '_' + '{0:02d}'.format(rf) + '_' \
               + '{0:02d}'.format(c) + '_' \
               + t
     if c == 1:
         # for example, returns cubes_7_3_i.csv where 3 is a number of interactions with 1 cube in each
-        return 'cubes_' + '{0:02d}'.format(rf) + '_' \
+        return interaction_type.name.lower() + '_' + '{0:02d}'.format(rf) + '_' \
               + '{0:02d}'.format(i) + '_' \
               + t
     # for example, returns cubes_7_3_2_i.csv where 3 is a number of interactions with 2 cube in each
-    return 'cubes_' + '{0:02d}'.format(rf) + '_' \
+    return interaction_type.name.lower() + '_' + '{0:02d}'.format(rf) + '_' \
           + '{0:02d}'.format(i) + '_' \
           + '{0:02d}'.format(c) + '_' \
           + t
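A quick sanity check of the reworked naming scheme, derived directly from the format strings above (note the in-code examples such as cubes_7_3_c.csv predate the two-digit '{0:02d}' padding, and no .csv extension is appended here):

```python
import constants as cst
from util import construct_dataset_name

# Expected names for typical parameters (rf=7, i=3), per the branches above.
assert construct_dataset_name(cst.InteractionType.XOR, 7) == 'xor_07'
assert construct_dataset_name(cst.InteractionType.CUBES, 7, 3, 'c', 3) == 'cubes_07_03_c'
assert construct_dataset_name(cst.InteractionType.CUBES, 7, 3, 'i', 1) == 'cubes_07_03_i'
assert construct_dataset_name(cst.InteractionType.CUBES, 7, 3, 'i', 2) == 'cubes_07_03_02_i'
print('naming scheme ok')
```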