From f9ee841a5c79938049a2dbc3885e16b89ed0de9c Mon Sep 17 00:00:00 2001
From: Tatiana Dembelova
Date: Thu, 19 Oct 2017 13:25:22 +0200
Subject: [PATCH] added new baseline method, PREDEFINED_SUBSPACESETS_NAIVE

---
 ID_correlation_measure.py         | 63 ++++++++++++++++++++++++++++++
 constants.py                      | 12 ++++--
 data_generation.py                | 15 +++++++
 data_generator.py                 | 42 +++++++++++---------
 discretization_quality_measure.py |  2 +-
 experiments_logging.py            |  6 ++-
 main.py                           | 24 ++++++++++--
 runExperiment.py                  | 15 +++----
 run_classification.py             | 39 +++++++++++++++++++
 uds.py                            |  8 +++-
 util.py                           | 65 +++++++++++++++++++++++--------
 11 files changed, 238 insertions(+), 53 deletions(-)
 create mode 100644 ID_correlation_measure.py
 create mode 100644 run_classification.py

diff --git a/ID_correlation_measure.py b/ID_correlation_measure.py
new file mode 100644
index 0000000..f18d209
--- /dev/null
+++ b/ID_correlation_measure.py
@@ -0,0 +1,63 @@
+import numpy as np
+import experiments_logging as log
+import pandas as pd
+import interaction_distance as id
+import data_generator as dg
+
+def evidence_ID():
+    # no interaction
+    b = np.matrix(np.random.uniform(1, 2, (4000, 1)))
+    back = np.matrix(np.random.uniform(0, 2, (4000, 1)))
+    res = np.append(b, back, axis=0)
+    b1 = np.matrix(np.random.uniform(0, 2, (8000, 1)))
+
+    # either horizontal or vertical tube
+    # all = np.append(b1, res, axis=1)
+    all = np.append(res, b1, axis=1)
+    df = pd.DataFrame(all)
+    df = df.sort_values(by=0).reset_index(drop=True)
+    print(id.compute_ID(df.loc[:100, 1].to_frame(), df.loc[7900:8000, 1].to_frame(), [0, 2]))
+    # log.plot_data_2d(df)
+    # log.plot_data_2d(pd.concat([df.loc[:100], df.loc[7900:8000]], axis=0))
+
+    # cube interaction
+    b = np.matrix(np.random.uniform(1, 2, (4000, 1)))
+    back = np.matrix(np.random.uniform(0, 2, (4000, 1)))
+    res = np.append(b, back, axis=0)
+    b1 = np.matrix(np.random.uniform(1, 2, (4000, 1)))
+    back1 = np.matrix(np.random.uniform(0, 2, (4000, 1)))
+    res1 = np.append(b1, back1, axis=0)
+
+    all = np.append(res, res1, axis=1)
+    df = pd.DataFrame(all)
+    df = df.sort_values(by=0).reset_index(drop=True)
+    print(id.compute_ID(df.loc[:100, 1].to_frame(), df.loc[7900:8000, 1].to_frame(), [0, 2]))
+    # log.plot_data_2d(df)
+    # log.plot_data_2d(pd.concat([df.loc[:100], df.loc[7900:8000]], axis=0))
+
+cg = dg.produce_cube_generator(7, 0, 2, "i", 1, ".csv")  # cubes=1: one cube per interaction, matching the new signature
+data, filename = cg.build()
+print(cg.subspaces)
+print(cg.perf_disc)
+
+data = pd.DataFrame(data)
+dim_count = data.shape[1]
+for curr in data.columns[:-1]:  # every dimension except the last (label) column
+    dims = data.columns.tolist()
+    dims.remove(curr)
+    dims.remove(dim_count - 1)
+    curr_data = data.sort_values(by=curr).reset_index(drop=True).loc[:, dims]
+    rows = curr_data.shape[0]
+    print('curr dimension', curr)
+    for dim in dims:
+        counter = 0
+        ids = []
+        while True:
+            if counter + 280 > rows:
+                break
+            ids.append(id.compute_ID(curr_data.loc[counter:counter + 140, dim].to_frame(),
+                                     curr_data.loc[counter + 140: counter + 280, dim].to_frame(), [2] * dim_count))
+            counter += 1
+        # needs data normalization todo
+        print('interaction with', dim, np.average(ids))
+        # break
\ No newline at end of file
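The sliding-window loop above averages interaction distances over consecutive 140-row windows; the trailing todo notes that the data should be normalized first. A minimal sketch of the kind of min-max scaling the todo seems to call for (the helper name is hypothetical, not part of this patch):

    import pandas as pd

    def min_max_normalize(df: pd.DataFrame) -> pd.DataFrame:
        # scale every column to [0, 1]; guard against constant columns
        span = (df.max() - df.min()).replace(0, 1)
        return (df - df.min()) / span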
diff --git a/constants.py b/constants.py
index a8f79f8..e70c158 100644
--- a/constants.py
+++ b/constants.py
@@ -15,6 +15,7 @@ class Method(Enum):
     PREDEFINED_SUBSPACESETS = 9 # the subspace sets gradually increase the number of dimensions in one of the subspaces chosen randomly; subspace sets are chosen with a step of 2
     PREDEFINED_SUBSPACESETS_SYNCHRONOUS_GREEDY = 13 # the subspace sets gradually increase the number of dimensions in all the subspaces; if a subspace has been used up, it extends to irrelevant dimensions
     PREDEFINED_SUBSPACESETS_SYNCHRONOUS_OPTIMAL = 14 # the subspace sets gradually increase the number of dimensions in all the subspaces; if a subspace has been used up, it stays the same
+    PREDEFINED_SUBSPACESETS_NAIVE = 15 # the dimensions are shuffled and split into equal-sized chunks, one subspace set per chunk size
     PREDEFINED_OPTIMAL_SUBSPACESET = 10
     PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT = 11
@@ -72,9 +73,12 @@ class DistanceMeasure(Enum):
 
 # new settings (more constrained)
 IRRELEVANT_FEATURES_RANGE_LIST = [0, 1, 2, 4, 8, 16, 32, 64, 99, 3, 6, 12, 24, 48, 82] if socket.gethostname() != 'push' else [0, 1, 2, 3]
-RELEVANT_FEATURES_RANGE_LIST = [2, 3, 4, 6, 8, 12, 16, 23, 30] if socket.gethostname() == 'push' else [2]
+RELEVANT_FEATURES_RANGE_LIST = [2, 3, 4, 6, 8, 12, 16, 23, 30] if socket.gethostname() == 'push' else [7]
 INTERACTION_NUMBER_RANGE_LIST = [1, 2, 4, 8, 10] if socket.gethostname() == 'push' else [3]
-INTERACTION_TYPES_RANGE_LIST=["c", 'i', "io"] if socket.gethostname() == 'push' else ['c']
+INTERACTION_TYPES_RANGE_LIST=["c", 'i', "io"] if socket.gethostname() == 'push' else ['i']
+CUBES_LOWER_BOUND=1
+CUBES_UPPER_BOUND=3
+NAIVE_CHUNKS_NUMBER_RANGE_LIST = [2, 3, 4, 5, 10, 20, 30]  # chunk sizes for the naive baseline
 
 
 BASE = '/local/tmp/ipd_extended_experiments2/' if socket.gethostname() == 'push' \
@@ -91,4 +95,6 @@ class DistanceMeasure(Enum):
 SLIM_CONVERT_CONF = SLIM_BASE + "branches/slim/trunk/convertdb.conf"
 
 PRECISION_RECALL_FILENAME = "Precision_recall_runtime.csv"
-COMPRESSION_FILENAME = "Compression.csv"
\ No newline at end of file
+COMPRESSION_FILENAME = "Compression.csv"
+
+WEKA_BIN = "/local/tmp/ipd_extended_experiments2/weka/weka-3-9-1/weka.jar" if socket.gethostname() == 'push' else "/Users/tatyanadembelova/Downloads/weka-3-9-1/weka.jar"
\ No newline at end of file
diff --git a/data_generation.py b/data_generation.py
index cf61838..fe6c696 100644
--- a/data_generation.py
+++ b/data_generation.py
@@ -8,17 +8,32 @@
 
 # synthetic case from uds
 def correlated_data(m, n, sigma, f):
+    # l1 = int(n / 2)
+    # l2 = n - l1
+    # Z = np.random.normal(0, 1, (m, l1))
+    # A = np.matrix(np.random.uniform(1, 2, (l1, l1)))
+    # X1 = Z * A
+    # B = np.matrix(np.random.uniform(1, 2, (l1, l2)))
+    # W = X1 * B
+    # E = np.random.normal(0, sigma, (m, l2))
+    # X2 = f(W) + E
+    # result = np.append(X1, X2, axis=1)
+    # print(result)
+
     l1 = int(n / 2)
     l2 = n - l1
     Z = np.random.normal(0, 1, (m, l1))
     A = np.matrix(np.random.uniform(1, 2, (l1, l1)))
     X1 = Z * A
+    # A = np.matrix(np.random.uniform(1, 2, (m, l1)))
+    # X1 = A
     B = np.matrix(np.random.uniform(1, 2, (l1, l2)))
     W = X1 * B
     E = np.random.normal(0, sigma, (m, l2))
     X2 = f(W) + E
     result = np.append(X1, X2, axis=1)
     print(result)
+    return result
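With the added `return result`, `correlated_data` can now be used directly instead of only printing, as experiments_logging.py and uds.py do further down. A minimal usage sketch (assuming `func3` is one of the link functions already defined in data_generation.py, as its use below suggests):

    import numpy as np
    import data_generation as dg

    # 4000 rows, 2 columns: X1 is a linear mix of Gaussians, X2 = func3(X1 * B) + noise, sigma = 0.1
    data = np.asarray(dg.correlated_data(4000, 2, 0.1, dg.func3))
    print(data.shape)  # (4000, 2)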
generate_overlap_partition(rf, c) + partition = generate_overlap_partition(rf, interactions) else: raise ValueError("no such type!") for p in partition: - location = dict() - for j in p: - location[j] = (random.uniform(0, 1) * (RADIUS * 2 - 1) - RADIUS, CUBE_WIDTH) - dg.add_cube_parameter(CubeParameters(cube_rows, location)) + for cube in range(cubes): + location = dict() + for j in p: + location[j] = (random.uniform(0, 1) * (RADIUS * 2 - 1) - RADIUS, CUBE_WIDTH) + dg.add_cube_parameter(CubeParameters(cube_rows, location)) dg.add_cube_parameter(CubeParameters(cube_rows)) return dg @@ -167,13 +168,13 @@ def produce_all_data_generators(): perf_subspaces = dict() perf_discs = dict() - def produce_dg(name, rf, c, type): + def produce_dg(name, rf, i, type, cubes): - # if os.path.exists(basedir + name) and os.path.exists( - # perf_disc_dir + 'cut_' + name.replace('csv', 'txt')): - # continue + if os.path.exists(basedir + name) and os.path.exists( + perf_disc_dir + 'cut_' + name.replace('csv', 'txt')): + return - dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name + ".csv") + dg = produce_cube_generator(rf, cst.IRRELEVANT_FEATURES, i, type, cubes, name + ".csv") perf_discs[name] = dg.get_discs() perf_subspaces[name] = dg.get_subspaces() data_generators.append(dg) @@ -213,8 +214,13 @@ def store(data): if __name__ == '__main__': + cg = produce_cube_generator(7, 2, 3, 'c', 'bla') + print(cg.subspaces) + cg = produce_cube_generator(7, 2, 3, 'i', 'bla') + print(cg.subspaces) # print(generate_overlap_partition(7, 3)) - generators = produce_all_data_generators() - for g in generators: - store(g.build()) + # generators = produce_all_data_generators() + # for g in generators: + # + # store(g.build()) diff --git a/discretization_quality_measure.py b/discretization_quality_measure.py index 49fe9d7..7ac065c 100644 --- a/discretization_quality_measure.py +++ b/discretization_quality_measure.py @@ -166,7 +166,7 @@ def prepare_compression1(experiment_name): return False return True -def run_compression1(name, rf=None, c=None, type=None): +def run_compression1(name, rf=None, i=None, type=None, c=None): # 1. 
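The new `cubes` parameter of `produce_cube_generator` multiplies the number of generated hypercubes: each of the `interactions` subspace interactions now receives `cubes` cubes, and the record budget is split evenly across all cubes plus the background noise. A quick worked example of the row arithmetic (ROWS = 7000 is an assumed value, not the constant from the repository):

    rows = 7000                   # assumed total record budget
    interactions, cubes = 3, 2    # 3 interactions, 2 cubes each
    cube_rows = rows // (interactions * cubes + 1)
    print(cube_rows)              # 1000 records per cube, and 1000 for the background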
diff --git a/experiments_logging.py b/experiments_logging.py
index 3272d2b..6e5a877 100644
--- a/experiments_logging.py
+++ b/experiments_logging.py
@@ -1,6 +1,7 @@
 import matplotlib.pyplot as plt
 import pandas as pd
 import discretization_quality_measure as dq
+import data_generation as dg
 from mpl_toolkits.mplot3d import Axes3D
 import data_generation as dg_old
 import util
@@ -86,7 +87,7 @@ def save_plot_data_3d(f, data):
 
 
 def plot_data_2d(data):
-    plt.scatter(data[1], data[2], s=1, c='k')
+    plt.scatter(data[0], data[1], s=1, c='k')
     plt.xlabel("dim 0")
     plt.ylabel("dim 1")
     plt.show()
@@ -158,7 +159,8 @@ def get_cuts(disc_intervals):
     # data = pd.read_csv("synthetic_cases/blobs/3d_3_blobs_aligned.csv", delimiter=";", header=None, na_values='?')
     # data = pd.read_csv("new_cubes/cubes_10_100_03_i.csv", delimiter=";", header=None, na_values='?')
     # data = pd.read_csv("new_cubes/cubes_02_03_c.csv", delimiter=";", na_values='?', header=None)
-    data = pd.read_csv('synthetic_cases/uds_new.csv', delimiter=',', header=None)
+    # data = pd.read_csv('synthetic_cases/uds_new.csv', delimiter=',', header=None)
+    data = pd.DataFrame(dg.correlated_data(4000, 2, 0.1, dg.func3))
     # data = pd.DataFrame(dg_old.correlated_data(4000, 3, 0.5, dg_old.func3))
     # data = pd.DataFrame(dg.cubes(4000))
     plot_data_2d(data)
diff --git a/main.py b/main.py
index 14bd5c0..dd4a9e5 100644
--- a/main.py
+++ b/main.py
@@ -447,6 +447,20 @@ def compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subs
     return subspace_sets
 
 
+def compute_predefined_subspace_sets_naive(rel_features):
+    dims = [i for i in range(rel_features + cst.IRRELEVANT_FEATURES)]
+    random.shuffle(dims)
+    subspace_sets = []
+    for chunk_size in cst.NAIVE_CHUNKS_NUMBER_RANGE_LIST:
+        ss = list(util.chunks(dims, chunk_size))
+        # merge the last chunk into the previous subspace if it consists of only 1 dimension
+        if len(ss[-1]) == 1:
+            ss[-2].extend(ss[-1])
+            del ss[-1]
+        subspace_sets.append(ss)
+    return subspace_sets
+
+
 def compute_subspace_sets(data_file_name, method):
     rel_features = util.parse_relevant_features(data_file_name)
     ideal_subspace_set = get_ideal_subspace_set(data_file_name)
@@ -476,9 +490,11 @@ def compute_subspace_sets(data_file_name, method):
         return compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subspace_set, True)
     elif method is cst.Method.PREDEFINED_SUBSPACESETS_SYNCHRONOUS_OPTIMAL:
         return compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subspace_set, False)
+    elif method is cst.Method.PREDEFINED_SUBSPACESETS_NAIVE:
+        return compute_predefined_subspace_sets_naive(rel_features)
     else:
-        raise ValueError("wrong method!")
+        raise ValueError("the method has not been implemented yet! " + str(method))
 
 
 def execute(param, loader=None):
@@ -709,9 +725,9 @@ def prepare(base_dir, data_file, method, time_mark=False, delim=";", columns=Non
     return params
 
 
-def collect_dataset_params(base_dir):
+def collect_experiment_params(base_dir):
 
-    def collect(name, rf, c, type):
+    def collect(name, rf, i, type, c):
         params = []
 
         file_path = cst.DATA_DIR + name + ".csv"
@@ -753,7 +769,7 @@ def collect(name, rf, c, type):
     # cubes_03_10_c
     # print(compute_predefined_subspace_sets(3, [[0,1,2]]))
     # exit(1)
-    params = collect_dataset_params("logs_test3")
+    params = collect_experiment_params("logs_test3")
     # print(params)
     # print(compute_subspace_sets("cubes_10_03_i.csv", cst.Method.PREDEFINED_SUBSPACESETS))
     # exit(1)
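Put together, the naive baseline shuffles all relevant and irrelevant dimensions once and cuts them into fixed-size chunks, producing one subspace set per chunk size. A self-contained sketch of the same logic, re-implemented standalone for illustration (without the cst/util dependencies of the patched function):

    import random

    def naive_subspace_sets(rel_features, irrelevant_features, chunk_sizes):
        dims = list(range(rel_features + irrelevant_features))
        random.shuffle(dims)
        subspace_sets = []
        for n in chunk_sizes:
            ss = [dims[j:j + n] for j in range(0, len(dims), n)]
            if len(ss[-1]) == 1:          # merge a singleton tail into its neighbour
                ss[-2].extend(ss.pop())
            subspace_sets.append(ss)
        return subspace_sets

    # e.g. rel_features=7, IRRELEVANT_FEATURES=2, chunk sizes 2 and 3:
    print(naive_subspace_sets(7, 2, [2, 3]))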
" + method) def execute(param, loader=None): @@ -709,9 +725,9 @@ def prepare(base_dir, data_file, method, time_mark=False, delim=";", columns=Non return params -def collect_dataset_params(base_dir): +def collect_experiment_params(base_dir): - def collect(name, rf, c, type): + def collect(name, rf, i, type, c): params = [] file_path = cst.DATA_DIR + name + ".csv" @@ -753,7 +769,7 @@ def collect(name, rf, c, type): # cubes_03_10_c # print(compute_predefined_subspace_sets(3, [[0,1,2]])) # exit(1) - params = collect_dataset_params("logs_test3") + params = collect_experiment_params("logs_test3") # print(params) # print(compute_subspace_sets("cubes_10_03_i.csv", cst.Method.PREDEFINED_SUBSPACESETS)) # exit(1) diff --git a/runExperiment.py b/runExperiment.py index 25f5109..304630d 100755 --- a/runExperiment.py +++ b/runExperiment.py @@ -18,6 +18,7 @@ import main import pandas as pd import discretization_quality_measure as dqm +import data_generator as dg newRun = None nbThreads = int(multiprocessing.cpu_count() / 2) @@ -48,11 +49,6 @@ items = multiprocessing.Queue() -# todo items.put(WHATEVER PARAMETERS OF TASK) -# data_generators = dg.produce_all_data_generators() -# for data_generator in data_generators: -# items.put(data_generator) - class UnregisteredItem(Exception): pass @@ -97,7 +93,12 @@ def register_ideal_disc(self, name): loader = Loader() - params = main.collect_dataset_params("logs_test") + # todo items.put(WHATEVER PARAMETERS OF TASK) + # params = dg.produce_all_data_generators() + # for data_generator in params: + # items.put(data_generator) + + params = main.collect_experiment_params("logs_test") if len(params) == 0: print("no parameters collected!") exit(0) @@ -160,7 +161,7 @@ def datasetWriter(): while True: try: result = datasets.get(block=True, timeout=10) - # dg.store(dataset) + # dg.store(result) main.store(result, loader) except queue.Empty: break diff --git a/run_classification.py b/run_classification.py new file mode 100644 index 0000000..7c8f9ec --- /dev/null +++ b/run_classification.py @@ -0,0 +1,39 @@ +import constants as cst +import subprocess as sp +import re +import os + + +def run_random_forest1(base_dir_name, experiment_name): + file_path = cst.BASE + base_dir_name + "/" + experiment_name + "/out.arff" + if not os.path.exists(file_path): + return None + try: + output = str(sp.check_output(["java", "-cp", cst.WEKA_BIN, + "weka.classifiers.trees.RandomForest", '-P', '100', '-I', + '100', '-num-slots', '1', '-K', '0', '-M', '1.0', '-V', '0.001', '-S', '1', + "-t", file_path], timeout=30)) + match = re.search('Correctly Classified Instances\s+\d+\s+(\d+\.\d+)\s+%', output) + if match: + return experiment_name + "," + match.group(1) + return experiment_name + ",?" + except sp.TimeoutExpired: + print("timeout exceeded", experiment_name) + return experiment_name + ",?" 
diff --git a/uds.py b/uds.py
index cf79686..f2ddf43 100644
--- a/uds.py
+++ b/uds.py
@@ -6,6 +6,7 @@
 import data_generation as dg
 from correlation_measures.binning import Binning
 from data_generation import correlated_data
+import experiments_logging as log
 
 # bins count
 UDS_BETA = 20
@@ -151,8 +152,10 @@ def compute_uds(data):
 
 
 if __name__ == "__main__":
-    # data = pd.read_csv('synthetic_cases/uds_new.csv', delimiter=',')  # uds_new.csv 0.361766479055
-    data = pd.read_csv('new_cubes/cubes_02_03_c.csv', delimiter=';', header=None)  # uds_new.csv 0.361766479055
+    # data = pd.read_csv('synthetic_cases/uds_new.csv', delimiter=',', header=None)  # uds_new.csv 0.361766479055
+    data = pd.DataFrame(dg.correlated_data(4000, 2, 0.1, dg.func2))
+
+    # data = pd.read_csv('new_cubes/cubes_02_03_c.csv', delimiter=';', header=None)
     data = data.loc[:, :3]
     # data = pd.DataFrame(correlated_data(4000, 20, 10, dg.func2))
     # data = pd.DataFrame(generate_correlated_data(1000, 10, 2, func1))
@@ -161,6 +164,7 @@ def compute_uds(data):
 
     uds = compute_uds(data)
     print(uds)
+    log.plot_data_2d(data)
 
     # print(es)
 
     # compute
diff --git a/util.py b/util.py
index a2b6a3f..e60abd1 100644
--- a/util.py
+++ b/util.py
@@ -26,20 +26,53 @@ def collect_params(f):
     # relevant features 2 - 30
     # for rf in range(cst.RELEVANT_FEATURES_LOWER_BOUND, cst.RELEVANT_FEATURES_UPPER_BOUND):
     for rf in cst.RELEVANT_FEATURES_RANGE_LIST:
-        # cubes 1 - 10
-        # for c in range(cst.CUBES_LOWER_BOUND, cst.CUBES_UPPER_BOUND):
-        for c in cst.INTERACTION_NUMBER_RANGE_LIST:
-            # cube types complete, incomplete, incomplete overlapping
+        # interactions 1 - 10
+        for i in cst.INTERACTION_NUMBER_RANGE_LIST:
+
+            # interaction types:
+            # c - one interaction over all relevant features
+            # i - partition of the relevant features into i non-overlapping interactions
+            # io - partition of the relevant features into i overlapping interactions
             for t in cst.INTERACTION_TYPES_RANGE_LIST:
-                if (c == 1 or rf / c < 2) and t != 'c':
-                    continue
-                dataset_name = 'cubes_' + '{0:02d}'.format(rf) + '_' \
-                               + '{0:02d}'.format(c) + '_' \
-                               + t
-                param = f(dataset_name, rf, c, t)
-                print('collected param:', param)
-                if type(param) == list:
-                    params.extend(param)
-                else:
-                    params.append(param)
-    return params
\ No newline at end of file
+                # cube number in each of the interactions
+                # todo random cube number in the constraints
+                for c in range(cst.CUBES_LOWER_BOUND, cst.CUBES_UPPER_BOUND):
+                    # only the full set of relevant features is possible
+                    if t == 'c' and i > 1:
+                        continue
+                    if (i == 1 or rf / i < 2) and t != 'c':
+                        continue
+                    dataset_name = construct_dataset_name(i, rf, t, c)
+                    param = f(dataset_name, rf, i, t, c)
+                    print('collected param:', param)
+                    if not param:
+                        continue
+                    if type(param) == list:
+                        params.extend(param)
+                    else:
+                        params.append(param)
+    return params
+
+
+def construct_dataset_name(i, rf, t, c):
+    if t == 'c':
+        # e.g. returns cubes_07_03_c.csv, where 3 is the number of cubes
+        return 'cubes_' + '{0:02d}'.format(rf) + '_' \
+               + '{0:02d}'.format(c) + '_' \
+               + t
+    if c == 1:
+        # e.g. returns cubes_07_03_i.csv, where 3 is the number of interactions with 1 cube each
+        return 'cubes_' + '{0:02d}'.format(rf) + '_' \
+               + '{0:02d}'.format(i) + '_' \
+               + t
+
+    # e.g. returns cubes_07_03_02_i.csv, where 3 is the number of interactions with 2 cubes each
+    return 'cubes_' + '{0:02d}'.format(rf) + '_' \
+           + '{0:02d}'.format(i) + '_' \
+           + '{0:02d}'.format(c) + '_' \
+           + t
+
+
+def chunks(l, n):
+    """Yield successive n-sized chunks from l."""
+    for i in range(0, len(l), n):
+        yield l[i:i + n]
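For illustration, the new naming scheme and the chunking helper behave as follows (a usage sketch, assuming util.py is on the import path):

    from util import construct_dataset_name, chunks

    print(construct_dataset_name(3, 7, 'i', 1))   # cubes_07_03_i
    print(construct_dataset_name(3, 7, 'i', 2))   # cubes_07_03_02_i
    print(list(chunks(list(range(7)), 3)))        # [[0, 1, 2], [3, 4, 5], [6]]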