From b5d861f40499c1fde5a7962339d5754dd4fc618a Mon Sep 17 00:00:00 2001 From: Tatiana Dembelova Date: Sat, 14 Oct 2017 17:52:25 +0200 Subject: [PATCH] all the discretization quality measures are calculated and stored in the storing phase --- commands.txt | 8 +- constants.py | 9 +- data_generator.py | 68 ++++--- discretization_quality_measure.py | 276 ++++++++++++++++------------ experiments_logging.py | 29 ++- main.py | 292 +++++++++++++++++------------- old/temp_exp.py | 2 - runExperiment.py | 41 ++++- util.py | 30 ++- 9 files changed, 456 insertions(+), 299 deletions(-) diff --git a/commands.txt b/commands.txt index 9ee0fef..a7b1b0d 100644 --- a/commands.txt +++ b/commands.txt @@ -2,7 +2,7 @@ ssh tdembelo@contact.mmci.uni-saarland.de ssh tdembelo@push.mmci.uni-saarland.de -rsync -av --exclude '.idea/' --exclude '.git' --exclude='logs*' --exclude "*.png" --exclude "data/*" --exclude "tableau/" /Users/tatyanadembelova/Documents/study/thesis/ipd_extended/ tdembelo@badr.mpi-inf.mpg.de:/home/tdembelo/ipd_extended/ +rsync -av --exclude 'ideal_disc/' --exclude 'synthetic_cases/' --exclude '.idea/' --exclude '.git' --exclude='logs*' --exclude "*.png" --exclude "data/*" --exclude "tableau/" --exclude "new_cubes/" /Users/tatyanadembelova/Documents/study/thesis/ipd_extended/ tdembelo@contact.mmci.uni-saarland.de:/home/tdembelo/ipd_extended/ rsync -av tdembelo@badr.mpi-inf.mpg.de:/home/tdembelo/ipd_extended/logs_quality/ /Users/tatyanadembelova/Documents/study/thesis/ipd_extended/logs_quality/ @@ -44,4 +44,8 @@ for f in *_*_T*.csv; do mv $f "${f/*_*_T/T}" ;done for f in *_*_CJS*.csv; do mv $f "${f/*_*_C/C}" ;done for f in *_*_CJS*.csv; do echo mv $f "${f/*_*_C/C}" ;done -rsync -av --exclude 'data*' /Users/tatyanadembelova/Documents/study/thesis/code-fic/ tdembelo@push.mmci.uni-saarland.de:/home/tdembelo/code-fic/ \ No newline at end of file +rsync -av --exclude 'data*' /Users/tatyanadembelova/Documents/study/thesis/code-fic/ tdembelo@push.mmci.uni-saarland.de:/home/tdembelo/code-fic/ + +# slim +./bootstrap.sh +make -Cbuild install \ No newline at end of file diff --git a/constants.py b/constants.py index 341bc46..cc3ebc5 100644 --- a/constants.py +++ b/constants.py @@ -15,7 +15,7 @@ class Method(Enum): PREDEFINED_SUBSPACESETS = 9 PREDEFINED_OPTIMAL_SUBSPACESET = 10 PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT = 11 - FULL = 11 + FULL = 12 class CorrelationMeasure(Enum): @@ -50,7 +50,7 @@ class DistanceMeasure(Enum): SUBSPACE_SET_STEP = 2 # todo change later -IRRELEVANT_FEATURES = 3 +IRRELEVANT_FEATURES = 4 BASE = '/local/tmp/ipd_extended_experiments2/' if socket.gethostname() == 'push' \ else '/Users/tatyanadembelova/Documents/study/thesis/ipd_extended/' @@ -62,4 +62,7 @@ class DistanceMeasure(Enum): SLIM_DATA_DIR = SLIM_BASE + "data/" SLIM_BIN = SLIM_BASE + "branches/slim/trunk/fic" SLIM_COMPRESS_CONF = SLIM_BASE + "branches/slim/trunk/compress.conf" -SLIM_CONVERT_CONF = SLIM_BASE + "branches/slim/trunk/convertdb.conf" \ No newline at end of file +SLIM_CONVERT_CONF = SLIM_BASE + "branches/slim/trunk/convertdb.conf" + +PRECISION_RECALL_FILENAME = "Precision_recall_runtime.csv" +COMPRESSION_FILENAME = "Compression.csv" \ No newline at end of file diff --git a/data_generator.py b/data_generator.py index 7d06e9e..09d60cf 100644 --- a/data_generator.py +++ b/data_generator.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import random +import util import time import os import json @@ -13,6 +14,7 @@ ROWS = 6000 OVERLAP_PROBABILITY = 0.6 + class CubeParameters: def __init__(self, rows, loc=None): 
self.rows = rows @@ -21,13 +23,19 @@ def __init__(self, rows, loc=None): class CubesGenerator: - def __init__(self, feature_count, radius, file_name): + def __init__(self, rel_feature_count, irr_feature_count, radius, file_name): + self.rel_feature_count = rel_feature_count self.file_name = file_name self.cube_parameters = [] - self.feature_count = feature_count - self.dim_borders = [[-radius, radius] for d in range(feature_count)] + self.feature_count = rel_feature_count + irr_feature_count + self.dim_borders = [[-radius, radius] for d in range(self.feature_count)] self.subspaces = [] - self.perf_disc = [{d[1]} for d in self.dim_borders] + self.perf_disc = [{d[1]} for d in self.dim_borders[:rel_feature_count]] + + def __repr__(self): + return 'CubesGenerator(file_name=' + str(self.file_name) \ + + ', rel_feature_count=' + str(self.rel_feature_count) \ + + ', feature_count=' + str(self.feature_count) + ")" def add_cube_parameter(self, cube_param): if cube_param.loc is None: @@ -37,10 +45,11 @@ def add_cube_parameter(self, cube_param): s = list(location_params.keys()) if s and not s in self.subspaces: self.subspaces.append(s) - for feat in range(self.feature_count): + + # perfect discretization + for feat in range(self.rel_feature_count): if feat in cube_param.loc.keys(): dim_params = location_params[feat] - # perfect discretization if dim_params[0] != -RADIUS: self.perf_disc[feat].add(dim_params[0]) self.perf_disc[feat].add(dim_params[0] + dim_params[1]) @@ -61,6 +70,7 @@ def build(self): cube = [] for feat in range(self.feature_count): if feat in location_params.keys(): + assert feat < self.rel_feature_count dim_params = location_params[feat] if dim_params[0] < self.dim_borders[feat][0] \ or dim_params[0] + dim_params[1] > self.dim_borders[feat][1]: @@ -122,9 +132,8 @@ def generate_overlap_partition(rf, c): return partition -def produce_data_generator(rf, irf, c, type, name): - total_f = rf + irf - dg = CubesGenerator(total_f, RADIUS, name) +def produce_data_generator(rf, irf, c, type, file_name): + dg = CubesGenerator(rf, irf, RADIUS, file_name) # same number of records for each of the cubes + background cube_rows = int(ROWS / (c + 1)) if type == 'c': @@ -158,34 +167,21 @@ def produce_all_data_generators(): perf_subspaces = dict() perf_discs = dict() - # relevant features 2 - 30 - for rf in range(2, 3): - # cubes 1 - 10 - for c in range(3, 4): - # cube types complete, incomplete, incomplete overlapping - for type in ['c']: - - # relevant features 2 - 30 - # for rf in range(2, 31): - # # cubes 1 - 10 - # for c in range(1, 11): - # # cube types complete, incomplete, incomplete overlapping - # for type in ['c', 'i', 'io']: - if (c == 1 or rf / c < 2) and type != 'c': - continue - name = 'cubes_' + '{0:02d}'.format(rf) + '_' \ - + '{0:02d}'.format(c) + '_' \ - + type + '.csv' - # if os.path.exists(basedir + name) and os.path.exists( - # perf_disc_dir + 'cut_' + name.replace('csv', 'txt')): - # continue - - dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name) - perf_discs[name] = dg.get_discs() - perf_subspaces[name] = dg.get_subspaces() - data_generators.append(dg) + + def produce_dg(name, rf, c, type): + + # if os.path.exists(basedir + name) and os.path.exists( + # perf_disc_dir + 'cut_' + name.replace('csv', 'txt')): + # continue + + dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name + ".csv") + perf_discs[name] = dg.get_discs() + perf_subspaces[name] = dg.get_subspaces() + data_generators.append(dg) + + util.collect_params(produce_dg) for name in 
perf_discs: - write_cut_file(perf_disc_dir + 'cut_' + name.replace('csv', 'txt'), perf_discs[name]) + write_cut_file(perf_disc_dir + 'cut_' + name + ".txt", perf_discs[name]) with open(perf_subspaces_file, 'w') as psf: json.dump(perf_subspaces, psf) return data_generators diff --git a/discretization_quality_measure.py b/discretization_quality_measure.py index d684b71..7f745b0 100644 --- a/discretization_quality_measure.py +++ b/discretization_quality_measure.py @@ -10,10 +10,11 @@ MAX_DIM_COUNT = 4 -def parse_cuts(name): +def parse_cuts(experiment_name): + name = re.search("(.+?_.+?_.+?_.+?)_", experiment_name).group(1) try: cuts = [] - with open(name, "r") as f: + with open(cst.PERFECT_DISCRETIZATIONS_DIR + "cut_" + name + ".txt", "r") as f: cut = [] for line in f: if line.startswith("dimension"): @@ -138,32 +139,18 @@ def disc_distance(expected_cuts, cuts): # prepare slim db -def prepare_compression1(directory, name): +def prepare_compression1(experiment_name): try: - dims_count = util.parse_relevant_features(directory + "/" + name) - underline_name = name.replace("-", "_") - if not os.path.exists(directory + "/" + underline_name + ".csv/out.txt"): - return - escaped_name = util.get_escaped_name(underline_name) - if not os.path.exists(cst.SLIM_DATA_DIR + escaped_name): - os.makedirs(cst.SLIM_DATA_DIR + escaped_name) - # create a formatted data file - with open(directory + "/" + underline_name + ".csv/" + cst.FILE_DATA_OUTPUT, "r") as init_file: - with open(cst.SLIM_DATA_DIR + escaped_name + "/" + escaped_name + ".dat", "w") as new_file: - row = 0 - for line in init_file: - if line.startswith("@") or line.strip() == "": - continue - split = line.split(",") - new_file.write(" ".join(split[:dims_count]) + " " + split[-1]) - row += 1 - # modify convert.conf + dat_file = cst.SLIM_DATA_DIR + experiment_name + "/" + experiment_name + ".dat" + if not os.path.exists(dat_file): + print("no initial dat-file for experiment", experiment_name) + return False with open(cst.SLIM_CONVERT_CONF, "r+") as conf_file: new_lines = [] for line in conf_file: if line.startswith("dbName"): - line = "dbName = [" + escaped_name + "]\n" + line = "dbName = [" + experiment_name + "]\n" new_lines.append(line) conf_file.seek(0) conf_file.writelines(new_lines) @@ -171,10 +158,69 @@ def prepare_compression1(directory, name): output = sp.check_output([cst.SLIM_BIN, cst.SLIM_CONVERT_CONF]) if "exception" in str(output): - print('exception during preparation for ' + name) + print('exception during preparation for', experiment_name) + return False + + except sp.CalledProcessError: + print('Prepare compression: conversion failed for', experiment_name) + return False + return True + +def run_compression1(name, rf=None, c=None, type=None): + # 1. check slim db + # convert dat-file to db-file if it does not exist + if not os.path.exists(cst.SLIM_DATA_DIR + name + "/" + name + ".db"): + if not prepare_compression1(name): + print("run_compression failed for", name) + return [name, "", ""] + + # 2. modify compress.conf + with open(cst.SLIM_COMPRESS_CONF, "r+") as conf_file: + new_lines = [] + for line in conf_file: + if line.startswith("iscName"): + line = "iscName = " + name + "-all-1d\n" + new_lines.append(line) + conf_file.seek(0) + conf_file.writelines(new_lines) + conf_file.truncate() + # 3. 
compress it + output = None + try: + output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=60)) + except sp.TimeoutExpired: + timeout_counter = 0 + while timeout_counter < 5: + try: + output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=60)) + break + except sp.TimeoutExpired: + timeout_counter += 1 + if not output: + print("timeout exceeded " + str(timeout_counter) + " times for " + name) + return [name, "", ""] except sp.CalledProcessError: - print('Prepare compression: conversion failed for ' + name) + return [name, "", ""] + + search_start = re.search('Start:\\\\t\\\\t.+?,(\d+)\)', output) + if search_start: + start_comp = search_start.group(1) + else: + print("compression start is not found", name) + start_comp = "" + search_end = re.search('Result:\\\\t\\\\t.+?,(\d+)\)', output) + if search_end: + result_comp = search_end.group(1) + else: + print("compression end is not found", name) + result_comp = "" + return [name, start_comp, result_comp] + + +def run_compression(): + results = util.collect_params(run_compression1) + return results # returns runtime in seconds and mdl of compression @@ -365,93 +411,95 @@ def disc_f1(expected, current): if __name__ == '__main__': - if len(sys.argv) == 1: - print( - 'Usage: discretization_quality_measure.py ' - '-p= ' - '-m=<[original|greedy_topk|trivial|...]> ' - '-cor=<[uds]> ' - '-dist=<[id, cjs]> ' - '-t= ' - '-r= ') - command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID' - print('Running default: ', command) - command_list = command.split(' ') - else: - command_list = sys.argv[1:] - - problem_arg = list(filter(lambda x: x.startswith("-p="), command_list)) - # if not problem_arg: - # raise ValueError('No problem provided!') - base_dir_arg = list(filter(lambda x: x.startswith("-b="), command_list)) - if not base_dir_arg: - raise ValueError('No logs base dir provided!') - method_arg = list(filter(lambda x: x.startswith("-m="), command_list)) - # if not method_arg: - # raise ValueError('No method provided!') - distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list)) - # if not distance_measure_arg: - # raise ValueError('No distance measure provided!') - threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list)) - # if not threshold_arg: - # raise ValueError('No threshold provided!') - # irr_feat_start_arg = list(filter(lambda x: x.startswith("-is="), command_list)) - # irr_feat_end_arg = list(filter(lambda x: x.startswith("-ie="), command_list)) - - base_dir = base_dir_arg[0].replace('-b=', '') - if not os.path.exists(base_dir): - os.makedirs(base_dir) - if problem_arg: - problem = problem_arg[0].replace('-p=', '') - if method_arg: - method = cst.Method[method_arg[0].replace('-m=', '').upper()] - if distance_measure_arg: - distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()] - if threshold_arg: - threshold = float(threshold_arg[0].replace('-t=', '')) - - problems = [ - # "2d_3_cubes_aligned_xor", - # "2d_2_cubes_aligned", - # "2d_2_cubes_xor", - # "3d_2_cubes_aligned", - # "3d_2_cubes_xor", - # "3d_3_cubes_aligned", - # "3d_3_cubes_aligned_xor", - # "3d_3_cubes_xor", - # "3d_4_cubes_1_aligned_xor", - # "3d_4_cubes_2_aligned", - # "3d_4_cubes_xor", - # "4d_2_cubes_aligned", - # "4d_3_cubes_aligned_xor", - # "4d_3_cubes_xor", - # "4d_4_cubes_aligned_xor", - # "4d_4_cubes_2_aligned", - "4d_4_cubes_xor", - ] - - runtime = [] - perf = [] - compression = [] - - cols = ['run-dim', 'precision', 
'recall'] - runtime_cols = ['run', 'subspace mining runtime', 'full runtime'] - compression_cols = ['run', 'start compression', 'result compression'] - - disc_distances = [] - for problem in problems: - print('problem:', problem) - - for method in [cst.Method.TRIVIAL, cst.Method.ORIGINAL, cst.Method.PREDEFINED]: - # for method in [cst.Method.PERFECT]: - print('method:', method) - data = compute_problem_quality_measure(base_dir, problem, method=method) - if not data: - continue - runtime.extend(data[0]) - perf.extend(data[1]) - compression.extend(data[2]) - time = util.now() - pd.DataFrame(perf, columns=cols).to_csv(base_dir + "/Precision_recall_" + time + ".csv") - pd.DataFrame(runtime, columns=runtime_cols).to_csv(base_dir + "/Discretization_runtimes_" + time + ".csv") - pd.DataFrame(compression, columns=compression_cols).to_csv(base_dir + "/Compression_" + time + ".csv") + # compression and classification quality measures + run_compression() + # if len(sys.argv) == 1: + # print( + # 'Usage: discretization_quality_measure.py ' + # '-p= ' + # '-m=<[original|greedy_topk|trivial|...]> ' + # '-cor=<[uds]> ' + # '-dist=<[id, cjs]> ' + # '-t= ' + # '-r= ') + # command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID' + # print('Running default: ', command) + # command_list = command.split(' ') + # else: + # command_list = sys.argv[1:] + # + # problem_arg = list(filter(lambda x: x.startswith("-p="), command_list)) + # # if not problem_arg: + # # raise ValueError('No problem provided!') + # base_dir_arg = list(filter(lambda x: x.startswith("-b="), command_list)) + # if not base_dir_arg: + # raise ValueError('No logs base dir provided!') + # method_arg = list(filter(lambda x: x.startswith("-m="), command_list)) + # # if not method_arg: + # # raise ValueError('No method provided!') + # distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list)) + # # if not distance_measure_arg: + # # raise ValueError('No distance measure provided!') + # threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list)) + # # if not threshold_arg: + # # raise ValueError('No threshold provided!') + # # irr_feat_start_arg = list(filter(lambda x: x.startswith("-is="), command_list)) + # # irr_feat_end_arg = list(filter(lambda x: x.startswith("-ie="), command_list)) + # + # base_dir = base_dir_arg[0].replace('-b=', '') + # if not os.path.exists(base_dir): + # os.makedirs(base_dir) + # if problem_arg: + # problem = problem_arg[0].replace('-p=', '') + # if method_arg: + # method = cst.Method[method_arg[0].replace('-m=', '').upper()] + # if distance_measure_arg: + # distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()] + # if threshold_arg: + # threshold = float(threshold_arg[0].replace('-t=', '')) + # + # problems = [ + # # "2d_3_cubes_aligned_xor", + # # "2d_2_cubes_aligned", + # # "2d_2_cubes_xor", + # # "3d_2_cubes_aligned", + # # "3d_2_cubes_xor", + # # "3d_3_cubes_aligned", + # # "3d_3_cubes_aligned_xor", + # # "3d_3_cubes_xor", + # # "3d_4_cubes_1_aligned_xor", + # # "3d_4_cubes_2_aligned", + # # "3d_4_cubes_xor", + # # "4d_2_cubes_aligned", + # # "4d_3_cubes_aligned_xor", + # # "4d_3_cubes_xor", + # # "4d_4_cubes_aligned_xor", + # # "4d_4_cubes_2_aligned", + # "4d_4_cubes_xor", + # ] + # + # runtime = [] + # perf = [] + # compression = [] + # + # cols = ['run-dim', 'precision', 'recall'] + # runtime_cols = ['run', 'subspace mining runtime', 'full runtime'] + # compression_cols = ['run', 'start compression', 'result 
compression'] + # + # disc_distances = [] + # for problem in problems: + # print('problem:', problem) + # + # for method in [cst.Method.TRIVIAL, cst.Method.ORIGINAL, cst.Method.PREDEFINED]: + # # for method in [cst.Method.PERFECT]: + # print('method:', method) + # data = compute_problem_quality_measure(base_dir, problem, method=method) + # if not data: + # continue + # runtime.extend(data[0]) + # perf.extend(data[1]) + # compression.extend(data[2]) + # time = util.now() + # pd.DataFrame(perf, columns=cols).to_csv(base_dir + "/Precision_recall_" + time + ".csv") + # pd.DataFrame(runtime, columns=runtime_cols).to_csv(base_dir + "/Discretization_runtimes_" + time + ".csv") + # pd.DataFrame(compression, columns=compression_cols).to_csv(base_dir + "/Compression_" + time + ".csv") diff --git a/experiments_logging.py b/experiments_logging.py index b3762fd..d9b904f 100644 --- a/experiments_logging.py +++ b/experiments_logging.py @@ -115,18 +115,29 @@ def write_out_file(problem, disc_intervals, disc_points, class_labels): return lines -def write_outdat_file(disc_intervals, disc_points, class_labels, relevant_features): - lines = [] +def get_out_files(experiment_name, disc_intervals, disc_points, class_labels, relevant_features): + dat_lines = [] + arff_lines = ['@relation ' + experiment_name + "\n\n"] + counter = [1] - for i in range(len(disc_intervals)): + for i in range(relevant_features): + arff_lines.append( + '@attribute dim' + str(i) + ' {' + ','.join([str(j + counter[-1]) for j in disc_intervals[i]]) + '}\n') counter.append(counter[-1] + len(disc_intervals[i])) + arff_lines.append('@attribute class {' + ','.join(['"' + str(i) + '"' for i in class_labels.unique()]) + '}\n\n') + arff_lines.append('@data\n') + for i in range(len(disc_points[0])): - line = ' '.join([str(disc_points[j][i] + counter[j]) for j in range(relevant_features)]) - lines.append(line + " " + str(class_labels[i]) + '\n') - return lines + values = [str(disc_points[j][i] + counter[j]) for j in range(relevant_features)] + dat_line = ' '.join(values) + dat_lines.append(dat_line + " " + str(class_labels[i]) + '\n') + + arff_line = ",".join(values) + arff_lines.append(arff_line + ',"' + str(class_labels[i]) + '"\n') + return dat_lines, arff_lines -def write_cut_file(disc_intervals): +def get_cut_file(disc_intervals): lines = [] for i in range(len(disc_intervals)): lines.append('dimension ' + str(i) + ' (' + str(len(disc_intervals[i])) + ' bins)\n') @@ -136,6 +147,10 @@ def write_cut_file(disc_intervals): return lines +def get_cuts(disc_intervals): + return [[disc_intervals[i][bin][1] for bin in disc_intervals[i]] for i in range(len(disc_intervals))] + + if __name__ == '__main__': # rows = 20000 # data = np.concatenate((synthetic_cube_in_cube(rows, 2, 0), np.zeros((rows, 1))), axis=1) diff --git a/main.py b/main.py index 6bd9931..442b6db 100644 --- a/main.py +++ b/main.py @@ -21,7 +21,7 @@ import experiments_logging as el from merging import dynamic_merging import cjs -import discretization_quality_measure as dq +import discretization_quality_measure as dqm import json import random import traceback @@ -77,7 +77,7 @@ def compute_distances(bin_map, curr, data, dim_maxes, log=None): if method == cst.Method.FULL: return (id.compute_IDs(bin_map, curr, data, dim_maxes) if distance_measure == cst.DistanceMeasure.ID - else cjs.compute_CJSs(bin_map, curr, data, dim_maxes)), 0 + else cjs.compute_CJSs(bin_map, curr, data, dim_maxes)), 0 if method.name.startswith("SM"): subspace_mining_start = time.time() if method == cst.Method.GREEDY_TOPK: 
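Note on the new get_out_files helper in experiments_logging.py above: it renders the
discretized data twice, once as a SLIM .dat file (space-separated item ids) and once as
a Weka-style .arff file. A minimal usage sketch with toy inputs -- all names and values
below are illustrative, inferred from the function body, and not part of this patch:

    import pandas as pd
    import experiments_logging as el

    # two relevant features, two bins each; intervals keyed by bin id
    disc_intervals = [{0: (-2.0, 0.0), 1: (0.0, 2.0)},
                      {0: (-2.0, 1.0), 1: (1.0, 2.0)}]
    disc_points = [[0, 1, 1], [0, 0, 1]]  # bin index per row, per dimension
    class_labels = pd.Series([0, 1, 1])

    dat, arff = el.get_out_files("toy_experiment", disc_intervals, disc_points,
                                 class_labels, relevant_features=2)
    # dat[0]  == '1 3 0\n'    (bin ids are offset per dimension so they are
    #                          globally unique item ids for SLIM)
    # arff[-1] == '2,4,"1"\n' (same codes, comma-separated, class label quoted)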
@@ -105,7 +105,7 @@ def compute_distances(bin_map, curr, data, dim_maxes, dim_maxes = dim_maxes[curr_subspace] return (id.compute_IDs1(bin_map, data, dim_maxes) if distance_measure == cst.DistanceMeasure.ID - else cjs.compute_CJSs1(bin_map, data, dim_maxes)), sm_runtime + else cjs.compute_CJSs1(bin_map, data, dim_maxes)), sm_runtime def compute_IPD(data, rel_features_count, method=cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET, cor_measure=None, @@ -136,7 +136,7 @@ def compute_IPD(data, rel_features_count, method=cst.Method.PREDEFINED_OPTIMAL_S disc_macro_intervals = [] disc_points = [] - subspace_map = get_map_from_subspace_set(subspace_set) + subspace_map = get_map_from_subspace_set(subspace_set) if subspace_set else None distancez = [] # iterate over all the dimensions full_sm_runtime = 0 @@ -372,7 +372,7 @@ def _compute_subspaces(dims, sets): # todo return list of dictionaries def get_ideal_subspace_set(data_file_name): # todo naive implementation - return ideal.get(data_file_name) + return ideal.get(data_file_name.replace(".csv", "")) def get_map_from_subspace_set(subspace_set): @@ -402,6 +402,12 @@ def compute_subspace_sets(data_file_name, method): rss = [ideal_subspace + [rf for rf in range(rel_features, irr)] for ideal_subspace in ideal_subspace_set] redundant_subspace_sets.append(rss) + if cst.IRRELEVANT_FEATURES % 2 == 0: + rss = [ideal_subspace + [rf for rf in range(rel_features, cst.IRRELEVANT_FEATURES + rel_features)] for + ideal_subspace in + ideal_subspace_set] + redundant_subspace_sets.append(rss) + return redundant_subspace_sets elif method is cst.Method.PREDEFINED_SUBSPACESETS: @@ -503,13 +509,16 @@ def execute(param, loader=None): # plot_distances(dir, distances, disc_intervals) # output file for classification measurements - outdat_file_content = el.write_outdat_file(disc_intervals, disc_points, class_labels, relevant_features) + outdat, outarff = el.get_out_files(experiment_name, disc_intervals, disc_points, class_labels, + relevant_features) # output file for compression measurements # slim_dat_content = dq.prepare_slim_dat(base_dir, experiment_name) - cut_file_content = el.write_cut_file(disc_intervals) - return Result(base_dir, experiment_name, outdat_file_content, cut_file_content, runtime, sm_runtime, - init_bins_count) + cut = el.get_cuts(disc_intervals) + cut_file_content = el.get_cut_file(disc_intervals) + return Result(base_dir, experiment_name, outdat, outarff, cut, cut_file_content, runtime, sm_runtime, + init_bins_count, relevant_features) + # return Result(base_dir, experiment_name, None, None, None, None, None, None, relevant_features) except: print("Error in " + experiment_name + ":", sys.exc_info()[0], sys.exc_info()[1]) traceback.print_exception(sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2], @@ -521,27 +530,52 @@ def execute(param, loader=None): class Result: - def __init__(self, base_dir, experiment_name, outdat_file_content, cut_file_content, runtime, sm_runtime, - initial_bin_count): + def __init__(self, base_dir, experiment_name, outdat_file_content, outarff_file_content, cut, cut_file_content, runtime, sm_runtime, + initial_bin_count, rel_feature_count): + self.cut_file_content = cut_file_content + self.outarff_file_content = outarff_file_content + self.rel_feature_count = rel_feature_count self.sm_runtime = sm_runtime self.initial_bin_count = initial_bin_count self.runtime = runtime self.base_dir = base_dir self.experiment_name = experiment_name - self.cut_file_content = cut_file_content + self.cut = cut self.outdat_file_content = 
outdat_file_content - self.dir = base_dir + '/' + experiment_name + "/" + self.dir = base_dir + experiment_name + "/" + def __repr__(self): + return "Result(experiment_name=" + self.experiment_name + ")" -def store(result): + +def append_to_quality_measure_files(result, loader): + assert type(result) is Result + measure_file = result.base_dir + cst.PRECISION_RECALL_FILENAME + ideal_cuts = loader.load_ideal_disc(result.experiment_name) if loader else dqm.parse_cuts(result.experiment_name) + with open(measure_file, "a") as f: + for i in range(result.rel_feature_count): + f.write(",".join([result.experiment_name + "-dim" + str(i + 1), + str(dqm.disc_precision(ideal_cuts[i], result.cut[i])), + str(dqm.disc_recall(ideal_cuts[i], result.cut[i])), + str(result.sm_runtime), + str(result.runtime)])) + f.write("\n") + return + + +def append_to_compression_files(result): + measure_file = result.base_dir + cst.COMPRESSION_FILENAME + with open(measure_file, "a") as f: + f.write(",".join(dqm.run_compression1(result.experiment_name))) + f.write("\n") + + +def store(result, loader=None): if not result: return assert type(result) is Result print('storing experiment', result.experiment_name) - if not os.path.exists(result.base_dir): - os.makedirs(result.base_dir) - if not os.path.exists(result.dir): os.makedirs(result.dir) @@ -549,12 +583,19 @@ def store(result): f.write("initial bins count: " + str(result.initial_bin_count) + "\n") f.write("runtime " + str(result.runtime) + " seconds\n") f.write("sm runtime " + str(result.sm_runtime) + " seconds\n") + + append_to_quality_measure_files(result, loader) + if not os.path.exists(cst.SLIM_DATA_DIR + result.experiment_name): os.makedirs(cst.SLIM_DATA_DIR + result.experiment_name) - with open(cst.SLIM_DATA_DIR + result.experiment_name + "/" + result.experiment_name + ".dat", "w") as f: f.writelines(result.outdat_file_content) + append_to_compression_files(result) + + with open(result.dir + cst.FILE_DATA_OUTPUT, "w") as f: + f.writelines(result.outarff_file_content) + with open(result.dir + cst.FILE_DATA_CUTS, "w") as f: f.writelines(result.cut_file_content) @@ -574,10 +615,14 @@ def __init__(self, base_dir, experiment_name, method, data_file, delim, columns, self.experiment_name = experiment_name self.base_dir = base_dir + def __repr__(self): + return "RunParams(experiment_name=" + self.experiment_name +\ + ", subspace_set=" + str(self.subspace_set) + ")" + def prepare(base_dir, data_file, method, time_mark=False, delim=";", columns=None, rows=None, distance_measure=cst.DistanceMeasure.ID, - cor_measure=None, threshold=cst.ID_THRESHOLD_QUANTILE): + cor_measure=cst.CorrelationMeasure.UDS, threshold=cst.ID_THRESHOLD_QUANTILE): params = [] # # defining prefix for the output files data_file_name = util.get_file_name(data_file) @@ -588,19 +633,21 @@ def prepare(base_dir, data_file, method, time_mark=False, delim=";", columns=Non else: subspace_sets = None - base_dir = cst.BASE + base_dir - if not os.path.exists(base_dir): - os.makedirs(base_dir) + base_dir = cst.BASE + base_dir + "/" # full, trivial, SM methods if not method.name.startswith("PREDEFINED"): experiment_name = data_file_name.replace(".csv", "") + ("_" + str(columns) + "c" if columns else "") + ( "_" + str(rows) + "r" if rows else "") + "_" \ - + method.name.replace("_", "") + + method.name.replace("_", "") \ + + ("_" + cor_measure.name if method.name.startswith("SM") else "") timed_name = (util.now() if time_mark else "") + ("_" if time_mark else "") + experiment_name - params.append( - RunParams(base_dir, 
timed_name, method, data_file, delim, columns, rows, distance_measure, threshold, - cor_measure, None)) - print("prepared parameters for", experiment_name) + if not os.path.exists(base_dir + timed_name): + params.append( + RunParams(base_dir, timed_name, method, data_file, delim, columns, rows, distance_measure, threshold, + cor_measure, None)) + print("prepared parameters for", experiment_name) + else: + print("experiment", experiment_name, "has already been processed") # predefined subspace sets else: @@ -615,6 +662,9 @@ def prepare(base_dir, data_file, method, time_mark=False, delim=";", columns=Non counter) if method is cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT else "") counter += 1 timed_name = (util.now() + "_" if time_mark else "") + experiment_name + if os.path.exists(base_dir + timed_name): + print("experiment", experiment_name, "has already been processed") + continue params.append( RunParams(base_dir, timed_name, method, data_file, delim, columns, rows, distance_measure, threshold, cor_measure, subspace_set)) @@ -623,107 +673,103 @@ def prepare(base_dir, data_file, method, time_mark=False, delim=";", columns=Non def collect_params(base_dir): - params = [] - # relevant features 2 - 30 - for rf in range(2, 31): - # cubes 1 - 10 - for c in range(1, 11): - # cube types complete, incomplete, incomplete overlapping - for type in ['c', 'i', 'io']: - # for rf in range(2, 3): - # # cubes 1 - 10 - # for c in range(3, 4): - # # cube types complete, incomplete, incomplete overlapping - # for type in ['c']: - if (c == 1 or rf / c < 2) and type != 'c': - continue - filepath = cst.DATA_DIR + 'cubes_' + '{0:02d}'.format(rf) + '_' \ - + '{0:02d}'.format(c) + '_' \ - + type + '.csv' - for method in [cst.Method.PREDEFINED_SUBSPACESETS, - cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT, - cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET - ]: - params.extend(prepare(base_dir, filepath, method)) - return params + + def collect(name, rf, c, type): + params = [] + file_path = cst.DATA_DIR + name + ".csv" + for method in [ + cst.Method.PREDEFINED_SUBSPACESETS, + cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT, + cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET, + cst.Method.TRIVIAL, + cst.Method.FULL + ]: + print("preparing", name, method) + params.extend(prepare(base_dir, file_path, method)) + return params + + return util.collect_params(collect) if __name__ == "__main__": - params = collect_params("logs_test") + # params = collect_params("logs_test") + # print(params) # print(compute_subspace_sets("cubes_10_03_i.csv", cst.Method.PREDEFINED_SUBSPACESETS)) # exit(1) - # if len(sys.argv) == 1: - # # print( - # # 'Usage: main.py ' - # # '-b= ' - # # '-f= ' - # # '-d= ' - # # '-c= ' - # # '-m=<[original|greedy_topk|trivial|...]> ' - # # '-cor=<[uds]> ' - # # '-dist=<[id, cjs]> ' - # # '-t= ' - # # '-s[=] ' - # # '-r= ') - # # command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID' - # # print('Running default: ', command) - # # command_list = command.split(' ') - # raise ValueError("no arguments passed!") - # else: - # command_list = sys.argv[1:] - # - # file_arg = list(filter(lambda x: x.startswith("-f="), command_list)) - # if not file_arg: - # raise ValueError('No data file provided!') - # base_dir_arg = list(filter(lambda x: x.startswith("-b="), command_list)) - # if not base_dir_arg: - # raise ValueError('No logs base dir provided!') - # time_mark = len(list(filter(lambda x: x.startswith("-time"), command_list))) != 0 - # delim_arg = list(filter(lambda 
x: x.startswith("-d="), command_list)) - # columns_arg = list(filter(lambda x: x.startswith("-c="), command_list)) - # rows_arg = list(filter(lambda x: x.startswith("-r="), command_list)) - # method_arg = list(filter(lambda x: x.startswith("-m="), command_list)) - # corr_measure_arg = list(filter(lambda x: x.startswith("-cor="), command_list)) - # distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list)) - # threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list)) - # - # data_file = file_arg[0].replace('-f=', '') - # base_dir = base_dir_arg[0].replace('-b=', '') - # - # if delim_arg: - # delimiter = delim_arg[0].replace('-d=', '') - # else: - # print('using default delimiter ;') - # delimiter = ';' - # columns = int(columns_arg[0].replace('-c=', '')) if columns_arg else None - # rows = int(rows_arg[0].replace('-r=', '')) if rows_arg else None - # if method_arg: - # method = cst.Method[method_arg[0].replace('-m=', '').upper()] - # else: - # print('using default method PREDEFINED_OPTIMAL_SUBSPACESET') - # method = cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET - # - # cor_measure = cst.CorrelationMeasure[corr_measure_arg[0].replace('-cor=', '').upper()] if corr_measure_arg \ - # else None - # if method.name.startswith("SM") and cor_measure is None: - # raise ValueError('A correlation measure should be given!') - # - # if distance_measure_arg: - # distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()] - # print('using distance measure ' + distance_measure.name) - # else: - # distance_measure = cst.DistanceMeasure.ID - # print('using default distance measure ID') - # if threshold_arg: - # threshold = float(threshold_arg[0].replace('-t=', '')) - # - # print('using ID_THRESHOLD_QUANTILE = ', str(threshold)) - # else: - # threshold = cst.ID_THRESHOLD_QUANTILE - # print('using default ID_THRESHOLD_QUANTILE = ', str(threshold)) - # - # params = prepare(base_dir, data_file, method, time_mark, delimiter, columns, rows, distance_measure, cor_measure, - # threshold) + + if len(sys.argv) == 1: + # print( + # 'Usage: main.py ' + # '-b= ' + # '-f= ' + # '-d= ' + # '-c= ' + # '-m=<[original|greedy_topk|trivial|...]> ' + # '-cor=<[uds]> ' + # '-dist=<[id, cjs]> ' + # '-t= ' + # '-s[=] ' + # '-r= ') + # command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID' + # print('Running default: ', command) + # command_list = command.split(' ') + raise ValueError("no arguments passed!") + else: + command_list = sys.argv[1:] + + file_arg = list(filter(lambda x: x.startswith("-f="), command_list)) + if not file_arg: + raise ValueError('No data file provided!') + base_dir_arg = list(filter(lambda x: x.startswith("-b="), command_list)) + if not base_dir_arg: + raise ValueError('No logs base dir provided!') + time_mark = len(list(filter(lambda x: x.startswith("-time"), command_list))) != 0 + delim_arg = list(filter(lambda x: x.startswith("-d="), command_list)) + columns_arg = list(filter(lambda x: x.startswith("-c="), command_list)) + rows_arg = list(filter(lambda x: x.startswith("-r="), command_list)) + method_arg = list(filter(lambda x: x.startswith("-m="), command_list)) + corr_measure_arg = list(filter(lambda x: x.startswith("-cor="), command_list)) + distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list)) + threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list)) + + data_file = file_arg[0].replace('-f=', '') + base_dir = base_dir_arg[0].replace('-b=', '') + + if 
delim_arg: + delimiter = delim_arg[0].replace('-d=', '') + else: + print('using default delimiter ;') + delimiter = ';' + columns = int(columns_arg[0].replace('-c=', '')) if columns_arg else None + rows = int(rows_arg[0].replace('-r=', '')) if rows_arg else None + if method_arg: + method = cst.Method[method_arg[0].replace('-m=', '').upper()] + else: + print('using default method PREDEFINED_OPTIMAL_SUBSPACESET') + method = cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET + + cor_measure = cst.CorrelationMeasure[corr_measure_arg[0].replace('-cor=', '').upper()] if corr_measure_arg \ + else None + if method.name.startswith("SM") and cor_measure is None: + raise ValueError('A correlation measure should be given!') + + if distance_measure_arg: + distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()] + print('using distance measure ' + distance_measure.name) + else: + distance_measure = cst.DistanceMeasure.ID + print('using default distance measure ID') + if threshold_arg: + threshold = float(threshold_arg[0].replace('-t=', '')) + + print('using ID_THRESHOLD_QUANTILE = ', str(threshold)) + else: + threshold = cst.ID_THRESHOLD_QUANTILE + print('using default ID_THRESHOLD_QUANTILE = ', str(threshold)) + + params = prepare(base_dir, data_file, method, time_mark, delimiter, columns, rows, distance_measure, cor_measure, + threshold) + for p in params: result = execute(p) diff --git a/old/temp_exp.py b/old/temp_exp.py index 7b15f62..a83d391 100644 --- a/old/temp_exp.py +++ b/old/temp_exp.py @@ -12,8 +12,6 @@ import interaction_distance as id import util from correlation_measures.binning import Binning -from experiments_logging import write_out_file, write_cut_file -from merging import dynamic_merging # ------------------------------------------------------ diff --git a/runExperiment.py b/runExperiment.py index bf548f0..02baa5b 100755 --- a/runExperiment.py +++ b/runExperiment.py @@ -17,6 +17,7 @@ import psutil import main import pandas as pd +import discretization_quality_measure as dqm newRun = None nbThreads = int(multiprocessing.cpu_count() / 2) @@ -52,37 +53,57 @@ # for data_generator in data_generators: # items.put(data_generator) -class UnregisteredDataset(Exception): +class UnregisteredItem(Exception): pass with multiprocessing.Manager() as manager: class Loader(): def __init__(self): self.dataset = manager.dict() + self.ideal_discs = manager.dict() self.global_lock = multiprocessing.RLock() self.dataset_locks = {}#manager.dict() + self.ideal_disc_locks = {}#manager.dict() + + def load_ideal_disc(self, name): + if not name in self.ideal_disc_locks: + raise UnregisteredItem('Unregistered ideal discretization shall be loaded ', name) + + with self.ideal_disc_locks[name]: + if not name in self.ideal_discs: + self.ideal_discs[name] = dqm.parse_cuts(name) + return self.ideal_discs[name] def load_dataset(self, path, delim): if not path in self.dataset_locks: - raise UnregisteredDataset('Unregistered dataset shall be loaded ', path) + raise UnregisteredItem('Unregistered dataset shall be loaded ', path) with self.dataset_locks[path]: if not path in self.dataset: self.dataset[path] = pd.read_csv(path, delimiter=delim, header=None, na_values='?') return self.dataset[path] + def register_dataset(self, path): with self.global_lock: if path not in self.dataset_locks: self.dataset_locks[path] = multiprocessing.RLock() + def register_ideal_disc(self, name): + with self.global_lock: + if name not in self.ideal_disc_locks: + self.ideal_disc_locks[name] = multiprocessing.RLock() 
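+
+    # Loader caches loaded datasets and parsed ideal discretizations in
+    # manager.dict()s shared across workers; the per-name RLocks created
+    # above under global_lock ensure each file is parsed at most once.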
- loader = Loader() + loader = Loader() params = main.collect_params("logs_test") + if len(params) == 0: + print("no parameters collected!") + exit(0) for param in params: loader.register_dataset(param.data_file) + loader.register_ideal_disc(param.experiment_name) items.put(param) if onlyListTasks: @@ -90,8 +111,8 @@ def register_dataset(self, path): para = items.get() print(para) - nbTasksTotal = items.qsize() - nbTasksDone = 0 + nbTasksTotal = len(params) + nbTasksDone = [0] counterLock = multiprocessing.RLock() paramQueueLock = multiprocessing.RLock() runningMain = True @@ -118,7 +139,7 @@ def worker(worker_id): para = items.get(block=False) except queue.Empty: return - print('thread ', threading.get_ident(), ' / started', para) + print('Worker ID ', worker_id, 'is executing', para) # todo generate data sets # datasets.put(para.build()) @@ -126,8 +147,8 @@ def worker(worker_id): print('Worker ID ', worker_id, ' execution finished') with counterLock: if runningMain: - nbTasksDone += 1 - print("Jobs done ", nbTasksDone, "/", nbTasksTotal) + nbTasksDone[0] += 1 + print("Jobs done ", nbTasksDone[0], "/", nbTasksTotal) # items.task_done() @@ -140,7 +161,7 @@ def datasetWriter(): try: result = datasets.get(block=True, timeout=10) # dg.store(dataset) - main.store(result) + main.store(result, loader) except queue.Empty: break @@ -148,7 +169,7 @@ def datasetWriter(): break with counterLock: - if nbTasksDone == nbTasksTotal and datasets.empty() or not runningMain: + if nbTasksDone[0] == nbTasksTotal and datasets.empty() or not runningMain: break diff --git a/util.py b/util.py index 65df0f5..5a65852 100644 --- a/util.py +++ b/util.py @@ -13,10 +13,36 @@ def get_escaped_name(problem): return problem.replace("-", "_").replace(".", "") -def parse_relevant_features(data_file): - data_file_name = get_file_name(data_file) +def parse_relevant_features(data_file_name): search = re.search('cubes_(\d+)_', data_file_name) if not search: raise ValueError("wrong file format!") dims_count = int(search.group(1)) return dims_count + + +def collect_params(f): + params = [] + # relevant features 2 - 30 + for rf in range(2, 31): + # cubes 1 - 10 + for c in range(1, 11): + # cube types complete, incomplete, incomplete overlapping + for t in ["c", 'i', "io"]: + # for rf in range(2, 3): + # # cubes 1 - 10 + # for c in range(3, 4): + # # cube types complete, incomplete, incomplete overlapping + # for type in ['c']: + if (c == 1 or rf / c < 2) and t != 'c': + continue + dataset_name = 'cubes_' + '{0:02d}'.format(rf) + '_' \ + + '{0:02d}'.format(c) + '_' \ + + t + param = f(dataset_name, rf, c, t) + print('collected param:', param) + if type(param) == list: + params.extend(param) + else: + params.append(param) + return params \ No newline at end of file
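
The parameter sweep over (relevant feature count, cube count, cube type) that was
previously duplicated in data_generator.py and main.py is now centralised in
util.collect_params. A minimal caller sketch -- the describe callback and its return
value are illustrative only, not part of this patch:

    import util

    def describe(dataset_name, rf, c, t):
        # invoked once per configuration, e.g. ('cubes_02_01_c', 2, 1, 'c');
        # may return one param object or a list of params (lists are extended)
        return {"name": dataset_name, "rf": rf, "cubes": c, "type": t}

    params = util.collect_params(describe)
    # params[0] == {'name': 'cubes_02_01_c', 'rf': 2, 'cubes': 1, 'type': 'c'}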