Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ipd_extended/discretization_quality_measure.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
543 lines (494 sloc)
19.6 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import os | |
import signal | |
import sys | |
import constants as cst | |
import re | |
import subprocess as sp | |
import util | |
# Lower bound used by the experiments; not referenced in this file — TODO confirm external users.
global_min = -2
# Maximum number of dimensions considered when computing per-dimension quality measures.
MAX_DIM_COUNT = 4
def parse_ideal_cuts(experiment_name):
    """Read the ideal ("perfect") discretization cuts for an experiment.

    The cut file contains one float per line, grouped per dimension; a line
    starting with "---" closes the current dimension and lines starting with
    "dimension" are headers and are skipped.

    Returns a list of per-dimension cut lists, or None when no cut file
    exists for the dataset.
    """
    dataset = util.parse_dataset_name(experiment_name)
    path = cst.PERFECT_DISCRETIZATIONS_DIR + "cut_" + dataset + ".txt"
    try:
        all_cuts = []
        with open(path, "r") as handle:
            current = []
            for raw in handle:
                if raw.startswith("dimension"):
                    continue
                if raw.startswith("---"):
                    # End of this dimension's block.
                    all_cuts.append(current)
                    current = []
                else:
                    current.append(float(raw.strip()))
        return all_cuts
    except FileNotFoundError:
        return None
def _find_min_dist_cut(cut, cuts, start_id=0): | |
min_dist = float('Inf') | |
min_cut_id = None | |
for i, c in enumerate(cuts[start_id:], start=start_id): | |
if i > 0 and cuts[i - 1] > c: | |
raise ValueError("cuts" + str(cuts) + " is not ordered!") | |
temp_dist = abs(c - cut) | |
if min_dist > temp_dist: | |
min_dist = temp_dist | |
min_cut_id = i | |
# elif min_cut_id: | |
# break | |
return min_dist, min_cut_id | |
def _find_max_sim_cut(cut, cuts, start_id=0):
    """Find the cut in ``cuts[start_id:]`` most similar to ``cut``.

    Similarity is 1 for an exact match and decreases linearly with the
    distance between ``cut`` and the candidate, scaled by half the gap to a
    neighbouring cut; it can go negative once ``cut`` lies beyond that
    midpoint.

    Returns (max_similarity, index_of_best_cut); the index is None when the
    slice is empty.  Raises ValueError if the inspected portion of ``cuts``
    is not in ascending order.
    """
    max_sim = -float("Inf")
    max_sim_cut_id = None
    for i, c in enumerate(cuts[start_id:], start=start_id):
        if i > 0 and cuts[i - 1] > c:
            raise ValueError("cuts" + str(cuts) + " is not ordered!")
        if c == cut:
            temp_sim = 1
        else:
            if c > cut:
                # Candidate lies above ``cut``: scale by half the gap to the
                # previous cut.
                # NOTE(review): when i == 0 this reads cuts[-1] (the last
                # element) via negative indexing — confirm the first cut is
                # never approached from below, or this gap is wrong.
                middle_dist = (c - cuts[i - 1]) / 2
                temp_sim = (middle_dist - (c - cut)) / middle_dist
            else:
                # if the cut is further ahead the last cut from cuts
                if i == len(cuts) - 1:
                    # Last candidate: reuse the gap to its predecessor.
                    middle_dist = (c - cuts[i - 1]) / 2
                else:
                    middle_dist = (cuts[i + 1] - c) / 2
                # if (cuts[i + 1] - c) / 2 > (cut - c):
                temp_sim = (middle_dist - (cut - c)) / middle_dist
        if max_sim < temp_sim:
            max_sim = temp_sim
            max_sim_cut_id = i
        # ``cuts`` is ordered, so once a best candidate exists and the
        # candidates have moved past ``cut``, similarity can only decrease.
        elif max_sim_cut_id is not None and cut < c:
            break
    return max_sim, max_sim_cut_id
def disc_similarity(expected_cuts, cuts):
    """Score how well ``cuts`` reproduces ``expected_cuts``.

    The minimum expected cut is prepended to ``cuts`` so the first segment is
    always anchored.  Consecutive cuts that map to the same expected cut have
    their similarities multiplied (penalising redundant cuts); each time a new
    expected cut is matched, the accumulated product is added to the total.

    Returns (similarity, exp_match) where ``exp_match`` counts the distinct
    expected cuts that were matched.  Raises ValueError if ``cuts`` is not in
    ascending order.
    """
    work = cuts.copy()
    work.insert(0, min(expected_cuts))

    running, matched_id = _find_max_sim_cut(work[0], expected_cuts)
    total = 0
    match_count = 0
    for pos in range(1, len(work)):
        current = work[pos]
        if work[pos - 1] > current:
            raise ValueError("cuts" + str(work) + " is not ordered!")
        sim, best_id = _find_max_sim_cut(current, expected_cuts, matched_id)
        if best_id == matched_id:
            # Same expected cut matched again: compound the similarity.
            running *= sim
        else:
            # Moved on to a new expected cut: bank the previous run.
            total += running
            running = sim
            match_count += 1
        matched_id = best_id
    # Bank the final run.
    total += running
    match_count += 1
    return total, match_count
# def disc_distance(expected_cuts, cuts): | |
# distance = 0 | |
# dist_exp_cut_id = _find_min_dist_cut(cuts[0], expected_cuts) | |
# prev_cut_id = dist_exp_cut_id[1] | |
# temp_distance = dist_exp_cut_id[0] | |
# | |
# counter = 0 | |
# | |
# for i, cut in enumerate(cuts[1:], start=1): | |
# if i > 0 and cuts[i - 1] > cut: | |
# raise ValueError("cuts" + str(cuts) + " is not ordered!") | |
# | |
# dist_exp_cut_id = _find_min_dist_cut(cut, expected_cuts, prev_cut_id) | |
# if dist_exp_cut_id[1] == prev_cut_id: | |
# # counter += 1 | |
# temp_distance += dist_exp_cut_id[0] | |
# else: | |
# distance += temp_distance * 2 ** counter | |
# temp_distance = dist_exp_cut_id[0] | |
# counter = 0 | |
# | |
# prev_cut_id = dist_exp_cut_id[1] | |
# | |
# distance += temp_distance * 2 ** counter | |
# | |
# return distance | |
# prepare slim db | |
def prepare_compression1(experiment_name):
    """Convert an experiment's .dat file into a SLIM database file.

    Rewrites the SLIM convert configuration in place so its ``dbName`` entry
    points at ``experiment_name``, then invokes the SLIM binary.

    Returns True on success; False when the source .dat file is missing, the
    converter reports an exception, or the subprocess exits non-zero.
    """
    try:
        source = cst.SLIM_DATA_DIR + experiment_name + "/" + experiment_name + ".dat"
        if not os.path.exists(source):
            print("no initial dat-file for experiment", experiment_name)
            return False
        # Point the converter config at this experiment's database name.
        with open(cst.SLIM_CONVERT_CONF, "r+") as cfg:
            rewritten = [
                "dbName = [" + experiment_name + "]\n" if entry.startswith("dbName") else entry
                for entry in cfg
            ]
            cfg.seek(0)
            cfg.writelines(rewritten)
            cfg.truncate()
        result = sp.check_output([cst.SLIM_BIN, cst.SLIM_CONVERT_CONF])
        # SLIM signals some failures only via its output text.
        if "exception" in str(result):
            print('exception during preparation for', experiment_name)
            return False
    except sp.CalledProcessError:
        print('Prepare compression: conversion failed for', experiment_name)
        return False
    return True
def run_compression1(name, interaction_type=None, rows=None, rf=None, i=None, c=None, offset=None):
    """Run SLIM compression for one experiment and parse compression sizes.

    Only ``name`` is used here; the remaining parameters are accepted but
    ignored — presumably kept so callers can use a uniform signature (TODO
    confirm before removing).

    Returns ``[name, start_compression, result_compression]``; the two size
    fields are "" whenever conversion, compression, or output parsing fails.
    """
    # 1. check slim db
    # convert dat-file to db-file if it does not exist
    if not os.path.exists(cst.SLIM_DATA_DIR + name + "/" + name + ".db"):
        if not prepare_compression1(name):
            print("run_compression failed for", name)
            return [name, "", ""]
    # 2. modify compress.conf
    # Rewrite the compress config in place so iscName targets this experiment.
    with open(cst.SLIM_COMPRESS_CONF, "r+") as conf_file:
        new_lines = []
        for line in conf_file:
            if line.startswith("iscName"):
                line = "iscName = " + name + "-all-1d\n"
            new_lines.append(line)
        conf_file.seek(0)
        conf_file.writelines(new_lines)
        conf_file.truncate()
    # 3. compress it
    output = None
    try:
        # ``output`` is str() of a bytes object, so tab bytes appear as the
        # literal two-character sequence \t in it — the regexes below rely on
        # that representation.
        output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=30))
    except sp.TimeoutExpired:
        # timeout_counter = 0
        # while timeout_counter < 5:
        #     try:
        #         output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=60))
        #         break
        #     except sp.TimeoutExpired:
        #         timeout_counter += 1
        # if not output:
        #     print("timeout exceeded " + str(timeout_counter) + " times for " + name)
        #     return [name, "", ""]
        print("timeout exceeded", name)
        return [name, "", ""]
    except sp.CalledProcessError:
        return [name, "", ""]
    # The doubled backslashes match the escaped \t\t of the bytes repr; the
    # captured digits are the size printed after the comma in "( ... ,NNN)".
    search_start = re.search('Start:\\\\t\\\\t.+?,(\d+)\)', output)
    if search_start:
        start_comp = search_start.group(1)
    else:
        print("compression start is not found", name)
        start_comp = ""
    search_end = re.search('Result:\\\\t\\\\t.+?,(\d+)\)', output)
    if search_end:
        result_comp = search_end.group(1)
    else:
        print("compression end is not found", name)
        result_comp = ""
    return [name, start_comp, result_comp]
def run_compression(base_dir):
    """Compress every experiment directory under ``base_dir`` and append results.

    Experiments already listed in Compression.csv are skipped.  Results are
    accumulated in the module-level ``results`` list so signal_handler can
    flush partial progress to disk if the run is interrupted.
    """
    global results
    base_dir = cst.BASE + base_dir + "/"
    # Previously computed results keyed by experiment name (None if the CSV is absent).
    comp_dict = util.read_csv(base_dir + "Compression.csv")
    results = []
    # NOTE(review): os.walk recurses into every subdirectory, so nested
    # directories are also treated as experiments and the progress counter
    # resets per level — if only the top-level dirs are intended, a ``break``
    # after the first iteration is missing; confirm before changing.
    for root, dirs, files in os.walk(base_dir):
        dir_num = len(dirs)
        counter = 0
        for experiment_name in dirs:
            if comp_dict is not None and experiment_name in comp_dict:
                continue
            counter += 1
            print("compressing", experiment_name, counter, "/", dir_num)
            res = ",".join(run_compression1(experiment_name))
            results.append(res + "\n")
    print('processing finished')
    with open(base_dir + "Compression.csv", "a") as f:
        f.writelines(results)
# returns runtime in seconds and mdl of compression | |
# def compute_compression(name): | |
# escaped_name = util.get_escaped_name(name) | |
# # 1. check slim db | |
# if not os.path.exists(cst.SLIM_DATA_DIR + escaped_name + "/" + escaped_name + ".db"): | |
# print("no slim db file for " + escaped_name) | |
# return [name, None] | |
# | |
# # 2. modify compress.conf | |
# with open(cst.SLIM_COMPRESS_CONF, "r+") as conf_file: | |
# new_lines = [] | |
# for line in conf_file: | |
# if line.startswith("iscName"): | |
# line = "iscName = " + escaped_name + "-all-1d\n" | |
# new_lines.append(line) | |
# conf_file.seek(0) | |
# conf_file.writelines(new_lines) | |
# conf_file.truncate() | |
# | |
# # 3. compress it | |
# output = None | |
# try: | |
# output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=5)) | |
# except sp.TimeoutExpired: | |
# timeout_counter = 0 | |
# while timeout_counter < 5: | |
# try: | |
# output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=5)) | |
# break | |
# except sp.TimeoutExpired: | |
# timeout_counter += 1 | |
# if not output: | |
# print("timeout exceeded " + str(timeout_counter) + " times for " + name) | |
# return [name, None] | |
# except sp.CalledProcessError: | |
# return [name, None] | |
# | |
# start_comp = re.search('Start:\\\\t\\\\t.+?,(\d+)\)', output).group(1) | |
# result_comp = re.search('Result:\\\\t\\\\t.+?,(\d+)\)', output).group(1) | |
# return [name, start_comp, result_comp] | |
# def compute_problem_quality_measure(directory, | |
# problem, | |
# method, | |
# distances=('ID', 'CJS'), | |
# threshold_range=(0.3, 0.5, 0.8), | |
# irr_features_range=range(11)): | |
# ideal_cuts = parse_cuts("ideal_disc/cut_" + problem + ".txt") | |
# if method == cst.Method.TRIVIAL or method is cst.Method.PERFECT: | |
# name = method.name + "-" + problem | |
# print('compute_measures', name) | |
# values = compute_precision_recall_runtime(ideal_cuts, directory, name) | |
# compression = compute_compression(name) | |
# | |
# if not values: | |
# print('no value') | |
# | |
# return ([values[0]], values[1], [compression]) if values else None | |
# # return ([values[0]], values[1]) if values else None | |
# | |
# runtime_values = [] | |
# values = [] | |
# compression = [] | |
# for dist in distances: | |
# for threshold in threshold_range: | |
# threshold = str(threshold) | |
# if method == cst.Method.PREDEFINED: | |
# counter = 1 | |
# while counter < 11: | |
# name = dist + "-" + method.name + "-s" + str(counter) + "-" + threshold + "-" + problem | |
# counter += 1 | |
# | |
# print('compute_measures', name) | |
# value = compute_precision_recall_runtime(ideal_cuts, directory, name) | |
# if not value: | |
# print('no value') | |
# break | |
# runtime_values.append(value[0]) | |
# values.extend(value[1]) | |
# compression.append(compute_compression(name)) | |
# | |
# elif method == cst.Method.ORIGINAL: | |
# # else: | |
# for irr_feat in irr_features_range: | |
# name = dist + "-" + method.name + "-" + threshold + "-" + problem + ( | |
# "" if irr_feat == 0 else "-" + str(irr_feat)) | |
# | |
# print('compute_measures', name) | |
# value = compute_precision_recall_runtime(ideal_cuts, directory, name) | |
# if not value: | |
# print('no value') | |
# continue | |
# runtime_values.append(value[0]) | |
# values.extend(value[1]) | |
# compression.append(compute_compression(name)) | |
# return runtime_values, values, compression | |
# # return runtime_values, values | |
# def prepare_compression(directory, | |
# problem, | |
# method, | |
# distances=('ID', 'CJS'), | |
# threshold_range=(0.3, 0.5, 0.8), | |
# irr_features_range=range(11)): | |
# if method == cst.Method.TRIVIAL: | |
# name = "TRIVIAL-" + problem | |
# print('prepare compression', name) | |
# prepare_compression1(directory, name) | |
# return | |
# | |
# for dist in distances: | |
# for threshold in threshold_range: | |
# threshold = str(threshold) | |
# if method == cst.Method.PREDEFINED: | |
# counter = 1 | |
# while counter < 11: | |
# name = dist + "-" + method.name + "-s" + str(counter) + "-" + threshold + "-" + problem | |
# counter += 1 | |
# | |
# print('prepare compression', name) | |
# prepare_compression1(directory, name) | |
# | |
# # elif method == cst.Method.ORIGINAL: | |
# else: | |
# for irr_feat in irr_features_range: | |
# name = dist + "-" + method.name + "-" + threshold + "-" + problem + ( | |
# "" if irr_feat == 0 else "-" + str(irr_feat)) | |
# | |
# print('prepare compression', name) | |
# prepare_compression1(directory, name) | |
def parse_runtimes(name):
    """Extract runtimes (in seconds) from a log file.

    Collects the value from every "subspace mining runtime:" line and every
    "full runtime:" line; a 0 placeholder is inserted when a full runtime
    appears without a preceding subspace runtime.

    Returns the list of runtimes, [0, 0] when no runtime lines were found,
    or None when the file does not exist.
    """
    collected = []
    try:
        with open(name, "r") as log:
            for entry in log:
                if entry.startswith("subspace mining runtime:"):
                    found = re.search("(?:subspace mining runtime:) (.*)(?: seconds)", entry)
                    collected.append(float(found.group(1)))
                elif entry.startswith("full runtime:"):
                    if not collected:
                        # Keep positional layout: slot 0 is the subspace runtime.
                        collected.append(0)
                    found = re.search("(?:full runtime:) (.*)(?: seconds)", entry)
                    collected.append(float(found.group(1)))
    except FileNotFoundError:
        return None
    return collected if collected else [0, 0]
# def compute_precision_recall_runtime(ideal_cuts, directory, name): | |
# data_dir = name.replace("-", "_") | |
# cuts = parse_cuts(directory + "/" + data_dir + ".csv/cut.txt") | |
# if not cuts: | |
# return None | |
# | |
# runtimes = parse_runtimes(directory + "/" + data_dir + ".csv/log.txt") | |
# runtime_values = [name] | |
# runtime_values.extend(runtimes) | |
# values = [] | |
# for i in range(MAX_DIM_COUNT): | |
# if len(ideal_cuts) <= i: | |
# break | |
# values.append( | |
# [name + "-dim" + str(i + 1), disc_precision(ideal_cuts[i], cuts[i]), disc_recall(ideal_cuts[i], cuts[i])]) | |
# return runtime_values, values | |
def disc_precision(expected, current):
    """Similarity mass per produced cut (precision-like measure)."""
    total = disc_similarity(expected, current)[0]
    return total / (len(current) + 1)
def disc_recall(expected, current):
    """Similarity mass per expected cut (recall-like measure).

    NOTE(review): the original author flagged that the ``+ 1`` in the
    denominator should probably be dropped — kept for compatibility.
    """
    total = disc_similarity(expected, current)[0]
    return total / (len(expected) + 1)
def disc_f1(expected, current):
    """Harmonic mean (F1) of disc_precision and disc_recall.

    Returns 0.0 when precision and recall sum to zero, which previously
    raised ZeroDivisionError (disc_similarity can legitimately yield a total
    of zero).
    """
    similarity = disc_similarity(expected, current)
    recall = similarity[0] / (len(expected) + 1)
    precision = similarity[0] / (len(current) + 1)
    denominator = precision + recall
    if denominator == 0:
        # No overlap at all between expected and produced cuts.
        return 0.0
    return (2 * precision * recall) / denominator
def signal_handler(signum, frame):
    """SIGINT handler: flush accumulated compression results to disk, then exit.

    Relies on the module-level globals ``base_dir`` (set in __main__) and
    ``results`` (populated by run_compression).  Parameters renamed from
    ``signal``/``frame`` so the handler no longer shadows the ``signal``
    module; signal dispatch passes them positionally, so callers are
    unaffected.
    """
    global stop_signal, results
    print('Writing down Compression.csv')
    # Log the same path that is actually opened below (the original print
    # omitted the "/" separator, logging a misleading path).
    print(cst.BASE + base_dir + "/Compression.csv")
    with open(cst.BASE + base_dir + "/Compression.csv", "a") as f:
        f.writelines(results)
    sys.exit(0)
# NOTE(review): a ``global`` statement at module scope is a no-op; kept as-is.
global stop_signal
# Flag apparently intended to let a handler ask loops to stop; never read in this file.
stop_signal = False
if __name__ == '__main__':
    # NOTE(review): ``global`` at module scope is a no-op; base_dir is simply
    # a module-level name that signal_handler reads.
    global base_dir
    # First CLI argument: directory (relative to cst.BASE) holding the experiment folders.
    base_dir = sys.argv[1]
    # Flush partial results to Compression.csv on Ctrl+C.
    signal.signal(signal.SIGINT, signal_handler)
    print('signal registered')
    # compression and classification quality measures
    run_compression(base_dir)
# if len(sys.argv) == 1: | |
# print( | |
# 'Usage: discretization_quality_measure.py ' | |
# '-p=<problem> ' | |
# '-m=<[original|greedy_topk|trivial|...]> ' | |
# '-cor=<[uds]> ' | |
# '-dist=<[id, cjs]> ' | |
# '-t=<threshold float> ' | |
# '-r=<number of rows> ') | |
# command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID' | |
# print('Running default: ', command) | |
# command_list = command.split(' ') | |
# else: | |
# command_list = sys.argv[1:] | |
# | |
# problem_arg = list(filter(lambda x: x.startswith("-p="), command_list)) | |
# # if not problem_arg: | |
# # raise ValueError('No problem provided!') | |
# base_dir_arg = list(filter(lambda x: x.startswith("-b="), command_list)) | |
# if not base_dir_arg: | |
# raise ValueError('No logs base dir provided!') | |
# method_arg = list(filter(lambda x: x.startswith("-m="), command_list)) | |
# # if not method_arg: | |
# # raise ValueError('No method provided!') | |
# distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list)) | |
# # if not distance_measure_arg: | |
# # raise ValueError('No distance measure provided!') | |
# threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list)) | |
# # if not threshold_arg: | |
# # raise ValueError('No threshold provided!') | |
# # irr_feat_start_arg = list(filter(lambda x: x.startswith("-is="), command_list)) | |
# # irr_feat_end_arg = list(filter(lambda x: x.startswith("-ie="), command_list)) | |
# | |
# base_dir = base_dir_arg[0].replace('-b=', '') | |
# if not os.path.exists(base_dir): | |
# os.makedirs(base_dir) | |
# if problem_arg: | |
# problem = problem_arg[0].replace('-p=', '') | |
# if method_arg: | |
# method = cst.Method[method_arg[0].replace('-m=', '').upper()] | |
# if distance_measure_arg: | |
# distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()] | |
# if threshold_arg: | |
# threshold = float(threshold_arg[0].replace('-t=', '')) | |
# | |
# problems = [ | |
# # "2d_3_cubes_aligned_xor", | |
# # "2d_2_cubes_aligned", | |
# # "2d_2_cubes_xor", | |
# # "3d_2_cubes_aligned", | |
# # "3d_2_cubes_xor", | |
# # "3d_3_cubes_aligned", | |
# # "3d_3_cubes_aligned_xor", | |
# # "3d_3_cubes_xor", | |
# # "3d_4_cubes_1_aligned_xor", | |
# # "3d_4_cubes_2_aligned", | |
# # "3d_4_cubes_xor", | |
# # "4d_2_cubes_aligned", | |
# # "4d_3_cubes_aligned_xor", | |
# # "4d_3_cubes_xor", | |
# # "4d_4_cubes_aligned_xor", | |
# # "4d_4_cubes_2_aligned", | |
# "4d_4_cubes_xor", | |
# ] | |
# | |
# runtime = [] | |
# perf = [] | |
# compression = [] | |
# | |
# cols = ['run-dim', 'precision', 'recall'] | |
# runtime_cols = ['run', 'subspace mining runtime', 'full runtime'] | |
# compression_cols = ['run', 'start compression', 'result compression'] | |
# | |
# disc_distances = [] | |
# for problem in problems: | |
# print('problem:', problem) | |
# | |
# for method in [cst.Method.TRIVIAL, cst.Method.ORIGINAL, cst.Method.PREDEFINED]: | |
# # for method in [cst.Method.PERFECT]: | |
# print('method:', method) | |
# data = compute_problem_quality_measure(base_dir, problem, method=method) | |
# if not data: | |
# continue | |
# runtime.extend(data[0]) | |
# perf.extend(data[1]) | |
# compression.extend(data[2]) | |
# time = util.now() | |
# pd.DataFrame(perf, columns=cols).to_csv(base_dir + "/Precision_recall_" + time + ".csv") | |
# pd.DataFrame(runtime, columns=runtime_cols).to_csv(base_dir + "/Discretization_runtimes_" + time + ".csv") | |
# pd.DataFrame(compression, columns=compression_cols).to_csv(base_dir + "/Compression_" + time + ".csv") |