# discretization_quality_measure.py

import pandas as pd
import os
import signal
import sys
import constants as cst
import re
import subprocess as sp
import util

# NOTE(review): no live code visible in this file reads global_min - confirm before removing.
global_min = -2
# Dimension limit; in this file only the commented-out
# compute_precision_recall_runtime helper iterates up to it.
MAX_DIM_COUNT = 4


def parse_ideal_cuts(experiment_name):
    """Load the reference cut points stored for an experiment's dataset.

    The dataset name comes from ``util.parse_dataset_name`` and the file
    ``cut_<name>.txt`` is opened inside ``cst.PERFECT_DISCRETIZATIONS_DIR``.
    Lines starting with ``dimension`` are ignored, a line starting with
    ``---`` closes the group collected so far, and every other line is parsed
    as a single float. Values that follow the last ``---`` line are not
    returned.

    :param experiment_name: experiment identifier passed to ``util.parse_dataset_name``
    :return: list of float lists (one per ``---``-terminated group), or
        ``None`` when the cut file does not exist
    :raises ValueError: if a value line cannot be parsed as a float
    """
    dataset = util.parse_dataset_name(experiment_name)
    all_cuts = []
    try:
        with open(cst.PERFECT_DISCRETIZATIONS_DIR + "cut_" + dataset + ".txt", "r") as cut_file:
            group = []
            for raw in cut_file:
                if raw.startswith("dimension"):
                    # header line, carries no cut value
                    continue
                if not raw.startswith("---"):
                    group.append(float(raw.strip()))
                else:
                    # separator: the group collected so far is complete
                    all_cuts.append(group)
                    group = []
    except FileNotFoundError:
        return None
    return all_cuts


def _find_min_dist_cut(cut, cuts, start_id=0):
    min_dist = float('Inf')
    min_cut_id = None
    for i, c in enumerate(cuts[start_id:], start=start_id):
        if i > 0 and cuts[i - 1] > c:
            raise ValueError("cuts" + str(cuts) + " is not ordered!")

        temp_dist = abs(c - cut)
        if min_dist > temp_dist:
            min_dist = temp_dist
            min_cut_id = i
            # elif min_cut_id:
            #     break
    return min_dist, min_cut_id


def _find_max_sim_cut(cut, cuts, start_id=0):
    max_sim = -float("Inf")
    max_sim_cut_id = None
    for i, c in enumerate(cuts[start_id:], start=start_id):
        if i > 0 and cuts[i - 1] > c:
            raise ValueError("cuts" + str(cuts) + " is not ordered!")

        if c == cut:
            temp_sim = 1
        else:
            if c > cut:
                middle_dist = (c - cuts[i - 1]) / 2
                temp_sim = (middle_dist - (c - cut)) / middle_dist
            else:
                # if the cut is further ahead the last cut from cuts
                if i == len(cuts) - 1:
                    middle_dist = (c - cuts[i - 1]) / 2
                else:
                    middle_dist = (cuts[i + 1] - c) / 2
                # if (cuts[i + 1] - c) / 2 > (cut - c):
                temp_sim = (middle_dist - (cut - c)) / middle_dist
        if max_sim < temp_sim:
            max_sim = temp_sim
            max_sim_cut_id = i
        elif max_sim_cut_id is not None and cut < c:
            break
    return max_sim, max_sim_cut_id


def disc_similarity(expected_cuts, cuts):
    """Score how well ``cuts`` reproduces ``expected_cuts``.

    A copy of ``cuts`` is extended at the front with ``min(expected_cuts)``.
    Each produced cut is then matched to its most similar expected cut via
    ``_find_max_sim_cut``, searching from the previously matched index.
    Consecutive produced cuts matched to the same expected cut form a group
    whose similarities are multiplied; the group values are summed.

    :param expected_cuts: ascending list of reference cut points (non-empty)
    :param cuts: ascending list of produced cut points; not modified
    :return: ``(similarity, exp_match)`` - the summed group similarity and the
        number of groups
    :raises ValueError: if the produced cuts are not in ascending order
    """
    produced = cuts.copy()
    produced.insert(0, min(expected_cuts))

    group_sim, matched_id = _find_max_sim_cut(produced[0], expected_cuts)
    total = 0
    groups = 0

    for pos in range(1, len(produced)):
        current = produced[pos]
        if produced[pos - 1] > current:
            raise ValueError("cuts" + str(produced) + " is not ordered!")

        sim, new_id = _find_max_sim_cut(current, expected_cuts, matched_id)
        if new_id != matched_id:
            # a different expected cut was hit: close the running group
            total += group_sim
            group_sim = sim
            groups += 1
        else:
            group_sim *= sim
        matched_id = new_id

    # account for the group that is still open after the last cut
    return total + group_sim, groups + 1


# def disc_distance(expected_cuts, cuts):
#     distance = 0
#     dist_exp_cut_id = _find_min_dist_cut(cuts[0], expected_cuts)
#     prev_cut_id = dist_exp_cut_id[1]
#     temp_distance = dist_exp_cut_id[0]
#
#     counter = 0
#
#     for i, cut in enumerate(cuts[1:], start=1):
#         if i > 0 and cuts[i - 1] > cut:
#             raise ValueError("cuts" + str(cuts) + " is not ordered!")
#
#         dist_exp_cut_id = _find_min_dist_cut(cut, expected_cuts, prev_cut_id)
#         if dist_exp_cut_id[1] == prev_cut_id:
#             # counter += 1
#             temp_distance += dist_exp_cut_id[0]
#         else:
#             distance += temp_distance * 2 ** counter
#             temp_distance = dist_exp_cut_id[0]
#             counter = 0
#
#         prev_cut_id = dist_exp_cut_id[1]
#
#     distance += temp_distance * 2 ** counter
#
#     return distance


# prepare slim db
def prepare_compression1(experiment_name):
    """Run the SLIM conversion step for one experiment.

    Checks that ``<SLIM_DATA_DIR><name>/<name>.dat`` exists, rewrites the
    ``dbName`` line of ``cst.SLIM_CONVERT_CONF`` in place so it names the
    experiment, and runs ``cst.SLIM_BIN`` with that configuration.

    :param experiment_name: name of the experiment directory / dat-file
    :return: ``True`` on success; ``False`` if the dat-file is missing, the
        SLIM process exits with an error, or its output contains "exception"
    """
    source_dat = cst.SLIM_DATA_DIR + experiment_name + "/" + experiment_name + ".dat"
    if not os.path.exists(source_dat):
        print("no initial dat-file for experiment", experiment_name)
        return False

    # point the converter configuration at this experiment
    with open(cst.SLIM_CONVERT_CONF, "r+") as conf:
        rewritten = [
            "dbName = [" + experiment_name + "]\n" if entry.startswith("dbName") else entry
            for entry in conf
        ]
        conf.seek(0)
        conf.writelines(rewritten)
        conf.truncate()

    try:
        converter_output = sp.check_output([cst.SLIM_BIN, cst.SLIM_CONVERT_CONF])
    except sp.CalledProcessError:
        print('Prepare compression: conversion failed for', experiment_name)
        return False

    if "exception" in str(converter_output):
        print('exception during preparation for', experiment_name)
        return False
    return True

def run_compression1(name, interaction_type=None, rows=None, rf=None, i=None, c=None, offset=None):
    """Compress one experiment with SLIM and extract the reported sizes.

    Steps: create the SLIM db via ``prepare_compression1`` if the ``.db`` file
    is missing, rewrite the ``iscName`` line of ``cst.SLIM_COMPRESS_CONF`` in
    place, run ``cst.SLIM_BIN`` with a 30 second timeout, and pull the numbers
    following ``Start:`` and ``Result:`` out of the process output.

    :param name: experiment name (directory and file stem under ``cst.SLIM_DATA_DIR``)
    :param interaction_type: accepted but not used by this function
    :param rows: accepted but not used by this function
    :param rf: accepted but not used by this function
    :param i: accepted but not used by this function
    :param c: accepted but not used by this function
    :param offset: accepted but not used by this function
    :return: ``[name, start, result]`` as strings; ``start``/``result`` are
        empty strings when preparation fails, SLIM fails or times out, or the
        value is not found in the output
    """
    # 1. check slim db
    # convert dat-file to db-file if it does not exist
    if not os.path.exists(cst.SLIM_DATA_DIR + name + "/" + name + ".db"):
        if not prepare_compression1(name):
            print("run_compression failed for", name)
            return [name, "", ""]

    # 2. modify compress.conf
    with open(cst.SLIM_COMPRESS_CONF, "r+") as conf_file:
        new_lines = []
        for line in conf_file:
            if line.startswith("iscName"):
                line = "iscName = " + name + "-all-1d\n"
            new_lines.append(line)
        conf_file.seek(0)
        conf_file.writelines(new_lines)
        conf_file.truncate()

    # 3. compress it
    try:
        # str() of the bytes output yields its repr, so tabs appear as the two
        # characters "\t"; the patterns below match that escaped form.
        output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=30))
    except sp.TimeoutExpired:
        print("timeout exceeded", name)
        return [name, "", ""]
    except sp.CalledProcessError:
        return [name, "", ""]

    # Raw strings: same patterns as before, without invalid "\d" / "\)" escapes
    # in an ordinary string literal.
    search_start = re.search(r'Start:\\t\\t.+?,(\d+)\)', output)
    if search_start:
        start_comp = search_start.group(1)
    else:
        print("compression start is not found", name)
        start_comp = ""
    search_end = re.search(r'Result:\\t\\t.+?,(\d+)\)', output)
    if search_end:
        result_comp = search_end.group(1)
    else:
        print("compression end is not found", name)
        result_comp = ""
    return [name, start_comp, result_comp]


def run_compression(base_dir):
    """Compress every experiment directory found below ``cst.BASE + base_dir``.

    Directories whose name is already present in the data returned by
    ``util.read_csv`` for ``Compression.csv`` are skipped. Each processed
    experiment contributes one ``name,start,result`` line to the module-level
    ``results`` list, which is appended to ``Compression.csv`` at the end.
    ``os.walk`` is used, so sub-directories at every depth are visited.

    :param base_dir: directory name relative to ``cst.BASE``
    """
    global results
    work_dir = cst.BASE + base_dir + "/"
    csv_path = work_dir + "Compression.csv"

    already_done = util.read_csv(csv_path)
    results = []
    for _root, subdirs, _files in os.walk(work_dir):
        total = len(subdirs)
        # lazily filter out experiments that already have a CSV entry
        pending = (
            entry for entry in subdirs
            if already_done is None or entry not in already_done
        )
        for position, experiment_name in enumerate(pending, start=1):
            print("compressing", experiment_name, position, "/", total)
            results.append(",".join(run_compression1(experiment_name)) + "\n")
    print('processing finished')
    with open(csv_path, "a") as out:
        out.writelines(results)


# returns runtime in seconds and mdl of compression
# def compute_compression(name):
#     escaped_name = util.get_escaped_name(name)
#     # 1. check slim db
#     if not os.path.exists(cst.SLIM_DATA_DIR + escaped_name + "/" + escaped_name + ".db"):
#         print("no slim db file for " + escaped_name)
#         return [name, None]
#
#     # 2. modify compress.conf
#     with open(cst.SLIM_COMPRESS_CONF, "r+") as conf_file:
#         new_lines = []
#         for line in conf_file:
#             if line.startswith("iscName"):
#                 line = "iscName = " + escaped_name + "-all-1d\n"
#             new_lines.append(line)
#         conf_file.seek(0)
#         conf_file.writelines(new_lines)
#         conf_file.truncate()
#
#     # 3. compress it
#     output = None
#     try:
#         output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=5))
#     except sp.TimeoutExpired:
#         timeout_counter = 0
#         while timeout_counter < 5:
#             try:
#                 output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=5))
#                 break
#             except sp.TimeoutExpired:
#                 timeout_counter += 1
#         if not output:
#             print("timeout exceeded " + str(timeout_counter) + " times for " + name)
#             return [name, None]
#     except sp.CalledProcessError:
#         return [name, None]
#
#     start_comp = re.search('Start:\\\\t\\\\t.+?,(\d+)\)', output).group(1)
#     result_comp = re.search('Result:\\\\t\\\\t.+?,(\d+)\)', output).group(1)
#     return [name, start_comp, result_comp]


# def compute_problem_quality_measure(directory,
#                                     problem,
#                                     method,
#                                     distances=('ID', 'CJS'),
#                                     threshold_range=(0.3, 0.5, 0.8),
#                                     irr_features_range=range(11)):
#     ideal_cuts = parse_cuts("ideal_disc/cut_" + problem + ".txt")
#     if method == cst.Method.TRIVIAL or method is cst.Method.PERFECT:
#         name = method.name + "-" + problem
#         print('compute_measures', name)
#         values = compute_precision_recall_runtime(ideal_cuts, directory, name)
#         compression = compute_compression(name)
#
#         if not values:
#             print('no value')
#
#         return ([values[0]], values[1], [compression]) if values else None
#         # return ([values[0]], values[1]) if values else None
#
#     runtime_values = []
#     values = []
#     compression = []
#     for dist in distances:
#         for threshold in threshold_range:
#             threshold = str(threshold)
#             if method == cst.Method.PREDEFINED:
#                 counter = 1
#                 while counter < 11:
#                     name = dist + "-" + method.name + "-s" + str(counter) + "-" + threshold + "-" + problem
#                     counter += 1
#
#                     print('compute_measures', name)
#                     value = compute_precision_recall_runtime(ideal_cuts, directory, name)
#                     if not value:
#                         print('no value')
#                         break
#                     runtime_values.append(value[0])
#                     values.extend(value[1])
#                     compression.append(compute_compression(name))
#
#             elif method == cst.Method.ORIGINAL:
#             # else:
#                 for irr_feat in irr_features_range:
#                     name = dist + "-" + method.name + "-" + threshold + "-" + problem + (
#                         "" if irr_feat == 0 else "-" + str(irr_feat))
#
#                     print('compute_measures', name)
#                     value = compute_precision_recall_runtime(ideal_cuts, directory, name)
#                     if not value:
#                         print('no value')
#                         continue
#                     runtime_values.append(value[0])
#                     values.extend(value[1])
#                     compression.append(compute_compression(name))
#     return runtime_values, values, compression
#     # return runtime_values, values


# def prepare_compression(directory,
#                         problem,
#                         method,
#                         distances=('ID', 'CJS'),
#                         threshold_range=(0.3, 0.5, 0.8),
#                         irr_features_range=range(11)):
#     if method == cst.Method.TRIVIAL:
#         name = "TRIVIAL-" + problem
#         print('prepare compression', name)
#         prepare_compression1(directory, name)
#         return
#
#     for dist in distances:
#         for threshold in threshold_range:
#             threshold = str(threshold)
#             if method == cst.Method.PREDEFINED:
#                 counter = 1
#                 while counter < 11:
#                     name = dist + "-" + method.name + "-s" + str(counter) + "-" + threshold + "-" + problem
#                     counter += 1
#
#                     print('prepare compression', name)
#                     prepare_compression1(directory, name)
#
#             # elif method == cst.Method.ORIGINAL:
#             else:
#                 for irr_feat in irr_features_range:
#                     name = dist + "-" + method.name + "-" + threshold + "-" + problem + (
#                         "" if irr_feat == 0 else "-" + str(irr_feat))
#
#                     print('prepare compression', name)
#                     prepare_compression1(directory, name)


def parse_runtimes(name):
    """Collect the runtimes recorded in a log file.

    Every line starting with ``subspace mining runtime:`` or ``full runtime:``
    contributes the number found before ``seconds``. If a full-runtime line is
    met before any value was collected, a ``0`` is inserted first.

    :param name: path of the log file
    :return: list of runtimes in file order, ``[0, 0]`` when the file contains
        no runtime line, or ``None`` when the file does not exist
    """
    collected = []
    try:
        with open(name, "r") as log:
            for entry in log:
                if entry.startswith("subspace mining runtime:"):
                    found = re.search("(?:subspace mining runtime:) (.*)(?: seconds)", entry)
                    collected.append(float(found.group(1)))
                elif entry.startswith("full runtime:"):
                    if not collected:
                        # placeholder for the missing subspace mining runtime
                        collected.append(0)
                    found = re.search("(?:full runtime:) (.*)(?: seconds)", entry)
                    collected.append(float(found.group(1)))
    except FileNotFoundError:
        return None
    return collected if collected else [0, 0]


# def compute_precision_recall_runtime(ideal_cuts, directory, name):
#     data_dir = name.replace("-", "_")
#     cuts = parse_cuts(directory + "/" + data_dir + ".csv/cut.txt")
#     if not cuts:
#         return None
#
#     runtimes = parse_runtimes(directory + "/" + data_dir + ".csv/log.txt")
#     runtime_values = [name]
#     runtime_values.extend(runtimes)
#     values = []
#     for i in range(MAX_DIM_COUNT):
#         if len(ideal_cuts) <= i:
#             break
#         values.append(
#             [name + "-dim" + str(i + 1), disc_precision(ideal_cuts[i], cuts[i]), disc_recall(ideal_cuts[i], cuts[i])])
#     return runtime_values, values


def disc_precision(expected, current):
    """Return the similarity from ``disc_similarity`` divided by ``len(current) + 1``.

    :param expected: reference cut points
    :param current: produced cut points
    """
    total_similarity = disc_similarity(expected, current)[0]
    produced_count = len(current) + 1
    return total_similarity / produced_count

def disc_recall(expected, current):
    """Return the similarity from ``disc_similarity`` divided by ``len(expected) + 1``.

    :param expected: reference cut points
    :param current: produced cut points
    """
    total_similarity = disc_similarity(expected, current)[0]
    # todo should be without + 1
    expected_count = len(expected) + 1
    return total_similarity / expected_count


def disc_f1(expected, current):
    """Return the harmonic mean of discretization precision and recall.

    Precision and recall are computed as in ``disc_precision`` and
    ``disc_recall`` from a single ``disc_similarity`` call.

    :param expected: reference cut points
    :param current: produced cut points
    :return: ``2 * p * r / (p + r)``, or ``0.0`` when ``p + r`` is zero
    """
    similarity = disc_similarity(expected, current)[0]
    recall = similarity / (len(expected) + 1)
    precision = similarity / (len(current) + 1)
    denominator = precision + recall
    if denominator == 0:
        # both share the same numerator, so a zero sum means both are zero
        return 0.0
    return (2 * precision * recall) / denominator


def signal_handler(signal, frame):
    """Signal handler: append the results gathered so far to Compression.csv and exit.

    Reads the module-level ``base_dir`` (set in the ``__main__`` section) and
    ``results`` (filled by ``run_compression``). If ``results`` has not been
    created yet, nothing is written.

    :param signal: signal number (unused; shadows the ``signal`` module inside this function)
    :param frame: current stack frame (unused)
    """
    # print the exact path that is written (the old message omitted the "/")
    out_path = cst.BASE + base_dir + "/Compression.csv"
    print('Writing down Compression.csv')
    print(out_path)
    with open(out_path, "a") as f:
        # the signal may arrive before run_compression has initialised `results`
        f.writelines(globals().get("results", []))
    sys.exit(0)

# Module-level flag. A `global` statement at module scope has no effect, so it
# is dropped. NOTE(review): no code visible in this file reads or updates it.
stop_signal = False

if __name__ == '__main__':
    # Usage: discretization_quality_measure.py <base_dir>
    # <base_dir> is appended to cst.BASE by run_compression and signal_handler.
    if len(sys.argv) < 2:
        sys.exit('Usage: discretization_quality_measure.py <base_dir>')
    # module-level name: signal_handler reads it to locate Compression.csv
    base_dir = sys.argv[1]
    # on Ctrl-C, write the results collected so far before exiting
    signal.signal(signal.SIGINT, signal_handler)
    print('signal registered')
    # compression and classification quality measures
    run_compression(base_dir)
    # if len(sys.argv) == 1:
    #     print(
    #         'Usage: discretization_quality_measure.py '
    #         '-p=<problem> '
    #         '-m=<[original|greedy_topk|trivial|...]> '
    #         '-cor=<[uds]> '
    #         '-dist=<[id, cjs]> '
    #         '-t=<threshold float> '
    #         '-r=<number of rows> ')
    #     command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID'
    #     print('Running default: ', command)
    #     command_list = command.split(' ')
    # else:
    #     command_list = sys.argv[1:]
    #
    # problem_arg = list(filter(lambda x: x.startswith("-p="), command_list))
    # # if not problem_arg:
    # #     raise ValueError('No problem provided!')
    # base_dir_arg = list(filter(lambda x: x.startswith("-b="), command_list))
    # if not base_dir_arg:
    #     raise ValueError('No logs base dir provided!')
    # method_arg = list(filter(lambda x: x.startswith("-m="), command_list))
    # # if not method_arg:
    # #     raise ValueError('No method provided!')
    # distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list))
    # # if not distance_measure_arg:
    # #     raise ValueError('No distance measure provided!')
    # threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list))
    # # if not threshold_arg:
    # #     raise ValueError('No threshold provided!')
    # # irr_feat_start_arg = list(filter(lambda x: x.startswith("-is="), command_list))
    # # irr_feat_end_arg = list(filter(lambda x: x.startswith("-ie="), command_list))
    #
    # base_dir = base_dir_arg[0].replace('-b=', '')
    # if not os.path.exists(base_dir):
    #     os.makedirs(base_dir)
    # if problem_arg:
    #     problem = problem_arg[0].replace('-p=', '')
    # if method_arg:
    #     method = cst.Method[method_arg[0].replace('-m=', '').upper()]
    # if distance_measure_arg:
    #     distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()]
    # if threshold_arg:
    #     threshold = float(threshold_arg[0].replace('-t=', ''))
    #
    # problems = [
    #     # "2d_3_cubes_aligned_xor",
    #     # "2d_2_cubes_aligned",
    #     # "2d_2_cubes_xor",
    #     # "3d_2_cubes_aligned",
    #     # "3d_2_cubes_xor",
    #     # "3d_3_cubes_aligned",
    #     # "3d_3_cubes_aligned_xor",
    #     # "3d_3_cubes_xor",
    #     # "3d_4_cubes_1_aligned_xor",
    #     # "3d_4_cubes_2_aligned",
    #     # "3d_4_cubes_xor",
    #     # "4d_2_cubes_aligned",
    #     # "4d_3_cubes_aligned_xor",
    #     # "4d_3_cubes_xor",
    #     # "4d_4_cubes_aligned_xor",
    #     # "4d_4_cubes_2_aligned",
    #     "4d_4_cubes_xor",
    # ]
    #
    # runtime = []
    # perf = []
    # compression = []
    #
    # cols = ['run-dim', 'precision', 'recall']
    # runtime_cols = ['run', 'subspace mining runtime', 'full runtime']
    # compression_cols = ['run', 'start compression', 'result compression']
    #
    # disc_distances = []
    # for problem in problems:
    #     print('problem:', problem)
    #
    #     for method in [cst.Method.TRIVIAL, cst.Method.ORIGINAL, cst.Method.PREDEFINED]:
    #     # for method in [cst.Method.PERFECT]:
    #         print('method:', method)
    #         data = compute_problem_quality_measure(base_dir, problem, method=method)
    #         if not data:
    #             continue
    #         runtime.extend(data[0])
    #         perf.extend(data[1])
    #         compression.extend(data[2])
    # time = util.now()
    # pd.DataFrame(perf, columns=cols).to_csv(base_dir + "/Precision_recall_" + time + ".csv")
    # pd.DataFrame(runtime, columns=runtime_cols).to_csv(base_dir + "/Discretization_runtimes_" + time + ".csv")
    # pd.DataFrame(compression, columns=compression_cols).to_csv(base_dir + "/Compression_" + time + ".csv")
	import pandas as pd
	import os
	import signal
	import sys
	import constants as cst
	import re
	import subprocess as sp
	import util

	# NOTE(review): no live code visible in this file reads global_min - confirm before removing.
	global_min = -2
	# Dimension limit; in this file only the commented-out
	# compute_precision_recall_runtime helper iterates up to it.
	MAX_DIM_COUNT = 4


	def parse_ideal_cuts(experiment_name):
	    """Load the reference cut points stored for an experiment's dataset.

	    The dataset name comes from ``util.parse_dataset_name`` and the file
	    ``cut_<name>.txt`` is opened inside ``cst.PERFECT_DISCRETIZATIONS_DIR``.
	    Lines starting with ``dimension`` are ignored, a line starting with
	    ``---`` closes the group collected so far, and every other line is
	    parsed as a single float. Values that follow the last ``---`` line are
	    not returned.

	    :param experiment_name: experiment identifier passed to ``util.parse_dataset_name``
	    :return: list of float lists (one per ``---``-terminated group), or
	        ``None`` when the cut file does not exist
	    :raises ValueError: if a value line cannot be parsed as a float
	    """
	    name = util.parse_dataset_name(experiment_name)
	    try:
	        cuts = []
	        with open(cst.PERFECT_DISCRETIZATIONS_DIR + "cut_" + name + ".txt", "r") as f:
	            cut = []
	            for line in f:
	                if line.startswith("dimension"):
	                    # header line, carries no cut value
	                    continue
	                if line.startswith("---"):
	                    # separator: the group collected so far is complete
	                    cuts.append(cut)
	                    cut = []
	                    continue
	                cut.append(float(line.strip()))
	        return cuts
	    except FileNotFoundError:
	        return None


	def _find_min_dist_cut(cut, cuts, start_id=0):
	min_dist = float('Inf')
	min_cut_id = None
	for i, c in enumerate(cuts[start_id:], start=start_id):
	if i > 0 and cuts[i - 1] > c:
	raise ValueError("cuts" + str(cuts) + " is not ordered!")

	temp_dist = abs(c - cut)
	if min_dist > temp_dist:
	min_dist = temp_dist
	min_cut_id = i
	# elif min_cut_id:
	# break
	return min_dist, min_cut_id


	def _find_max_sim_cut(cut, cuts, start_id=0):
	max_sim = -float("Inf")
	max_sim_cut_id = None
	for i, c in enumerate(cuts[start_id:], start=start_id):
	if i > 0 and cuts[i - 1] > c:
	raise ValueError("cuts" + str(cuts) + " is not ordered!")

	if c == cut:
	temp_sim = 1
	else:
	if c > cut:
	middle_dist = (c - cuts[i - 1]) / 2
	temp_sim = (middle_dist - (c - cut)) / middle_dist
	else:
	# if the cut is further ahead the last cut from cuts
	if i == len(cuts) - 1:
	middle_dist = (c - cuts[i - 1]) / 2
	else:
	middle_dist = (cuts[i + 1] - c) / 2
	# if (cuts[i + 1] - c) / 2 > (cut - c):
	temp_sim = (middle_dist - (cut - c)) / middle_dist
	if max_sim < temp_sim:
	max_sim = temp_sim
	max_sim_cut_id = i
	elif max_sim_cut_id is not None and cut < c:
	break
	return max_sim, max_sim_cut_id


	def disc_similarity(expected_cuts, cuts):
	    """Score how well ``cuts`` reproduces ``expected_cuts``.

	    A copy of ``cuts`` is extended at the front with ``min(expected_cuts)``.
	    Each produced cut is matched to its most similar expected cut via
	    ``_find_max_sim_cut``, searching from the previously matched index.
	    Consecutive produced cuts matched to the same expected cut form a group
	    whose similarities are multiplied; the group values are summed.

	    :param expected_cuts: ascending list of reference cut points (non-empty)
	    :param cuts: ascending list of produced cut points; not modified
	    :return: ``(similarity, exp_match)`` - the summed group similarity and
	        the number of groups
	    :raises ValueError: if the produced cuts are not in ascending order
	    """
	    cuts = cuts.copy()
	    cuts.insert(0, min(expected_cuts))

	    similarity = 0
	    sim_exp_cut_id = _find_max_sim_cut(cuts[0], expected_cuts)
	    prev_cut_id = sim_exp_cut_id[1]
	    temp_sim = sim_exp_cut_id[0]
	    exp_match = 0

	    for i, cut in enumerate(cuts[1:], start=1):
	        if i > 0 and cuts[i - 1] > cut:
	            raise ValueError("cuts" + str(cuts) + " is not ordered!")

	        sim_exp_cut_id = _find_max_sim_cut(cut, expected_cuts, prev_cut_id)
	        if sim_exp_cut_id[1] == prev_cut_id:
	            temp_sim *= sim_exp_cut_id[0]
	        else:
	            # a different expected cut was hit: close the running group
	            similarity += temp_sim
	            temp_sim = sim_exp_cut_id[0]
	            exp_match += 1

	        prev_cut_id = sim_exp_cut_id[1]

	    # account for the group that is still open after the last cut
	    similarity += temp_sim
	    exp_match += 1
	    return similarity, exp_match


	# def disc_distance(expected_cuts, cuts):
	# distance = 0
	# dist_exp_cut_id = _find_min_dist_cut(cuts[0], expected_cuts)
	# prev_cut_id = dist_exp_cut_id[1]
	# temp_distance = dist_exp_cut_id[0]
	#
	# counter = 0
	#
	# for i, cut in enumerate(cuts[1:], start=1):
	# if i > 0 and cuts[i - 1] > cut:
	# raise ValueError("cuts" + str(cuts) + " is not ordered!")
	#
	# dist_exp_cut_id = _find_min_dist_cut(cut, expected_cuts, prev_cut_id)
	# if dist_exp_cut_id[1] == prev_cut_id:
	# # counter += 1
	# temp_distance += dist_exp_cut_id[0]
	# else:
	# distance += temp_distance * 2 ** counter
	# temp_distance = dist_exp_cut_id[0]
	# counter = 0
	#
	# prev_cut_id = dist_exp_cut_id[1]
	#
	# distance += temp_distance * 2 ** counter
	#
	# return distance


	# prepare slim db
	def prepare_compression1(experiment_name):
	    """Run the SLIM conversion step for one experiment.

	    Checks that ``<SLIM_DATA_DIR><name>/<name>.dat`` exists, rewrites the
	    ``dbName`` line of ``cst.SLIM_CONVERT_CONF`` in place so it names the
	    experiment, and runs ``cst.SLIM_BIN`` with that configuration.

	    :param experiment_name: name of the experiment directory / dat-file
	    :return: ``True`` on success; ``False`` if the dat-file is missing, the
	        SLIM process exits with an error, or its output contains "exception"
	    """
	    try:
	        dat_file = cst.SLIM_DATA_DIR + experiment_name + "/" + experiment_name + ".dat"
	        if not os.path.exists(dat_file):
	            print("no initial dat-file for experiment", experiment_name)
	            return False

	        # point the converter configuration at this experiment
	        with open(cst.SLIM_CONVERT_CONF, "r+") as conf_file:
	            new_lines = []
	            for line in conf_file:
	                if line.startswith("dbName"):
	                    line = "dbName = [" + experiment_name + "]\n"
	                new_lines.append(line)
	            conf_file.seek(0)
	            conf_file.writelines(new_lines)
	            conf_file.truncate()

	        output = sp.check_output([cst.SLIM_BIN, cst.SLIM_CONVERT_CONF])
	        if "exception" in str(output):
	            print('exception during preparation for', experiment_name)
	            return False

	    except sp.CalledProcessError:
	        print('Prepare compression: conversion failed for', experiment_name)
	        return False
	    return True

	def run_compression1(name, interaction_type=None, rows=None, rf=None, i=None, c=None, offset=None):
	    """Compress one experiment with SLIM and extract the reported sizes.

	    Steps: create the SLIM db via ``prepare_compression1`` if the ``.db``
	    file is missing, rewrite the ``iscName`` line of
	    ``cst.SLIM_COMPRESS_CONF`` in place, run ``cst.SLIM_BIN`` with a 30
	    second timeout, and pull the numbers following ``Start:`` and
	    ``Result:`` out of the process output.

	    :param name: experiment name (directory and file stem under ``cst.SLIM_DATA_DIR``)
	    :param interaction_type: accepted but not used by this function
	    :param rows: accepted but not used by this function
	    :param rf: accepted but not used by this function
	    :param i: accepted but not used by this function
	    :param c: accepted but not used by this function
	    :param offset: accepted but not used by this function
	    :return: ``[name, start, result]`` as strings; ``start``/``result`` are
	        empty strings when preparation fails, SLIM fails or times out, or
	        the value is not found in the output
	    """
	    # 1. check slim db
	    # convert dat-file to db-file if it does not exist
	    if not os.path.exists(cst.SLIM_DATA_DIR + name + "/" + name + ".db"):
	        if not prepare_compression1(name):
	            print("run_compression failed for", name)
	            return [name, "", ""]

	    # 2. modify compress.conf
	    with open(cst.SLIM_COMPRESS_CONF, "r+") as conf_file:
	        new_lines = []
	        for line in conf_file:
	            if line.startswith("iscName"):
	                line = "iscName = " + name + "-all-1d\n"
	            new_lines.append(line)
	        conf_file.seek(0)
	        conf_file.writelines(new_lines)
	        conf_file.truncate()

	    # 3. compress it
	    try:
	        # str() of the bytes output yields its repr, so tabs appear as the
	        # two characters "\t"; the patterns below match that escaped form.
	        output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=30))
	    except sp.TimeoutExpired:
	        print("timeout exceeded", name)
	        return [name, "", ""]
	    except sp.CalledProcessError:
	        return [name, "", ""]

	    # Raw strings: same patterns, without invalid "\d" / "\)" escapes in an
	    # ordinary string literal.
	    search_start = re.search(r'Start:\\t\\t.+?,(\d+)\)', output)
	    if search_start:
	        start_comp = search_start.group(1)
	    else:
	        print("compression start is not found", name)
	        start_comp = ""
	    search_end = re.search(r'Result:\\t\\t.+?,(\d+)\)', output)
	    if search_end:
	        result_comp = search_end.group(1)
	    else:
	        print("compression end is not found", name)
	        result_comp = ""
	    return [name, start_comp, result_comp]


	def run_compression(base_dir):
	    """Compress every experiment directory found below ``cst.BASE + base_dir``.

	    Directories whose name is already present in the data returned by
	    ``util.read_csv`` for ``Compression.csv`` are skipped. Each processed
	    experiment contributes one ``name,start,result`` line to the
	    module-level ``results`` list, which is appended to ``Compression.csv``
	    at the end. ``os.walk`` is used, so sub-directories at every depth are
	    visited.

	    :param base_dir: directory name relative to ``cst.BASE``
	    """
	    global results
	    base_dir = cst.BASE + base_dir + "/"

	    comp_dict = util.read_csv(base_dir + "Compression.csv")
	    results = []
	    for root, dirs, files in os.walk(base_dir):
	        dir_num = len(dirs)
	        counter = 0
	        for experiment_name in dirs:
	            # skip experiments that already have a CSV entry
	            if comp_dict is not None and experiment_name in comp_dict:
	                continue
	            counter += 1
	            print("compressing", experiment_name, counter, "/", dir_num)
	            res = ",".join(run_compression1(experiment_name))
	            results.append(res + "\n")
	    print('processing finished')
	    with open(base_dir + "Compression.csv", "a") as f:
	        f.writelines(results)


	# returns runtime in seconds and mdl of compression
	# def compute_compression(name):
	# escaped_name = util.get_escaped_name(name)
	# # 1. check slim db
	# if not os.path.exists(cst.SLIM_DATA_DIR + escaped_name + "/" + escaped_name + ".db"):
	# print("no slim db file for " + escaped_name)
	# return [name, None]
	#
	# # 2. modify compress.conf
	# with open(cst.SLIM_COMPRESS_CONF, "r+") as conf_file:
	# new_lines = []
	# for line in conf_file:
	# if line.startswith("iscName"):
	# line = "iscName = " + escaped_name + "-all-1d\n"
	# new_lines.append(line)
	# conf_file.seek(0)
	# conf_file.writelines(new_lines)
	# conf_file.truncate()
	#
	# # 3. compress it
	# output = None
	# try:
	# output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=5))
	# except sp.TimeoutExpired:
	# timeout_counter = 0
	# while timeout_counter < 5:
	# try:
	# output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=5))
	# break
	# except sp.TimeoutExpired:
	# timeout_counter += 1
	# if not output:
	# print("timeout exceeded " + str(timeout_counter) + " times for " + name)
	# return [name, None]
	# except sp.CalledProcessError:
	# return [name, None]
	#
	# start_comp = re.search('Start:\\\\t\\\\t.+?,(\d+)\)', output).group(1)
	# result_comp = re.search('Result:\\\\t\\\\t.+?,(\d+)\)', output).group(1)
	# return [name, start_comp, result_comp]


	# def compute_problem_quality_measure(directory,
	# problem,
	# method,
	# distances=('ID', 'CJS'),
	# threshold_range=(0.3, 0.5, 0.8),
	# irr_features_range=range(11)):
	# ideal_cuts = parse_cuts("ideal_disc/cut_" + problem + ".txt")
	# if method == cst.Method.TRIVIAL or method is cst.Method.PERFECT:
	# name = method.name + "-" + problem
	# print('compute_measures', name)
	# values = compute_precision_recall_runtime(ideal_cuts, directory, name)
	# compression = compute_compression(name)
	#
	# if not values:
	# print('no value')
	#
	# return ([values[0]], values[1], [compression]) if values else None
	# # return ([values[0]], values[1]) if values else None
	#
	# runtime_values = []
	# values = []
	# compression = []
	# for dist in distances:
	# for threshold in threshold_range:
	# threshold = str(threshold)
	# if method == cst.Method.PREDEFINED:
	# counter = 1
	# while counter < 11:
	# name = dist + "-" + method.name + "-s" + str(counter) + "-" + threshold + "-" + problem
	# counter += 1
	#
	# print('compute_measures', name)
	# value = compute_precision_recall_runtime(ideal_cuts, directory, name)
	# if not value:
	# print('no value')
	# break
	# runtime_values.append(value[0])
	# values.extend(value[1])
	# compression.append(compute_compression(name))
	#
	# elif method == cst.Method.ORIGINAL:
	# # else:
	# for irr_feat in irr_features_range:
	# name = dist + "-" + method.name + "-" + threshold + "-" + problem + (
	# "" if irr_feat == 0 else "-" + str(irr_feat))
	#
	# print('compute_measures', name)
	# value = compute_precision_recall_runtime(ideal_cuts, directory, name)
	# if not value:
	# print('no value')
	# continue
	# runtime_values.append(value[0])
	# values.extend(value[1])
	# compression.append(compute_compression(name))
	# return runtime_values, values, compression
	# # return runtime_values, values


	# def prepare_compression(directory,
	# problem,
	# method,
	# distances=('ID', 'CJS'),
	# threshold_range=(0.3, 0.5, 0.8),
	# irr_features_range=range(11)):
	# if method == cst.Method.TRIVIAL:
	# name = "TRIVIAL-" + problem
	# print('prepare compression', name)
	# prepare_compression1(directory, name)
	# return
	#
	# for dist in distances:
	# for threshold in threshold_range:
	# threshold = str(threshold)
	# if method == cst.Method.PREDEFINED:
	# counter = 1
	# while counter < 11:
	# name = dist + "-" + method.name + "-s" + str(counter) + "-" + threshold + "-" + problem
	# counter += 1
	#
	# print('prepare compression', name)
	# prepare_compression1(directory, name)
	#
	# # elif method == cst.Method.ORIGINAL:
	# else:
	# for irr_feat in irr_features_range:
	# name = dist + "-" + method.name + "-" + threshold + "-" + problem + (
	# "" if irr_feat == 0 else "-" + str(irr_feat))
	#
	# print('prepare compression', name)
	# prepare_compression1(directory, name)


	def parse_runtimes(name):
	    """Collect the runtimes recorded in a log file.

	    Every line starting with ``subspace mining runtime:`` or
	    ``full runtime:`` contributes the number found before ``seconds``. If a
	    full-runtime line is met before any value was collected, a ``0`` is
	    inserted first.

	    :param name: path of the log file
	    :return: list of runtimes in file order, ``[0, 0]`` when the file
	        contains no runtime line, or ``None`` when the file does not exist
	    """
	    try:
	        runtimes = []
	        with open(name, "r") as f:
	            for line in f:
	                if line.startswith("subspace mining runtime:"):
	                    runtimes.append(float(re.search("(?:subspace mining runtime:) (.*)(?: seconds)", line).group(1)))
	                if line.startswith("full runtime:"):
	                    if len(runtimes) == 0:
	                        # placeholder for the missing subspace mining runtime
	                        runtimes.append(0)
	                    runtimes.append(float(re.search("(?:full runtime:) (.*)(?: seconds)", line).group(1)))
	        if len(runtimes) == 0:
	            return [0, 0]
	        return runtimes
	    except FileNotFoundError:
	        return None


	# def compute_precision_recall_runtime(ideal_cuts, directory, name):
	# data_dir = name.replace("-", "_")
	# cuts = parse_cuts(directory + "/" + data_dir + ".csv/cut.txt")
	# if not cuts:
	# return None
	#
	# runtimes = parse_runtimes(directory + "/" + data_dir + ".csv/log.txt")
	# runtime_values = [name]
	# runtime_values.extend(runtimes)
	# values = []
	# for i in range(MAX_DIM_COUNT):
	# if len(ideal_cuts) <= i:
	# break
	# values.append(
	# [name + "-dim" + str(i + 1), disc_precision(ideal_cuts[i], cuts[i]), disc_recall(ideal_cuts[i], cuts[i])])
	# return runtime_values, values


	def disc_precision(expected, current):
	    """Return the discretization precision of *current* against *expected*.

	    The first element of ``disc_similarity(expected, current)`` is divided
	    by ``len(current) + 1``.
	    """
	    matched = disc_similarity(expected, current)[0]
	    denominator = len(current) + 1
	    return matched / denominator

	def disc_recall(expected, current):
	    """Return the discretization recall of *current* against *expected*.

	    The first element of ``disc_similarity(expected, current)`` is divided
	    by ``len(expected) + 1``.
	    """
	    matched = disc_similarity(expected, current)[0]
	    # todo should be without + 1
	    denominator = len(expected) + 1
	    return matched / denominator


	def disc_f1(expected, current):
	    """Return the F1 score (harmonic mean of precision and recall).

	    Both ratios use the first element of ``disc_similarity(expected,
	    current)``: recall divides it by ``len(expected) + 1`` and precision by
	    ``len(current) + 1``.

	    Returns ``0.0`` when precision and recall sum to zero, instead of
	    raising ``ZeroDivisionError``.
	    """
	    score = disc_similarity(expected, current)[0]
	    recall = score / (len(expected) + 1)
	    precision = score / (len(current) + 1)
	    total = precision + recall
	    if total == 0:
	        # nothing matched at all: F1 is conventionally 0 in this case
	        return 0.0
	    return (2 * precision * recall) / total


	def signal_handler(signal, frame):
	    """Append the collected results to Compression.csv and exit.

	    Intended as a signal handler; neither *signal* nor *frame* is used.
	    The module-level ``results`` lines are appended to
	    ``cst.BASE + base_dir + "/Compression.csv"`` and the process then
	    terminates with exit status 0.
	    """
	    target = cst.BASE + base_dir + "/Compression.csv"
	    print('Writing down Compression.csv')
	    # print exactly the path that is opened below
	    print(target)
	    with open(target, "a") as f:
	        f.writelines(results)
	    sys.exit(0)

	# Module-level stop flag, initially False.
	stop_signal = False

	if __name__ == '__main__':
	    # Fail with a usage hint instead of an IndexError when no argument is given.
	    if len(sys.argv) < 2:
	        sys.exit('Usage: discretization_quality_measure.py <base_dir>')
	    # module-level name; signal_handler reads it to build the output path
	    base_dir = sys.argv[1]
	    signal.signal(signal.SIGINT, signal_handler)
	    print('signal registered')
	    # compression and classification quality measures
	    run_compression(base_dir)
	# if len(sys.argv) == 1:
	# print(
	# 'Usage: discretization_quality_measure.py '
	# '-p=<problem> '
	# '-m=<[original\|greedy_topk\|trivial\|...]> '
	# '-cor=<[uds]> '
	# '-dist=<[id, cjs]> '
	# '-t=<threshold float> '
	# '-r=<number of rows> ')
	# command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID'
	# print('Running default: ', command)
	# command_list = command.split(' ')
	# else:
	# command_list = sys.argv[1:]
	#
	# problem_arg = list(filter(lambda x: x.startswith("-p="), command_list))
	# # if not problem_arg:
	# # raise ValueError('No problem provided!')
	# base_dir_arg = list(filter(lambda x: x.startswith("-b="), command_list))
	# if not base_dir_arg:
	# raise ValueError('No logs base dir provided!')
	# method_arg = list(filter(lambda x: x.startswith("-m="), command_list))
	# # if not method_arg:
	# # raise ValueError('No method provided!')
	# distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list))
	# # if not distance_measure_arg:
	# # raise ValueError('No distance measure provided!')
	# threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list))
	# # if not threshold_arg:
	# # raise ValueError('No threshold provided!')
	# # irr_feat_start_arg = list(filter(lambda x: x.startswith("-is="), command_list))
	# # irr_feat_end_arg = list(filter(lambda x: x.startswith("-ie="), command_list))
	#
	# base_dir = base_dir_arg[0].replace('-b=', '')
	# if not os.path.exists(base_dir):
	# os.makedirs(base_dir)
	# if problem_arg:
	# problem = problem_arg[0].replace('-p=', '')
	# if method_arg:
	# method = cst.Method[method_arg[0].replace('-m=', '').upper()]
	# if distance_measure_arg:
	# distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()]
	# if threshold_arg:
	# threshold = float(threshold_arg[0].replace('-t=', ''))
	#
	# problems = [
	# # "2d_3_cubes_aligned_xor",
	# # "2d_2_cubes_aligned",
	# # "2d_2_cubes_xor",
	# # "3d_2_cubes_aligned",
	# # "3d_2_cubes_xor",
	# # "3d_3_cubes_aligned",
	# # "3d_3_cubes_aligned_xor",
	# # "3d_3_cubes_xor",
	# # "3d_4_cubes_1_aligned_xor",
	# # "3d_4_cubes_2_aligned",
	# # "3d_4_cubes_xor",
	# # "4d_2_cubes_aligned",
	# # "4d_3_cubes_aligned_xor",
	# # "4d_3_cubes_xor",
	# # "4d_4_cubes_aligned_xor",
	# # "4d_4_cubes_2_aligned",
	# "4d_4_cubes_xor",
	# ]
	#
	# runtime = []
	# perf = []
	# compression = []
	#
	# cols = ['run-dim', 'precision', 'recall']
	# runtime_cols = ['run', 'subspace mining runtime', 'full runtime']
	# compression_cols = ['run', 'start compression', 'result compression']
	#
	# disc_distances = []
	# for problem in problems:
	# print('problem:', problem)
	#
	# for method in [cst.Method.TRIVIAL, cst.Method.ORIGINAL, cst.Method.PREDEFINED]:
	# # for method in [cst.Method.PERFECT]:
	# print('method:', method)
	# data = compute_problem_quality_measure(base_dir, problem, method=method)
	# if not data:
	# continue
	# runtime.extend(data[0])
	# perf.extend(data[1])
	# compression.extend(data[2])
	# time = util.now()
	# pd.DataFrame(perf, columns=cols).to_csv(base_dir + "/Precision_recall_" + time + ".csv")
	# pd.DataFrame(runtime, columns=runtime_cols).to_csv(base_dir + "/Discretization_runtimes_" + time + ".csv")
	# pd.DataFrame(compression, columns=compression_cols).to_csv(base_dir + "/Compression_" + time + ".csv")