# discretization_quality_measure.py

import pandas as pd
import os
import sys
import constants as cst
import re
import subprocess as sp
import util

# disc_similarity inserts this value at index 0 of both cut lists before comparing them.
global_min = -2
# compute_precision_recall_runtime scores at most this many dimensions per run.
MAX_DIM_COUNT = 4


def parse_cuts(experiment_name):
    """Read the per-dimension cut points stored for an experiment.

    The part of ``experiment_name`` captured by the pattern
    ``(.+?_.+?_.+?_.+?)_`` selects the file ``cut_<captured>.txt`` inside
    ``cst.PERFECT_DISCRETIZATIONS_DIR``.  Lines starting with ``dimension``
    are skipped, a line starting with ``---`` closes the current dimension and
    every other line is parsed as one float.

    Returns a list holding one list of floats per closed dimension, or
    ``None`` if the file does not exist.  Values that follow the last ``---``
    line are not returned.
    """
    prefix = re.search("(.+?_.+?_.+?_.+?)_", experiment_name).group(1)
    per_dimension = []
    try:
        with open(cst.PERFECT_DISCRETIZATIONS_DIR + "cut_" + prefix + ".txt", "r") as handle:
            current = []
            for raw in handle:
                if raw.startswith("dimension"):
                    continue
                if not raw.startswith("---"):
                    current.append(float(raw.strip()))
                    continue
                # separator line: the dimension collected so far is complete
                per_dimension.append(current)
                current = []
    except FileNotFoundError:
        return None
    return per_dimension


def _find_min_dist_cut(cut, cuts, start_id=0):
    """Find the entry of ``cuts`` closest to ``cut``, scanning from ``start_id``.

    Returns ``(distance, index)`` for the first closest entry, where ``index``
    refers to the full ``cuts`` list.  If nothing is scanned the result is
    ``(inf, None)``.

    Raises:
        ValueError: if a scanned entry is smaller than its predecessor.
    """
    best = (float('Inf'), None)
    for idx, candidate in enumerate(cuts[start_id:], start=start_id):
        if idx > 0 and cuts[idx - 1] > candidate:
            raise ValueError("cuts" + str(cuts) + " is not ordered!")

        gap = abs(candidate - cut)
        # strict comparison keeps the earliest of several equally close entries
        if gap < best[0]:
            best = (gap, idx)
    return best


def _find_max_sim_cut(cut, cuts, start_id=0):
    """Find the entry of ``cuts`` most similar to ``cut``, scanning from ``start_id``.

    An exact hit has similarity 1.  Otherwise the similarity is measured
    against half the gap between the candidate and its neighbour on the side
    of ``cut`` (the previous entry when the candidate is larger, the next
    entry when it is smaller): it is 0 at the midpoint and negative beyond it.

    Returns ``(similarity, index)`` of the first best candidate, or
    ``(-inf, None)`` if nothing is scanned.  Scanning stops at the first
    non-improving candidate that is larger than ``cut``.

    Raises:
        ValueError: if a scanned entry is smaller than its predecessor.

    NOTE: for index 0 the "previous" neighbour is ``cuts[-1]``, and a ``cut``
    above the final entry makes ``cuts[idx + 1]`` raise ``IndexError``.
    """
    best_sim = -float("Inf")
    best_idx = None
    for idx, candidate in enumerate(cuts[start_id:], start=start_id):
        if idx > 0 and cuts[idx - 1] > candidate:
            raise ValueError("cuts" + str(cuts) + " is not ordered!")

        if candidate == cut:
            sim = 1
        elif candidate > cut:
            half_gap = (candidate - cuts[idx - 1]) / 2
            sim = (half_gap - (candidate - cut)) / half_gap
        else:
            half_gap = (cuts[idx + 1] - candidate) / 2
            sim = (half_gap - (cut - candidate)) / half_gap

        if best_sim < sim:
            best_sim, best_idx = sim, idx
        elif best_idx is not None and cut < candidate:
            # already past ``cut`` without improving: stop scanning
            break
    return best_sim, best_idx


def disc_similarity(expected_cuts, cuts):
    """Score how well ``cuts`` reproduce ``expected_cuts``.

    Both lists are copied and prefixed with ``global_min``.  Every cut is
    matched to its most similar expected cut (searching from the previous
    match onwards).  Consecutive cuts matched to the same expected cut form a
    group whose similarities are multiplied; the group products are summed.

    Returns ``(similarity, exp_match)``: the summed similarity and the number
    of groups.

    Raises:
        ValueError: if ``cuts`` (with the prefix) is not in ascending order.
    """
    found = cuts.copy()
    found.insert(0, global_min)
    reference = expected_cuts.copy()
    reference.insert(0, global_min)

    group_sim, matched_id = _find_max_sim_cut(found[0], reference)
    total = 0
    groups = 0
    for pos in range(1, len(found)):
        cut = found[pos]
        if found[pos - 1] > cut:
            raise ValueError("cuts" + str(found) + " is not ordered!")

        sim, cut_id = _find_max_sim_cut(cut, reference, matched_id)
        if cut_id == matched_id:
            # still the same expected cut: fold into the running product
            group_sim *= sim
        else:
            total += group_sim
            groups += 1
            group_sim = sim
        matched_id = cut_id

    # account for the group that was still open when the loop ended
    return total + group_sim, groups + 1


def disc_distance(expected_cuts, cuts):
    """Sum the distances from each cut to its nearest expected cut.

    Each cut is matched with ``_find_min_dist_cut``, searching from the
    previously matched expected cut onwards.  Distances of consecutive cuts
    matched to the same expected cut are accumulated per group and the group
    totals are added up.

    Raises:
        ValueError: if ``cuts`` is not in ascending order.
    """
    group_dist, matched_id = _find_min_dist_cut(cuts[0], expected_cuts)
    total = 0
    for pos in range(1, len(cuts)):
        cut = cuts[pos]
        if cuts[pos - 1] > cut:
            raise ValueError("cuts" + str(cuts) + " is not ordered!")

        dist, cut_id = _find_min_dist_cut(cut, expected_cuts, matched_id)
        if cut_id == matched_id:
            group_dist += dist
        else:
            # the per-group weight of the original was 2 ** 0, i.e. always 1
            total += group_dist
            group_dist = dist
        matched_id = cut_id

    return total + group_dist


# prepare slim db
def prepare_compression1(experiment_name):
    """Run the SLIM conversion step for one experiment.

    Requires ``<SLIM_DATA_DIR><name>/<name>.dat`` to exist.  The ``dbName``
    line of ``cst.SLIM_CONVERT_CONF`` is rewritten in place to name the
    experiment, then ``cst.SLIM_BIN`` is run with that configuration.

    Returns ``True`` on success and ``False`` (after printing a message) when
    the dat-file is missing, the process exits with an error, or its output
    contains the word ``exception``.
    """
    dat_file = cst.SLIM_DATA_DIR + experiment_name + "/" + experiment_name + ".dat"
    if not os.path.exists(dat_file):
        print("no initial dat-file for experiment", experiment_name)
        return False

    # point the shared converter configuration at this experiment
    with open(cst.SLIM_CONVERT_CONF, "r+") as conf_file:
        rewritten = [
            "dbName = [" + experiment_name + "]\n" if entry.startswith("dbName") else entry
            for entry in conf_file
        ]
        conf_file.seek(0)
        conf_file.writelines(rewritten)
        conf_file.truncate()

    try:
        output = sp.check_output([cst.SLIM_BIN, cst.SLIM_CONVERT_CONF])
    except sp.CalledProcessError:
        print('Prepare compression: conversion failed for', experiment_name)
        return False

    if "exception" in str(output):
        print('exception during preparation for', experiment_name)
        return False
    return True

def run_compression1(name, it=None, rf=None, i=None, type=None, c=None):
    """Compress one experiment with SLIM and extract the reported sizes.

    ``it``, ``rf``, ``i``, ``type`` and ``c`` are accepted but not used here.

    Steps: make sure ``<name>.db`` exists (converting the dat-file through
    ``prepare_compression1`` if needed), rewrite the ``iscName`` line of
    ``cst.SLIM_COMPRESS_CONF`` in place, then run ``cst.SLIM_BIN`` with a
    30 second timeout.

    Returns ``[name, start, result]`` where ``start`` and ``result`` are the
    digit strings captured from the ``Start:`` and ``Result:`` lines of the
    output.  A field is ``""`` if its line is not found; both are ``""`` when
    preparation fails, the run times out or the process exits with an error.
    """
    # 1. check slim db; convert dat-file to db-file if it does not exist
    if not os.path.exists(cst.SLIM_DATA_DIR + name + "/" + name + ".db"):
        if not prepare_compression1(name):
            print("run_compression failed for", name)
            return [name, "", ""]

    # 2. modify compress.conf
    with open(cst.SLIM_COMPRESS_CONF, "r+") as conf_file:
        new_lines = []
        for line in conf_file:
            if line.startswith("iscName"):
                line = "iscName = " + name + "-all-1d\n"
            new_lines.append(line)
        conf_file.seek(0)
        conf_file.writelines(new_lines)
        conf_file.truncate()

    # 3. compress it
    try:
        output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=30))
    except sp.TimeoutExpired:
        print("timeout exceeded", name)
        return [name, "", ""]
    except sp.CalledProcessError:
        return [name, "", ""]

    # ``output`` is the repr of a bytes object, so a tab appears as the two
    # characters backslash + "t"; raw strings keep the patterns free of
    # invalid escape sequences.
    search_start = re.search(r'Start:\\t\\t.+?,(\d+)\)', output)
    if search_start:
        start_comp = search_start.group(1)
    else:
        print("compression start is not found", name)
        start_comp = ""
    search_end = re.search(r'Result:\\t\\t.+?,(\d+)\)', output)
    if search_end:
        result_comp = search_end.group(1)
    else:
        print("compression end is not found", name)
        result_comp = ""
    return [name, start_comp, result_comp]


def run_compression():
    """Hand ``run_compression1`` to ``util.collect_params`` and return its result."""
    return util.collect_params(run_compression1)


# returns runtime in seconds and mdl of compression
def compute_compression(name):
    """Compress an already converted experiment and extract the reported sizes.

    The SLIM database of ``util.get_escaped_name(name)`` must already exist.
    The ``iscName`` line of ``cst.SLIM_COMPRESS_CONF`` is rewritten in place
    and ``cst.SLIM_BIN`` is run with a 5 second timeout; a timed-out run is
    retried up to five more times.

    Returns ``[name, start, result]`` with the digit strings captured from the
    ``Start:`` and ``Result:`` lines of the output, or ``[name, None]`` when
    the database is missing, every attempt timed out, the process exits with
    an error, or one of the two lines cannot be found in the output.
    """
    escaped_name = util.get_escaped_name(name)
    # 1. check slim db
    if not os.path.exists(cst.SLIM_DATA_DIR + escaped_name + "/" + escaped_name + ".db"):
        print("no slim db file for " + escaped_name)
        return [name, None]

    # 2. modify compress.conf
    with open(cst.SLIM_COMPRESS_CONF, "r+") as conf_file:
        new_lines = []
        for line in conf_file:
            if line.startswith("iscName"):
                line = "iscName = " + escaped_name + "-all-1d\n"
            new_lines.append(line)
        conf_file.seek(0)
        conf_file.writelines(new_lines)
        conf_file.truncate()

    # 3. compress it: one initial attempt plus the retries after timeouts
    max_retries = 5
    output = None
    for _ in range(1 + max_retries):
        try:
            output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=5))
            break
        except sp.TimeoutExpired:
            continue
        except sp.CalledProcessError:
            # also covers a failure during a retry, which used to propagate
            return [name, None]
    if output is None:
        print("timeout exceeded " + str(max_retries) + " times for " + name)
        return [name, None]

    # ``output`` is the repr of a bytes object, so a tab appears as the two
    # characters backslash + "t".
    start_match = re.search(r'Start:\\t\\t.+?,(\d+)\)', output)
    result_match = re.search(r'Result:\\t\\t.+?,(\d+)\)', output)
    if start_match is None or result_match is None:
        # report a failed run instead of crashing on ``None.group``
        print("compression result is not found", name)
        return [name, None]
    return [name, start_match.group(1), result_match.group(1)]


def compute_problem_quality_measure(directory,
                                    problem,
                                    method,
                                    distances=('ID', 'CJS'),
                                    threshold_range=(0.3, 0.5, 0.8),
                                    irr_features_range=range(11)):
    """Collect runtime, precision/recall and compression rows for one problem.

    For ``TRIVIAL`` and ``PERFECT`` a single run named ``<method>-<problem>``
    is measured; the result is ``([runtime_row], value_rows, [compression])``
    or ``None`` when the run produced no values.

    For ``PREDEFINED`` runs ``s1`` .. ``s10`` are measured per distance and
    threshold, stopping at the first run without values.  For ``ORIGINAL`` one
    run per entry of ``irr_features_range`` is measured (no suffix for 0) and
    runs without values are skipped.  Any other method yields empty lists.

    Returns ``(runtime_values, values, compression)`` in those cases.
    """
    ideal_cuts = parse_cuts("ideal_disc/cut_" + problem + ".txt")
    if method == cst.Method.TRIVIAL or method is cst.Method.PERFECT:
        name = method.name + "-" + problem
        print('compute_measures', name)
        single = compute_precision_recall_runtime(ideal_cuts, directory, name)
        single_compression = compute_compression(name)
        if not single:
            print('no value')
            return None
        return [single[0]], single[1], [single_compression]

    runtime_values, values, compression = [], [], []

    def collect(name):
        """Measure one run; return False when it produced no values."""
        print('compute_measures', name)
        value = compute_precision_recall_runtime(ideal_cuts, directory, name)
        if not value:
            print('no value')
            return False
        runtime_values.append(value[0])
        values.extend(value[1])
        compression.append(compute_compression(name))
        return True

    for dist in distances:
        for threshold in map(str, threshold_range):
            if method == cst.Method.PREDEFINED:
                for counter in range(1, 11):
                    run = dist + "-" + method.name + "-s" + str(counter) + "-" + threshold + "-" + problem
                    if not collect(run):
                        # higher counters are not attempted after a miss
                        break
            elif method == cst.Method.ORIGINAL:
                for irr_feat in irr_features_range:
                    suffix = "" if irr_feat == 0 else "-" + str(irr_feat)
                    collect(dist + "-" + method.name + "-" + threshold + "-" + problem + suffix)
    return runtime_values, values, compression


def prepare_compression(directory,
                        problem,
                        method,
                        distances=('ID', 'CJS'),
                        threshold_range=(0.3, 0.5, 0.8),
                        irr_features_range=range(11)):
    """Run ``prepare_compression1`` for every run name of a problem/method pair.

    ``TRIVIAL`` prepares the single run ``TRIVIAL-<problem>``.  ``PREDEFINED``
    prepares runs ``s1`` .. ``s10`` per distance and threshold.  Every other
    method prepares one run per entry of ``irr_features_range`` (no suffix
    for 0).

    ``directory`` is kept for call compatibility and is not used:
    ``prepare_compression1`` takes only the run name, so passing the directory
    as well raised ``TypeError`` on every call.
    """
    if method == cst.Method.TRIVIAL:
        name = "TRIVIAL-" + problem
        print('prepare compression', name)
        prepare_compression1(name)
        return

    for dist in distances:
        for threshold in threshold_range:
            threshold = str(threshold)
            if method == cst.Method.PREDEFINED:
                names = [dist + "-" + method.name + "-s" + str(counter) + "-" + threshold + "-" + problem
                         for counter in range(1, 11)]
            else:
                names = [dist + "-" + method.name + "-" + threshold + "-" + problem
                         + ("" if irr_feat == 0 else "-" + str(irr_feat))
                         for irr_feat in irr_features_range]

            for name in names:
                print('prepare compression', name)
                prepare_compression1(name)


def parse_runtimes(name):
    """Extract the runtimes recorded in the log file ``name``.

    Every line starting with ``subspace mining runtime:`` or ``full runtime:``
    contributes the float found before `` seconds``, in file order.  A
    ``full runtime:`` line seen before any other value is preceded by a ``0``.

    Returns the list of values, ``[0, 0]`` if no runtime line is present, or
    ``None`` if the file does not exist.
    """
    collected = []
    try:
        with open(name, "r") as log:
            for entry in log:
                if entry.startswith("subspace mining runtime:"):
                    found = re.search("(?:subspace mining runtime:) (.*)(?: seconds)", entry)
                    collected.append(float(found.group(1)))
                if entry.startswith("full runtime:"):
                    if not collected:
                        # no subspace mining runtime was logged before this line
                        collected.append(0)
                    found = re.search("(?:full runtime:) (.*)(?: seconds)", entry)
                    collected.append(float(found.group(1)))
    except FileNotFoundError:
        return None
    return collected if collected else [0, 0]


def compute_precision_recall_runtime(ideal_cuts, directory, name):
    """Build the runtime row and the per-dimension precision/recall rows of a run.

    The run's files are looked up under ``<directory>/<name with "-" replaced
    by "_">.csv/``.  Returns ``None`` when ``parse_cuts`` yields nothing for
    the run; otherwise ``(runtime_values, values)`` where ``runtime_values``
    is ``[name, *runtimes]`` and ``values`` has one
    ``[<name>-dim<k>, precision, recall]`` row for each of the first
    ``MAX_DIM_COUNT`` dimensions of ``ideal_cuts``.

    When the log file is missing (``parse_runtimes`` returns ``None``) the two
    runtimes are reported as ``None`` instead of failing on ``extend(None)``.
    """
    data_dir = name.replace("-", "_")
    cuts = parse_cuts(directory + "/" + data_dir + ".csv/cut.txt")
    if not cuts:
        return None

    runtimes = parse_runtimes(directory + "/" + data_dir + ".csv/log.txt")
    if runtimes is None:
        # keep the row shape but mark both runtimes as unknown
        runtimes = [None, None]
    runtime_values = [name, *runtimes]

    values = [
        [name + "-dim" + str(i + 1), disc_precision(expected, cuts[i]), disc_recall(expected, cuts[i])]
        for i, expected in enumerate(ideal_cuts[:MAX_DIM_COUNT])
    ]
    return runtime_values, values


def disc_precision(expected, current):
    """Return the summed similarity divided by ``len(current) + 1``."""
    total_similarity, _ = disc_similarity(expected, current)
    return total_similarity / (len(current) + 1)


def disc_recall(expected, current):
    """Return the summed similarity divided by ``len(expected) + 1``."""
    total_similarity, _ = disc_similarity(expected, current)
    return total_similarity / (len(expected) + 1)


def disc_f1(expected, current):
    """Return the harmonic mean of precision and recall of ``current``.

    Precision and recall are the summed similarity divided by
    ``len(current) + 1`` and ``len(expected) + 1`` respectively.  When both
    are zero the result is ``0.0`` instead of a ``ZeroDivisionError``.
    """
    total_similarity = disc_similarity(expected, current)[0]
    recall = total_similarity / (len(expected) + 1)
    precision = total_similarity / (len(current) + 1)
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return (2 * precision * recall) / denominator


# Script entry point: only the compression run is active; the precision/recall
# driver that follows is commented out.
if __name__ == '__main__':
    # compression and classification quality measures
    run_compression()
    # if len(sys.argv) == 1:
    #     print(
    #         'Usage: discretization_quality_measure.py '
    #         '-p=<problem> '
    #         '-m=<[original|greedy_topk|trivial|...]> '
    #         '-cor=<[uds]> '
    #         '-dist=<[id, cjs]> '
    #         '-t=<threshold float> '
    #         '-r=<number of rows> ')
    #     command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID'
    #     print('Running default: ', command)
    #     command_list = command.split(' ')
    # else:
    #     command_list = sys.argv[1:]
    #
    # problem_arg = list(filter(lambda x: x.startswith("-p="), command_list))
    # # if not problem_arg:
    # #     raise ValueError('No problem provided!')
    # base_dir_arg = list(filter(lambda x: x.startswith("-b="), command_list))
    # if not base_dir_arg:
    #     raise ValueError('No logs base dir provided!')
    # method_arg = list(filter(lambda x: x.startswith("-m="), command_list))
    # # if not method_arg:
    # #     raise ValueError('No method provided!')
    # distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list))
    # # if not distance_measure_arg:
    # #     raise ValueError('No distance measure provided!')
    # threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list))
    # # if not threshold_arg:
    # #     raise ValueError('No threshold provided!')
    # # irr_feat_start_arg = list(filter(lambda x: x.startswith("-is="), command_list))
    # # irr_feat_end_arg = list(filter(lambda x: x.startswith("-ie="), command_list))
    #
    # base_dir = base_dir_arg[0].replace('-b=', '')
    # if not os.path.exists(base_dir):
    #     os.makedirs(base_dir)
    # if problem_arg:
    #     problem = problem_arg[0].replace('-p=', '')
    # if method_arg:
    #     method = cst.Method[method_arg[0].replace('-m=', '').upper()]
    # if distance_measure_arg:
    #     distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()]
    # if threshold_arg:
    #     threshold = float(threshold_arg[0].replace('-t=', ''))
    #
    # problems = [
    #     # "2d_3_cubes_aligned_xor",
    #     # "2d_2_cubes_aligned",
    #     # "2d_2_cubes_xor",
    #     # "3d_2_cubes_aligned",
    #     # "3d_2_cubes_xor",
    #     # "3d_3_cubes_aligned",
    #     # "3d_3_cubes_aligned_xor",
    #     # "3d_3_cubes_xor",
    #     # "3d_4_cubes_1_aligned_xor",
    #     # "3d_4_cubes_2_aligned",
    #     # "3d_4_cubes_xor",
    #     # "4d_2_cubes_aligned",
    #     # "4d_3_cubes_aligned_xor",
    #     # "4d_3_cubes_xor",
    #     # "4d_4_cubes_aligned_xor",
    #     # "4d_4_cubes_2_aligned",
    #     "4d_4_cubes_xor",
    # ]
    #
    # runtime = []
    # perf = []
    # compression = []
    #
    # cols = ['run-dim', 'precision', 'recall']
    # runtime_cols = ['run', 'subspace mining runtime', 'full runtime']
    # compression_cols = ['run', 'start compression', 'result compression']
    #
    # disc_distances = []
    # for problem in problems:
    #     print('problem:', problem)
    #
    #     for method in [cst.Method.TRIVIAL, cst.Method.ORIGINAL, cst.Method.PREDEFINED]:
    #     # for method in [cst.Method.PERFECT]:
    #         print('method:', method)
    #         data = compute_problem_quality_measure(base_dir, problem, method=method)
    #         if not data:
    #             continue
    #         runtime.extend(data[0])
    #         perf.extend(data[1])
    #         compression.extend(data[2])
    # time = util.now()
    # pd.DataFrame(perf, columns=cols).to_csv(base_dir + "/Precision_recall_" + time + ".csv")
    # pd.DataFrame(runtime, columns=runtime_cols).to_csv(base_dir + "/Discretization_runtimes_" + time + ".csv")
    # pd.DataFrame(compression, columns=compression_cols).to_csv(base_dir + "/Compression_" + time + ".csv")
	import pandas as pd
	import os
	import sys
	import constants as cst
	import re
	import subprocess as sp
	import util

	# disc_similarity inserts this value at index 0 of both cut lists before comparing them.
	global_min = -2
	# compute_precision_recall_runtime scores at most this many dimensions per run.
	MAX_DIM_COUNT = 4


	def parse_cuts(experiment_name):
	    """Read the per-dimension cut points stored for an experiment.

	    The part of ``experiment_name`` captured by ``(.+?_.+?_.+?_.+?)_``
	    selects ``cut_<captured>.txt`` in ``cst.PERFECT_DISCRETIZATIONS_DIR``.
	    ``dimension`` lines are skipped, a ``---`` line closes the current
	    dimension, every other line is parsed as one float.

	    Returns one list of floats per closed dimension, or ``None`` if the
	    file does not exist.
	    """
	    name = re.search("(.+?_.+?_.+?_.+?)_", experiment_name).group(1)
	    try:
	        cuts = []
	        with open(cst.PERFECT_DISCRETIZATIONS_DIR + "cut_" + name + ".txt", "r") as f:
	            cut = []
	            for line in f:
	                if line.startswith("dimension"):
	                    continue
	                if line.startswith("---"):
	                    cuts.append(cut)
	                    cut = []
	                    continue
	                cut.append(float(line.strip()))
	        return cuts
	    except FileNotFoundError:
	        return None


	def _find_min_dist_cut(cut, cuts, start_id=0):
	    """Find the entry of ``cuts`` closest to ``cut``, scanning from ``start_id``.

	    Returns ``(distance, index)`` for the first closest entry, or
	    ``(inf, None)`` if nothing is scanned.

	    Raises:
	        ValueError: if a scanned entry is smaller than its predecessor.
	    """
	    min_dist = float('Inf')
	    min_cut_id = None
	    for i, c in enumerate(cuts[start_id:], start=start_id):
	        if i > 0 and cuts[i - 1] > c:
	            raise ValueError("cuts" + str(cuts) + " is not ordered!")

	        temp_dist = abs(c - cut)
	        if min_dist > temp_dist:
	            min_dist = temp_dist
	            min_cut_id = i
	    return min_dist, min_cut_id


	def _find_max_sim_cut(cut, cuts, start_id=0):
	    """Find the entry of ``cuts`` most similar to ``cut``, scanning from ``start_id``.

	    An exact hit has similarity 1; otherwise the similarity is measured
	    against half the gap to the neighbouring entry on the side of ``cut``
	    (0 at the midpoint, negative beyond it).

	    Returns ``(similarity, index)`` of the first best candidate, or
	    ``(-inf, None)`` if nothing is scanned.

	    Raises:
	        ValueError: if a scanned entry is smaller than its predecessor.
	    """
	    max_sim = -float("Inf")
	    max_sim_cut_id = None
	    for i, c in enumerate(cuts[start_id:], start=start_id):
	        if i > 0 and cuts[i - 1] > c:
	            raise ValueError("cuts" + str(cuts) + " is not ordered!")

	        if c == cut:
	            temp_sim = 1
	        elif c > cut:
	            temp_sim = ((c - cuts[i - 1]) / 2 - (c - cut)) / ((c - cuts[i - 1]) / 2)
	        else:
	            temp_sim = ((cuts[i + 1] - c) / 2 - (cut - c)) / ((cuts[i + 1] - c) / 2)
	        if max_sim < temp_sim:
	            max_sim = temp_sim
	            max_sim_cut_id = i
	        elif max_sim_cut_id is not None and cut < c:
	            # already past ``cut`` without improving: stop scanning
	            break
	    return max_sim, max_sim_cut_id


	def disc_similarity(expected_cuts, cuts):
	    """Score how well ``cuts`` reproduce ``expected_cuts``.

	    Both lists are copied and prefixed with ``global_min``.  Consecutive
	    cuts matched to the same expected cut form a group whose similarities
	    are multiplied; the group products are summed.

	    Returns ``(similarity, exp_match)``: the sum and the number of groups.

	    Raises:
	        ValueError: if ``cuts`` (with the prefix) is not ascending.
	    """
	    cuts = cuts.copy()
	    cuts.insert(0, global_min)
	    expected_cuts = expected_cuts.copy()
	    expected_cuts.insert(0, global_min)

	    similarity = 0
	    temp_sim, prev_cut_id = _find_max_sim_cut(cuts[0], expected_cuts)
	    exp_match = 0

	    for i, cut in enumerate(cuts[1:], start=1):
	        if cuts[i - 1] > cut:
	            raise ValueError("cuts" + str(cuts) + " is not ordered!")

	        sim, cut_id = _find_max_sim_cut(cut, expected_cuts, prev_cut_id)
	        if cut_id == prev_cut_id:
	            temp_sim *= sim
	        else:
	            similarity += temp_sim
	            temp_sim = sim
	            exp_match += 1

	        prev_cut_id = cut_id

	    # account for the group that was still open when the loop ended
	    similarity += temp_sim
	    exp_match += 1
	    return similarity, exp_match


	def disc_distance(expected_cuts, cuts):
	    """Sum the distances from each cut to its nearest expected cut.

	    Each cut is matched with ``_find_min_dist_cut``, searching from the
	    previously matched expected cut onwards.

	    Raises:
	        ValueError: if ``cuts`` is not in ascending order.
	    """
	    distance = 0
	    temp_distance, prev_cut_id = _find_min_dist_cut(cuts[0], expected_cuts)

	    for i, cut in enumerate(cuts[1:], start=1):
	        if cuts[i - 1] > cut:
	            raise ValueError("cuts" + str(cuts) + " is not ordered!")

	        dist, cut_id = _find_min_dist_cut(cut, expected_cuts, prev_cut_id)
	        if cut_id == prev_cut_id:
	            temp_distance += dist
	        else:
	            distance += temp_distance
	            temp_distance = dist

	        prev_cut_id = cut_id

	    distance += temp_distance
	    return distance


	# prepare slim db
	def prepare_compression1(experiment_name):
	    """Run the SLIM conversion step for one experiment.

	    Rewrites the ``dbName`` line of ``cst.SLIM_CONVERT_CONF`` in place and
	    runs ``cst.SLIM_BIN`` with it.  Returns ``True`` on success, ``False``
	    (after printing a message) when the dat-file is missing, the process
	    fails, or its output contains ``exception``.
	    """
	    dat_file = cst.SLIM_DATA_DIR + experiment_name + "/" + experiment_name + ".dat"
	    if not os.path.exists(dat_file):
	        print("no initial dat-file for experiment", experiment_name)
	        return False

	    with open(cst.SLIM_CONVERT_CONF, "r+") as conf_file:
	        new_lines = []
	        for line in conf_file:
	            if line.startswith("dbName"):
	                line = "dbName = [" + experiment_name + "]\n"
	            new_lines.append(line)
	        conf_file.seek(0)
	        conf_file.writelines(new_lines)
	        conf_file.truncate()

	    try:
	        output = sp.check_output([cst.SLIM_BIN, cst.SLIM_CONVERT_CONF])
	    except sp.CalledProcessError:
	        print('Prepare compression: conversion failed for', experiment_name)
	        return False

	    if "exception" in str(output):
	        print('exception during preparation for', experiment_name)
	        return False
	    return True

	def run_compression1(name, it=None, rf=None, i=None, type=None, c=None):
	    """Compress one experiment with SLIM and extract the reported sizes.

	    ``it``, ``rf``, ``i``, ``type`` and ``c`` are accepted but not used.
	    Returns ``[name, start, result]``; a field is ``""`` if it cannot be
	    found, and both are ``""`` when preparation fails, the run times out
	    (30 seconds) or the process exits with an error.
	    """
	    # 1. check slim db; convert dat-file to db-file if it does not exist
	    if not os.path.exists(cst.SLIM_DATA_DIR + name + "/" + name + ".db"):
	        if not prepare_compression1(name):
	            print("run_compression failed for", name)
	            return [name, "", ""]

	    # 2. modify compress.conf
	    with open(cst.SLIM_COMPRESS_CONF, "r+") as conf_file:
	        new_lines = []
	        for line in conf_file:
	            if line.startswith("iscName"):
	                line = "iscName = " + name + "-all-1d\n"
	            new_lines.append(line)
	        conf_file.seek(0)
	        conf_file.writelines(new_lines)
	        conf_file.truncate()

	    # 3. compress it
	    try:
	        output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=30))
	    except sp.TimeoutExpired:
	        print("timeout exceeded", name)
	        return [name, "", ""]
	    except sp.CalledProcessError:
	        return [name, "", ""]

	    # raw strings avoid the invalid "\d" / "\)" escapes of the plain literals
	    search_start = re.search(r'Start:\\t\\t.+?,(\d+)\)', output)
	    if search_start:
	        start_comp = search_start.group(1)
	    else:
	        print("compression start is not found", name)
	        start_comp = ""
	    search_end = re.search(r'Result:\\t\\t.+?,(\d+)\)', output)
	    if search_end:
	        result_comp = search_end.group(1)
	    else:
	        print("compression end is not found", name)
	        result_comp = ""
	    return [name, start_comp, result_comp]


	def run_compression():
	    """Hand ``run_compression1`` to ``util.collect_params`` and return its result."""
	    results = util.collect_params(run_compression1)
	    return results


	# returns runtime in seconds and mdl of compression
	def compute_compression(name):
	    """Compress an already converted experiment and extract the reported sizes.

	    Runs ``cst.SLIM_BIN`` with a 5 second timeout, retrying a timed-out run
	    up to five more times.  Returns ``[name, start, result]`` on success and
	    ``[name, None]`` when the database is missing, every attempt timed out,
	    the process fails, or the ``Start:``/``Result:`` lines are not found.
	    """
	    escaped_name = util.get_escaped_name(name)
	    # 1. check slim db
	    if not os.path.exists(cst.SLIM_DATA_DIR + escaped_name + "/" + escaped_name + ".db"):
	        print("no slim db file for " + escaped_name)
	        return [name, None]

	    # 2. modify compress.conf
	    with open(cst.SLIM_COMPRESS_CONF, "r+") as conf_file:
	        new_lines = []
	        for line in conf_file:
	            if line.startswith("iscName"):
	                line = "iscName = " + escaped_name + "-all-1d\n"
	            new_lines.append(line)
	        conf_file.seek(0)
	        conf_file.writelines(new_lines)
	        conf_file.truncate()

	    # 3. compress it: one initial attempt plus the retries after timeouts
	    max_retries = 5
	    output = None
	    for _ in range(1 + max_retries):
	        try:
	            output = str(sp.check_output([cst.SLIM_BIN, cst.SLIM_COMPRESS_CONF], timeout=5))
	            break
	        except sp.TimeoutExpired:
	            continue
	        except sp.CalledProcessError:
	            return [name, None]
	    if output is None:
	        print("timeout exceeded " + str(max_retries) + " times for " + name)
	        return [name, None]

	    start_match = re.search(r'Start:\\t\\t.+?,(\d+)\)', output)
	    result_match = re.search(r'Result:\\t\\t.+?,(\d+)\)', output)
	    if start_match is None or result_match is None:
	        # report a failed run instead of crashing on ``None.group``
	        print("compression result is not found", name)
	        return [name, None]
	    return [name, start_match.group(1), result_match.group(1)]


	def compute_problem_quality_measure(directory,
	                                    problem,
	                                    method,
	                                    distances=('ID', 'CJS'),
	                                    threshold_range=(0.3, 0.5, 0.8),
	                                    irr_features_range=range(11)):
	    """Collect runtime, precision/recall and compression rows for one problem.

	    ``TRIVIAL``/``PERFECT``: one run; returns
	    ``([runtime_row], value_rows, [compression])`` or ``None`` without values.
	    ``PREDEFINED``: runs ``s1`` .. ``s10`` per distance and threshold, stopping
	    at the first run without values.  ``ORIGINAL``: one run per entry of
	    ``irr_features_range``, skipping runs without values.

	    Returns ``(runtime_values, values, compression)`` in the latter cases.
	    """
	    ideal_cuts = parse_cuts("ideal_disc/cut_" + problem + ".txt")
	    if method == cst.Method.TRIVIAL or method is cst.Method.PERFECT:
	        name = method.name + "-" + problem
	        print('compute_measures', name)
	        values = compute_precision_recall_runtime(ideal_cuts, directory, name)
	        compression = compute_compression(name)

	        if not values:
	            print('no value')

	        return ([values[0]], values[1], [compression]) if values else None

	    runtime_values = []
	    values = []
	    compression = []
	    for dist in distances:
	        for threshold in threshold_range:
	            threshold = str(threshold)
	            if method == cst.Method.PREDEFINED:
	                for counter in range(1, 11):
	                    name = dist + "-" + method.name + "-s" + str(counter) + "-" + threshold + "-" + problem

	                    print('compute_measures', name)
	                    value = compute_precision_recall_runtime(ideal_cuts, directory, name)
	                    if not value:
	                        print('no value')
	                        break
	                    runtime_values.append(value[0])
	                    values.extend(value[1])
	                    compression.append(compute_compression(name))

	            elif method == cst.Method.ORIGINAL:
	                for irr_feat in irr_features_range:
	                    name = dist + "-" + method.name + "-" + threshold + "-" + problem + (
	                        "" if irr_feat == 0 else "-" + str(irr_feat))

	                    print('compute_measures', name)
	                    value = compute_precision_recall_runtime(ideal_cuts, directory, name)
	                    if not value:
	                        print('no value')
	                        continue
	                    runtime_values.append(value[0])
	                    values.extend(value[1])
	                    compression.append(compute_compression(name))
	    return runtime_values, values, compression


	def prepare_compression(directory,
	                        problem,
	                        method,
	                        distances=('ID', 'CJS'),
	                        threshold_range=(0.3, 0.5, 0.8),
	                        irr_features_range=range(11)):
	    """Run ``prepare_compression1`` for every run name of a problem/method pair.

	    ``directory`` is kept for call compatibility and is not used:
	    ``prepare_compression1`` takes only the run name, so passing the
	    directory as well raised ``TypeError`` on every call.
	    """
	    if method == cst.Method.TRIVIAL:
	        name = "TRIVIAL-" + problem
	        print('prepare compression', name)
	        prepare_compression1(name)
	        return

	    for dist in distances:
	        for threshold in threshold_range:
	            threshold = str(threshold)
	            if method == cst.Method.PREDEFINED:
	                for counter in range(1, 11):
	                    name = dist + "-" + method.name + "-s" + str(counter) + "-" + threshold + "-" + problem

	                    print('prepare compression', name)
	                    prepare_compression1(name)
	            else:
	                for irr_feat in irr_features_range:
	                    name = dist + "-" + method.name + "-" + threshold + "-" + problem + (
	                        "" if irr_feat == 0 else "-" + str(irr_feat))

	                    print('prepare compression', name)
	                    prepare_compression1(name)


	def parse_runtimes(name):
	    """Extract the runtimes recorded in the log file ``name``.

	    Lines starting with ``subspace mining runtime:`` or ``full runtime:``
	    contribute the float found before `` seconds``.  A ``full runtime:``
	    line seen before any other value is preceded by a ``0``.

	    Returns the values, ``[0, 0]`` if there are none, or ``None`` if the
	    file does not exist.
	    """
	    try:
	        runtimes = []
	        with open(name, "r") as f:
	            for line in f:
	                if line.startswith("subspace mining runtime:"):
	                    runtimes.append(float(re.search("(?:subspace mining runtime:) (.*)(?: seconds)", line).group(1)))
	                if line.startswith("full runtime:"):
	                    if not runtimes:
	                        runtimes.append(0)
	                    runtimes.append(float(re.search("(?:full runtime:) (.*)(?: seconds)", line).group(1)))
	        if not runtimes:
	            return [0, 0]
	        return runtimes
	    except FileNotFoundError:
	        return None


	def compute_precision_recall_runtime(ideal_cuts, directory, name):
	    """Build the runtime row and the per-dimension precision/recall rows of a run.

	    Returns ``None`` when ``parse_cuts`` yields nothing for the run,
	    otherwise ``(runtime_values, values)``.  A missing log file
	    (``parse_runtimes`` returns ``None``) is reported as two ``None``
	    runtimes instead of failing on ``extend(None)``.
	    """
	    data_dir = name.replace("-", "_")
	    cuts = parse_cuts(directory + "/" + data_dir + ".csv/cut.txt")
	    if not cuts:
	        return None

	    runtimes = parse_runtimes(directory + "/" + data_dir + ".csv/log.txt")
	    if runtimes is None:
	        runtimes = [None, None]
	    runtime_values = [name]
	    runtime_values.extend(runtimes)
	    values = []
	    for i in range(MAX_DIM_COUNT):
	        if len(ideal_cuts) <= i:
	            break
	        values.append(
	            [name + "-dim" + str(i + 1), disc_precision(ideal_cuts[i], cuts[i]), disc_recall(ideal_cuts[i], cuts[i])])
	    return runtime_values, values


	def disc_precision(expected, current):
	    """Return the summed similarity divided by ``len(current) + 1``."""
	    similarity = disc_similarity(expected, current)
	    return similarity[0] / (len(current) + 1)


	def disc_recall(expected, current):
	    """Return the summed similarity divided by ``len(expected) + 1``."""
	    similarity = disc_similarity(expected, current)
	    return similarity[0] / (len(expected) + 1)


	def disc_f1(expected, current):
	    """Return the harmonic mean of precision and recall of ``current``.

	    When precision and recall are both zero the result is ``0.0`` instead
	    of a ``ZeroDivisionError``.
	    """
	    similarity = disc_similarity(expected, current)
	    recall = similarity[0] / (len(expected) + 1)
	    precision = similarity[0] / (len(current) + 1)
	    if precision + recall == 0:
	        return 0.0
	    return (2 * precision * recall) / (precision + recall)


	# Script entry point: only the compression run is active.
	if __name__ == '__main__':
	    # compression and classification quality measures
	    run_compression()
	# if len(sys.argv) == 1:
	# print(
	# 'Usage: discretization_quality_measure.py '
	# '-p=<problem> '
	# '-m=<[original\|greedy_topk\|trivial\|...]> '
	# '-cor=<[uds]> '
	# '-dist=<[id, cjs]> '
	# '-t=<threshold float> '
	# '-r=<number of rows> ')
	# command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID'
	# print('Running default: ', command)
	# command_list = command.split(' ')
	# else:
	# command_list = sys.argv[1:]
	#
	# problem_arg = list(filter(lambda x: x.startswith("-p="), command_list))
	# # if not problem_arg:
	# # raise ValueError('No problem provided!')
	# base_dir_arg = list(filter(lambda x: x.startswith("-b="), command_list))
	# if not base_dir_arg:
	# raise ValueError('No logs base dir provided!')
	# method_arg = list(filter(lambda x: x.startswith("-m="), command_list))
	# # if not method_arg:
	# # raise ValueError('No method provided!')
	# distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list))
	# # if not distance_measure_arg:
	# # raise ValueError('No distance measure provided!')
	# threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list))
	# # if not threshold_arg:
	# # raise ValueError('No threshold provided!')
	# # irr_feat_start_arg = list(filter(lambda x: x.startswith("-is="), command_list))
	# # irr_feat_end_arg = list(filter(lambda x: x.startswith("-ie="), command_list))
	#
	# base_dir = base_dir_arg[0].replace('-b=', '')
	# if not os.path.exists(base_dir):
	# os.makedirs(base_dir)
	# if problem_arg:
	# problem = problem_arg[0].replace('-p=', '')
	# if method_arg:
	# method = cst.Method[method_arg[0].replace('-m=', '').upper()]
	# if distance_measure_arg:
	# distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()]
	# if threshold_arg:
	# threshold = float(threshold_arg[0].replace('-t=', ''))
	#
	# problems = [
	# # "2d_3_cubes_aligned_xor",
	# # "2d_2_cubes_aligned",
	# # "2d_2_cubes_xor",
	# # "3d_2_cubes_aligned",
	# # "3d_2_cubes_xor",
	# # "3d_3_cubes_aligned",
	# # "3d_3_cubes_aligned_xor",
	# # "3d_3_cubes_xor",
	# # "3d_4_cubes_1_aligned_xor",
	# # "3d_4_cubes_2_aligned",
	# # "3d_4_cubes_xor",
	# # "4d_2_cubes_aligned",
	# # "4d_3_cubes_aligned_xor",
	# # "4d_3_cubes_xor",
	# # "4d_4_cubes_aligned_xor",
	# # "4d_4_cubes_2_aligned",
	# "4d_4_cubes_xor",
	# ]
	#
	# runtime = []
	# perf = []
	# compression = []
	#
	# cols = ['run-dim', 'precision', 'recall']
	# runtime_cols = ['run', 'subspace mining runtime', 'full runtime']
	# compression_cols = ['run', 'start compression', 'result compression']
	#
	# disc_distances = []
	# for problem in problems:
	# print('problem:', problem)
	#
	# for method in [cst.Method.TRIVIAL, cst.Method.ORIGINAL, cst.Method.PREDEFINED]:
	# # for method in [cst.Method.PERFECT]:
	# print('method:', method)
	# data = compute_problem_quality_measure(base_dir, problem, method=method)
	# if not data:
	# continue
	# runtime.extend(data[0])
	# perf.extend(data[1])
	# compression.extend(data[2])
	# time = util.now()
	# pd.DataFrame(perf, columns=cols).to_csv(base_dir + "/Precision_recall_" + time + ".csv")
	# pd.DataFrame(runtime, columns=runtime_cols).to_csv(base_dir + "/Discretization_runtimes_" + time + ".csv")
	# pd.DataFrame(compression, columns=compression_cols).to_csv(base_dir + "/Compression_" + time + ".csv")