Skip to content
Permalink
f06394675f
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
352 lines (301 sloc) 12.5 KB
import pandas as pd
import os
import sys
import constants as cst
import re
# Sentinel value that disc_similarity() inserts at index 0 of both the expected
# and the produced cut lists before matching them.
# NOTE(review): presumably lower than every real cut value -- confirm against the data.
global_min = -2
# Upper bound on the number of dimensions compute_measures() evaluates per run.
MAX_DIM_COUNT = 4
def parse_cuts(name):
    """Read per-dimension cut points from a cut file.

    The file is processed line by line: lines starting with ``dimension`` are
    ignored, a line starting with ``---`` closes the current dimension, and
    every other non-blank line is parsed as a single float cut point.

    Args:
        name: Path of the cut file.

    Returns:
        A list holding one list of floats per ``---``-terminated dimension,
        or ``None`` when the file does not exist. Values that appear after
        the last ``---`` line are not part of the result.

    Raises:
        ValueError: If a non-blank data line cannot be parsed as a float.
    """
    try:
        with open(name, "r") as f:
            cuts = []
            cut = []
            for line in f:
                if line.startswith("dimension"):
                    continue
                if line.startswith("---"):
                    cuts.append(cut)
                    cut = []
                    continue
                value = line.strip()
                if not value:
                    # A blank line (e.g. a trailing empty line) carries no cut
                    # point; float("") would raise ValueError.
                    continue
                cut.append(float(value))
            return cuts
    except FileNotFoundError:
        return None
def _find_min_dist_cut(cut, cuts, start_id=0):
min_dist = float('Inf')
min_cut_id = None
for i, c in enumerate(cuts[start_id:], start=start_id):
if i > 0 and cuts[i - 1] > c:
raise ValueError("cuts" + str(cuts) + " is not ordered!")
temp_dist = abs(c - cut)
if min_dist > temp_dist:
min_dist = temp_dist
min_cut_id = i
# elif min_cut_id:
# break
return min_dist, min_cut_id
def _find_max_sim_cut(cut, cuts, start_id=0):
max_sim = -float("Inf")
max_sim_cut_id = None
for i, c in enumerate(cuts[start_id:], start=start_id):
if i > 0 and cuts[i - 1] > c:
raise ValueError("cuts" + str(cuts) + " is not ordered!")
if c == cut:
temp_sim = 1
else:
if c > cut:
temp_sim = ((c - cuts[i - 1]) / 2 - (c - cut)) / ((c - cuts[i - 1]) / 2)
else:
# if (cuts[i + 1] - c) / 2 > (cut - c):
temp_sim = ((cuts[i + 1] - c) / 2 - (cut - c)) / ((cuts[i + 1] - c) / 2)
if max_sim < temp_sim:
max_sim = temp_sim
max_sim_cut_id = i
elif max_sim_cut_id is not None and cut < c:
break
return max_sim, max_sim_cut_id
def disc_similarity(expected_cuts, cuts):
    """Accumulate the similarity between produced and expected cuts.

    Both lists are copied and prefixed with ``global_min``. Every produced cut
    is matched to its most similar expected cut via ``_find_max_sim_cut``,
    searching from the previously matched expected index onwards. Consecutive
    produced cuts that match the same expected cut form one group whose
    similarities are multiplied; the group products are summed.

    Args:
        expected_cuts: List of expected cut values (not modified).
        cuts: List of produced cut values (not modified).

    Returns:
        A ``(similarity, exp_match)`` tuple: the summed group similarities and
        the number of groups.

    Raises:
        ValueError: If the produced cuts (with ``global_min`` prepended) are
            not in ascending order.
    """
    produced = [global_min] + cuts
    reference = [global_min] + expected_cuts

    total = 0
    groups = 0
    group_sim, group_id = _find_max_sim_cut(produced[0], reference)

    for previous, cut in zip(produced, produced[1:]):
        if previous > cut:
            raise ValueError("cuts" + str(produced) + " is not ordered!")
        sim, matched_id = _find_max_sim_cut(cut, reference, group_id)
        if matched_id == group_id:
            # Same expected cut as before: fold into the running group product.
            group_sim *= sim
        else:
            # A new expected cut was matched: close the previous group.
            total += group_sim
            groups += 1
            group_sim = sim
        group_id = matched_id

    # Close the final group.
    total += group_sim
    groups += 1
    return total, groups
def disc_distance(expected_cuts, cuts):
    """Sum the distances from each produced cut to its nearest expected cut.

    Each cut is matched with ``_find_min_dist_cut``, searching from the
    previously matched expected index onwards. Distances of consecutive cuts
    that match the same expected cut are added into one group before being
    added to the total. The ``2 ** exponent`` weight applied to each group is
    always 1 because the exponent is never incremented.

    Args:
        expected_cuts: Sequence of expected cut values.
        cuts: Non-empty sequence of produced cut values in ascending order.

    Returns:
        The accumulated distance.

    Raises:
        ValueError: If ``cuts`` is not in ascending order.
    """
    total = 0
    exponent = 0
    group_distance, group_id = _find_min_dist_cut(cuts[0], expected_cuts)

    for previous, cut in zip(cuts, cuts[1:]):
        if previous > cut:
            raise ValueError("cuts" + str(cuts) + " is not ordered!")
        nearest_distance, nearest_id = _find_min_dist_cut(cut, expected_cuts, group_id)
        if nearest_id == group_id:
            group_distance += nearest_distance
        else:
            total += group_distance * 2 ** exponent
            group_distance = nearest_distance
            exponent = 0
        group_id = nearest_id

    total += group_distance * 2 ** exponent
    return total
def compute_problem_quality_measure(directory,
                                    problem,
                                    method,
                                    distances=('ID', 'CJS'),
                                    threshold_range=(0.3, 0.5, 0.8),
                                    irr_features_range=range(11)):
    """Collect runtime and precision/recall rows for one problem and method.

    The ideal cuts are read from ``ideal_disc/cut_<problem>.txt``. Run names
    are then built per method and handed to ``compute_measures``:

    * ``TRIVIAL``: a single run named ``TRIVIAL-<problem>``.
    * ``PREDEFINED``: for each distance and threshold, runs ``s1`` .. ``s10``,
      stopping at the first run without results.
    * ``ORIGINAL``: for each distance and threshold, one run per entry of
      ``irr_features_range`` (no suffix for 0); runs without results are
      skipped.

    Args:
        directory: Base directory containing the run outputs.
        problem: Problem name used in file and run names.
        method: A ``cst.Method`` member.
        distances: Distance measure names used in run names.
        threshold_range: Thresholds used in run names.
        irr_features_range: Suffix values for ``ORIGINAL`` runs.

    Returns:
        ``(runtime_rows, value_rows)``; for ``TRIVIAL`` this is ``None`` when
        the run produced no results.
    """
    ideal_cuts = parse_cuts("ideal_disc/cut_" + problem + ".txt")

    if method == cst.Method.TRIVIAL:
        measured = compute_measures(ideal_cuts, directory, "TRIVIAL-" + problem)
        if not measured:
            return None
        return [measured[0]], measured[1]

    runtime_values = []
    values = []
    for dist in distances:
        for raw_threshold in threshold_range:
            threshold = str(raw_threshold)
            if method == cst.Method.PREDEFINED:
                for step in range(1, 11):
                    name = "-".join([dist, method.name, "s" + str(step), threshold, problem])
                    measured = compute_measures(ideal_cuts, directory, name)
                    if not measured:
                        # Stop at the first step without results.
                        break
                    runtime_values.append(measured[0])
                    values.extend(measured[1])
            elif method == cst.Method.ORIGINAL:
                for irr_feat in irr_features_range:
                    suffix = "" if irr_feat == 0 else "-" + str(irr_feat)
                    name = "-".join([dist, method.name, threshold, problem]) + suffix
                    measured = compute_measures(ideal_cuts, directory, name)
                    if measured:
                        runtime_values.append(measured[0])
                        values.extend(measured[1])
    return runtime_values, values
def parse_runtimes(name):
    """Extract runtime figures from a log file.

    Every line starting with ``subspace mining runtime:`` or
    ``full runtime:`` contributes the number found before `` seconds``. When
    a ``full runtime:`` line is met before any value was collected, a ``0``
    placeholder is inserted first.

    Args:
        name: Path of the log file.

    Returns:
        The collected values in file order, ``[0, 0]`` when the log contains
        no runtime line, or ``None`` when the file does not exist.
    """
    try:
        with open(name, "r") as log:
            collected = []
            for line in log:
                if line.startswith("subspace mining runtime:"):
                    found = re.search("(?:subspace mining runtime:) (.*)(?: seconds)", line)
                    collected.append(float(found.group(1)))
                if line.startswith("full runtime:"):
                    if not collected:
                        collected.append(0)
                    found = re.search("(?:full runtime:) (.*)(?: seconds)", line)
                    collected.append(float(found.group(1)))
    except FileNotFoundError:
        return None
    return collected if collected else [0, 0]
def compute_measures(ideal_cuts, directory, name):
    """Compute runtime and precision/recall rows for a single run.

    The run's files are looked up under
    ``<directory>/<name with '-' replaced by '_'>.csv/``: ``cut.txt`` holds the
    produced cuts and ``log.txt`` the runtimes.

    Args:
        ideal_cuts: Expected cuts, one list per dimension.
        directory: Base directory containing the run outputs.
        name: Run name; also used as the row label.

    Returns:
        ``None`` when ``cut.txt`` is missing or yields no dimensions.
        Otherwise ``(runtime_row, value_rows)`` where ``runtime_row`` is
        ``[name, *runtimes]`` and ``value_rows`` has one
        ``[label, precision, recall]`` entry for each of the first
        ``MAX_DIM_COUNT`` dimensions of ``ideal_cuts``.
    """
    run_dir = directory + "/" + name.replace("-", "_") + ".csv"
    cuts = parse_cuts(run_dir + "/cut.txt")
    if not cuts:
        return None

    runtimes = parse_runtimes(run_dir + "/log.txt")
    if runtimes is None:
        # parse_runtimes returns None when log.txt does not exist; extending a
        # list with None would raise TypeError. Use the same [0, 0]
        # placeholder that parse_runtimes returns for a log with no runtimes.
        runtimes = [0, 0]
    runtime_values = [name]
    runtime_values.extend(runtimes)

    values = []
    for i, ideal in enumerate(ideal_cuts[:MAX_DIM_COUNT]):
        values.append(
            [name + "-dim" + str(i + 1), disc_precision(ideal, cuts[i]), disc_recall(ideal, cuts[i])])
    return runtime_values, values
def disc_precision(expected, current):
    """Return the summed similarity divided by ``len(current) + 1``.

    Args:
        expected: List of expected cut values.
        current: List of produced cut values.
    """
    total_similarity, _ = disc_similarity(expected, current)
    return total_similarity / (len(current) + 1)
def disc_recall(expected, current):
    """Return the summed similarity divided by ``len(expected) + 1``.

    Args:
        expected: List of expected cut values.
        current: List of produced cut values.
    """
    total_similarity, _ = disc_similarity(expected, current)
    return total_similarity / (len(expected) + 1)
def disc_f1(expected, current):
    """Return the harmonic mean of discretization precision and recall.

    Precision and recall are derived from a single ``disc_similarity`` call:
    the summed similarity divided by ``len(current) + 1`` and
    ``len(expected) + 1`` respectively.

    Args:
        expected: List of expected cut values.
        current: List of produced cut values.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        precision and recall sum to zero.
    """
    similarity = disc_similarity(expected, current)[0]
    recall = similarity / (len(expected) + 1)
    precision = similarity / (len(current) + 1)
    denominator = precision + recall
    if denominator == 0:
        # Both terms are the same similarity over positive divisors, so this
        # only happens for a zero similarity; avoid ZeroDivisionError.
        return 0.0
    return (2 * precision * recall) / denominator
if __name__ == '__main__':
    # Script driver: evaluates every configured problem with the TRIVIAL,
    # ORIGINAL and PREDEFINED methods and writes the collected rows to
    # <base_dir>/Precision_recall.csv and <base_dir>/Runtimes.csv.
    if len(sys.argv) == 1:
        print(
            'Usage: discretization_quality_measure.py '
            '-p=<problem> '
            '-m=<[original|greedy_topk|trivial|...]> '
            '-cor=<[uds]> '
            '-dist=<[id, cjs]> '
            '-t=<threshold float> '
            '-r=<number of rows> ')
        command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID'
        print('Running default: ', command)
        command_list = command.split(' ')
    else:
        command_list = sys.argv[1:]

    # Only -b is mandatory; the remaining flags are optional.
    problem_arg = [arg for arg in command_list if arg.startswith("-p=")]
    base_dir_arg = [arg for arg in command_list if arg.startswith("-b=")]
    if not base_dir_arg:
        raise ValueError('No logs base dir provided!')
    method_arg = [arg for arg in command_list if arg.startswith("-m=")]
    distance_measure_arg = [arg for arg in command_list if arg.startswith("-dist=")]
    threshold_arg = [arg for arg in command_list if arg.startswith("-t=")]

    base_dir = base_dir_arg[0].replace('-b=', '')
    os.makedirs(base_dir, exist_ok=True)

    # NOTE: these values are parsed (and thereby validated) but the evaluation
    # below iterates over the hard-coded problems and methods instead.
    if problem_arg:
        problem = problem_arg[0].replace('-p=', '')
    if method_arg:
        method = cst.Method[method_arg[0].replace('-m=', '').upper()]
    if distance_measure_arg:
        distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()]
    if threshold_arg:
        threshold = float(threshold_arg[0].replace('-t=', ''))

    problems = [
        "2d_3_cubes_aligned_xor",
        "2d_2_cubes_aligned",
        # "2d_2_cubes_xor",
        # "3d_2_cubes_aligned",
        # "3d_2_cubes_xor",
        # "3d_3_cubes_aligned",
        # "3d_3_cubes_aligned_xor",
        # "3d_3_cubes_xor",
        # "3d_4_cubes_1_aligned_xor",
        # "3d_4_cubes_2_aligned",
        # "3d_4_cubes_xor",
        # "4d_2_cubes_aligned",
        # "4d_3_cubes_aligned_xor",
        # "4d_3_cubes_xor",
        # "4d_4_cubes_aligned_xor",
        # "4d_4_cubes_2_aligned",
        # "4d_4_cubes_xor",
    ]

    # Rows are accumulated across ALL problems. Both CSV files have fixed
    # names, so resetting the lists per problem would leave only the last
    # problem's rows in the output.
    runtime = []
    perf = []
    for problem in problems:
        for method in [cst.Method.TRIVIAL, cst.Method.ORIGINAL, cst.Method.PREDEFINED]:
            data = compute_problem_quality_measure(base_dir, problem, method=method)
            if not data:
                continue
            runtime.extend(data[0])
            perf.extend(data[1])

    cols = ['run-dim', 'precision', 'recall']
    runtime_cols = ['run', 'subspace mining runtime', 'full runtime']
    pd.DataFrame(perf, columns=cols).to_csv(
        base_dir + "/Precision_recall.csv")
    pd.DataFrame(runtime, columns=runtime_cols).to_csv(
        base_dir + "/Runtimes.csv")
# print(str(compute_problem_quality_measure("2d_2_cubes_aligned", method=cst.Method.TRIVIAL)))
# print(str(compute_problem_quality_measure("2d_2_cubes_aligned", method=cst.Method.ORIGINAL,
# threshold_range=[0.8],
# distances=['ID'],
# irr_features_range=range(11))))
# expected = [10, 20, 30, 40]
# print(expected)
# # current = [0, 11.0, 12.0, 12.02, 12.03, 12.04, 12.05, 13.0, 14.0, 31.0, 32.0, 32.02, 32.03, 32.04, 32.05, 33.0,
# # 34.0, 34.05, 34.06, 40]
# current = [0, 12.0, 22.03, 31.0, 40]
# current = [30, 40]
# print(current)
# similarity = disc_similarity(expected, current)
# print("similarity:", str(similarity))
# recall = similarity[0] / (len(expected) + 1)
# print('disc recall:', str(recall))
# precision = similarity[0] / (len(current) + 1)
# print('disc precision:', str(precision))
# print("F:", (2 * precision * recall) / (precision + recall))
# print()
#
# current = [0, 12, 23, 31, 40]
# print(current)
# similarity = disc_similarity(expected, current)
# print("similarity:", str(similarity))
# recall = similarity[0] / len(expected)
# print('disc recall:', str(recall))
# precision = similarity[0] / similarity[1]
# print('disc precision:', str(precision))
# print("F:", (2 * precision * recall) / (precision + recall))
# print()