Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
added discretization quality measures
- Loading branch information
Tatiana Dembelova
committed
Sep 14, 2017
1 parent
e0d51dc
commit f063946
Showing
145 changed files
with
645,371 additions
and
107 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,352 @@ | ||
import pandas as pd | ||
import os | ||
import sys | ||
import constants as cst | ||
import re | ||
|
||
# Sentinel cut that disc_similarity() inserts at index 0 of both the expected
# and the produced cut lists before comparing them.
# NOTE(review): presumably the lower bound of the data domain - confirm.
global_min = -2
# Upper bound on the number of dimensions compute_measures() evaluates per run.
MAX_DIM_COUNT = 4
|
||
|
||
def parse_cuts(name):
    """Read per-dimension cut points from a cut file.

    Lines starting with ``dimension`` are treated as headers and ignored. A
    line starting with ``---`` closes the current group of cuts. Blank lines
    are skipped. Every other line must parse as a float.

    Args:
        name: Path of the cut file.

    Returns:
        A list holding one list of floats per ``---``-terminated group, or
        ``None`` when the file does not exist. Values that follow the last
        ``---`` separator are not part of the result.

    Raises:
        ValueError: If a non-blank data line is not a valid float.
    """
    try:
        with open(name, "r") as f:
            cuts = []
            cut = []
            for line in f:
                if line.startswith("dimension"):
                    continue
                if line.startswith("---"):
                    cuts.append(cut)
                    cut = []
                    continue
                value = line.strip()
                if not value:
                    # A blank line (e.g. a trailing newline at the end of the
                    # file) carries no cut and must not reach float().
                    continue
                cut.append(float(value))
    except FileNotFoundError:
        return None
    return cuts
|
||
|
||
def _find_min_dist_cut(cut, cuts, start_id=0):
    """Locate the entry of ``cuts`` that is closest to ``cut``.

    Only entries at index ``start_id`` and later are considered. Ties are
    resolved in favour of the lowest index.

    Args:
        cut: Value to match.
        cuts: Ascending sequence of candidate cuts.
        start_id: Index of the first candidate to examine.

    Returns:
        A ``(distance, index)`` tuple; ``(inf, None)`` when no candidate was
        examined.

    Raises:
        ValueError: If an examined candidate is smaller than its predecessor.
    """
    best_dist = float('Inf')
    best_id = None
    for idx, candidate in enumerate(cuts[start_id:], start=start_id):
        if idx > 0 and cuts[idx - 1] > candidate:
            raise ValueError("cuts" + str(cuts) + " is not ordered!")

        gap = abs(candidate - cut)
        if gap < best_dist:
            best_dist, best_id = gap, idx
    return best_dist, best_id
|
||
|
||
def _find_max_sim_cut(cut, cuts, start_id=0):
    """Locate the entry of ``cuts`` that is most similar to ``cut``.

    The similarity of ``cut`` to a reference cut ``c`` is 1 when they are
    equal. Otherwise it is ``(half - |c - cut|) / half``, where ``half`` is
    half the width of the reference interval adjacent to ``c`` on the side
    where ``cut`` lies. The score therefore drops to 0 at the middle of that
    interval and becomes negative beyond it.

    When ``c`` is the first or last reference cut and ``cut`` lies outside the
    reference range, there is no interval on that side. The interval on the
    other side of ``c`` is used as the scale instead. Previously the first
    case silently wrapped around to ``cuts[-1]`` (giving scores above 1) and
    the last case raised ``IndexError``.

    Args:
        cut: Value to match.
        cuts: Ascending sequence of reference cuts.
        start_id: Index of the first reference cut to examine.

    Returns:
        A ``(similarity, index)`` tuple; ``(-inf, None)`` when no reference
        cut was examined.

    Raises:
        ValueError: If an examined reference cut is smaller than its
            predecessor, or if ``cuts`` holds a single value different from
            ``cut`` (no interval exists to scale the similarity).
        ZeroDivisionError: If the interval used for scaling has zero width
            (two adjacent reference cuts coincide).
    """
    last_id = len(cuts) - 1
    max_sim = -float("Inf")
    max_sim_cut_id = None
    for i, c in enumerate(cuts[start_id:], start=start_id):
        if i > 0 and cuts[i - 1] > c:
            raise ValueError("cuts" + str(cuts) + " is not ordered!")

        if c == cut:
            temp_sim = 1
        else:
            has_left = i > 0
            has_right = i < last_id
            if not has_left and not has_right:
                raise ValueError(
                    "cuts" + str(cuts) + " has no interval to scale the similarity with!")
            # Prefer the interval on the side where `cut` lies; fall back to
            # the opposite one at the borders of the reference range.
            use_left = has_left if c > cut else not has_right
            width = c - cuts[i - 1] if use_left else cuts[i + 1] - c
            half = width / 2
            temp_sim = (half - abs(c - cut)) / half
        if max_sim < temp_sim:
            max_sim = temp_sim
            max_sim_cut_id = i
        elif max_sim_cut_id is not None and cut < c:
            # Reference cuts are ascending: once past `cut` without improving,
            # later candidates are not examined.
            break
    return max_sim, max_sim_cut_id
|
||
|
||
def disc_similarity(expected_cuts, cuts):
    """Accumulate the similarity between produced and expected cuts.

    Both lists are copied and ``global_min`` is inserted at the front of each
    copy, so the inputs are left unmodified. Every produced cut is matched to
    an expected cut by ``_find_max_sim_cut``; the search for each cut starts
    at the expected cut matched by the previous one. Consecutive produced cuts
    that match the same expected cut have their similarities multiplied, and
    the product of each such group is added to the total.

    Args:
        expected_cuts: Ascending list of reference cuts.
        cuts: Ascending list of produced cuts.

    Returns:
        A ``(similarity, matched)`` tuple: the summed group similarities and
        the number of groups.

    Raises:
        ValueError: If the produced cuts are not in ascending order.
    """
    produced = cuts.copy()
    produced.insert(0, global_min)
    reference = expected_cuts.copy()
    reference.insert(0, global_min)

    group_sim, matched_id = _find_max_sim_cut(produced[0], reference)
    total = 0
    groups = 0

    for previous, current in zip(produced, produced[1:]):
        if previous > current:
            raise ValueError("cuts" + str(produced) + " is not ordered!")

        sim, found_id = _find_max_sim_cut(current, reference, matched_id)
        if found_id == matched_id:
            # Same expected cut as before: fold into the running product.
            group_sim *= sim
        else:
            # A new expected cut was reached: close the previous group.
            total += group_sim
            groups += 1
            group_sim = sim
        matched_id = found_id

    # Close the group that was still open when the loop ended.
    total += group_sim
    groups += 1
    return total, groups
|
||
|
||
def disc_distance(expected_cuts, cuts):
    """Sum the distances from each produced cut to its nearest expected cut.

    Every produced cut is matched by ``_find_min_dist_cut``; the search for
    each cut starts at the expected cut matched by the previous one.
    Distances of consecutive cuts that match the same expected cut are summed
    into one group before being added to the total.

    Args:
        expected_cuts: Ascending sequence of reference cuts.
        cuts: Ascending, non-empty sequence of produced cuts.

    Returns:
        The accumulated distance.

    Raises:
        ValueError: If the produced cuts are not in ascending order.
    """
    group_dist, matched_id = _find_min_dist_cut(cuts[0], expected_cuts)
    total = 0
    # Exponent of the per-group weight 2 ** exponent. Nothing increments it,
    # so every group is currently weighted by 1.
    exponent = 0

    for previous, current in zip(cuts, cuts[1:]):
        if previous > current:
            raise ValueError("cuts" + str(cuts) + " is not ordered!")

        dist, found_id = _find_min_dist_cut(current, expected_cuts, matched_id)
        if found_id == matched_id:
            group_dist += dist
        else:
            total += group_dist * 2 ** exponent
            group_dist = dist
            exponent = 0
        matched_id = found_id

    # Add the group that was still open when the loop ended.
    total += group_dist * 2 ** exponent
    return total
|
||
|
||
def compute_problem_quality_measure(directory,
                                    problem,
                                    method,
                                    distances=('ID', 'CJS'),
                                    threshold_range=(0.3, 0.5, 0.8),
                                    irr_features_range=range(11)):
    """Collect runtime and quality rows for all runs of one problem and method.

    The ideal cuts are read from ``ideal_disc/cut_<problem>.txt``. Run names
    are assembled from the arguments and handed to ``compute_measures``:

    * ``TRIVIAL``: the single run ``TRIVIAL-<problem>``.
    * ``PREDEFINED``: for every distance and threshold, runs ``s1`` to ``s10``,
      stopping at the first run that yields no measures.
    * ``ORIGINAL``: for every distance and threshold, one run per entry of
      ``irr_features_range`` (the suffix is omitted for 0); runs without
      measures are skipped.

    Args:
        directory: Base directory containing the run folders.
        problem: Problem name.
        method: A ``cst.Method`` member.
        distances: Distance-measure names used in run names.
        threshold_range: Thresholds used in run names.
        irr_features_range: Suffix values used for ``ORIGINAL`` runs.

    Returns:
        A ``(runtime_rows, quality_rows)`` tuple. For ``TRIVIAL`` the result
        is ``None`` when the run yields no measures. Other methods produce two
        empty lists when nothing was collected.
    """
    ideal_cuts = parse_cuts("ideal_disc/cut_" + problem + ".txt")

    if method == cst.Method.TRIVIAL:
        measured = compute_measures(ideal_cuts, directory, "TRIVIAL-" + problem)
        if not measured:
            return None
        return [measured[0]], measured[1]

    runtime_rows = []
    quality_rows = []

    def collect(run_name):
        # Store the measures of one run; report whether the run had any.
        measured = compute_measures(ideal_cuts, directory, run_name)
        if not measured:
            return False
        runtime_rows.append(measured[0])
        quality_rows.extend(measured[1])
        return True

    for dist in distances:
        for raw_threshold in threshold_range:
            threshold = str(raw_threshold)
            if method == cst.Method.PREDEFINED:
                for run_no in range(1, 11):
                    run_name = "-".join(
                        [dist, method.name, "s" + str(run_no), threshold, problem])
                    if not collect(run_name):
                        break
            elif method == cst.Method.ORIGINAL:
                for irr_feat in irr_features_range:
                    parts = [dist, method.name, threshold, problem]
                    if irr_feat != 0:
                        parts.append(str(irr_feat))
                    collect("-".join(parts))
    return runtime_rows, quality_rows
|
||
|
||
def parse_runtimes(name):
    """Extract the runtimes recorded in a log file.

    Lines starting with ``subspace mining runtime:`` or ``full runtime:`` are
    parsed for the number preceding `` seconds``. When a full runtime is found
    before any other value was recorded, a 0 is stored ahead of it.

    Args:
        name: Path of the log file.

    Returns:
        The runtimes in the order they were encountered, ``[0, 0]`` when the
        log has no runtime lines, or ``None`` when the file does not exist.
    """
    subspace_pattern = "(?:subspace mining runtime:) (.*)(?: seconds)"
    full_pattern = "(?:full runtime:) (.*)(?: seconds)"
    found = []
    try:
        with open(name, "r") as log:
            for line in log:
                if line.startswith("subspace mining runtime:"):
                    found.append(float(re.search(subspace_pattern, line).group(1)))
                if line.startswith("full runtime:"):
                    if not found:
                        # Keep the full runtime in second position.
                        found.append(0)
                    found.append(float(re.search(full_pattern, line).group(1)))
    except FileNotFoundError:
        return None
    return found if found else [0, 0]
|
||
|
||
def compute_measures(ideal_cuts, directory, name):
    """Compute the runtime row and per-dimension quality rows of one run.

    The run folder is ``<directory>/<name with '-' replaced by '_'>.csv``; its
    ``cut.txt`` and ``log.txt`` are read with ``parse_cuts`` and
    ``parse_runtimes``.

    Args:
        ideal_cuts: Reference cuts, one list per dimension.
        directory: Base directory containing the run folders.
        name: Run name.

    Returns:
        ``None`` when the run has no cuts (missing or empty cut file).
        Otherwise a ``(runtime_row, quality_rows)`` tuple: ``runtime_row`` is
        ``[name, *runtimes]`` and ``quality_rows`` holds one
        ``[<name>-dim<k>, precision, recall]`` entry for each of the first
        ``MAX_DIM_COUNT`` dimensions of ``ideal_cuts``.
    """
    run_dir = directory + "/" + name.replace("-", "_") + ".csv"
    cuts = parse_cuts(run_dir + "/cut.txt")
    if not cuts:
        return None

    runtimes = parse_runtimes(run_dir + "/log.txt")
    if runtimes is None:
        # parse_runtimes() returns None for a missing log. Use the same
        # placeholder it returns for a log without runtime lines instead of
        # failing in extend() with a TypeError.
        runtimes = [0, 0]
    runtime_values = [name]
    runtime_values.extend(runtimes)

    values = []
    for i in range(MAX_DIM_COUNT):
        if len(ideal_cuts) <= i:
            # No reference cuts for this or any later dimension.
            break
        values.append([
            name + "-dim" + str(i + 1),
            disc_precision(ideal_cuts[i], cuts[i]),
            disc_recall(ideal_cuts[i], cuts[i]),
        ])
    return runtime_values, values
|
||
|
||
def disc_precision(expected, current):
    """Return the summed similarity divided by ``len(current) + 1``.

    Args:
        expected: Reference cuts.
        current: Produced cuts.
    """
    total_similarity, _ = disc_similarity(expected, current)
    produced_count = len(current) + 1
    return total_similarity / produced_count
|
||
|
||
def disc_recall(expected, current):
    """Return the summed similarity divided by ``len(expected) + 1``.

    Args:
        expected: Reference cuts.
        current: Produced cuts.
    """
    total_similarity, _ = disc_similarity(expected, current)
    reference_count = len(expected) + 1
    return total_similarity / reference_count
|
||
|
||
def disc_f1(expected, current):
    """Return the harmonic mean of discretization precision and recall.

    Precision and recall are the summed similarity from ``disc_similarity``
    divided by ``len(current) + 1`` and ``len(expected) + 1`` respectively.

    Args:
        expected: Reference cuts.
        current: Produced cuts.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        precision and recall sum to zero.
    """
    similarity = disc_similarity(expected, current)[0]
    recall = similarity / (len(expected) + 1)
    precision = similarity / (len(current) + 1)
    denominator = precision + recall
    if denominator == 0:
        # The harmonic mean is undefined here; report the worst score rather
        # than raising ZeroDivisionError.
        return 0.0
    return (2 * precision * recall) / denominator
|
||
|
||
if __name__ == '__main__':
    # Script entry point: evaluates every listed problem with every method and
    # writes the collected rows to two CSV files in the logs base directory.
    if len(sys.argv) == 1:
        print(
            'Usage: discretization_quality_measure.py '
            '-p=<problem> '
            '-m=<[original|greedy_topk|trivial|...]> '
            '-cor=<[uds]> '
            '-dist=<[id, cjs]> '
            '-t=<threshold float> '
            '-r=<number of rows> ')
        command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID'
        print('Running default: ', command)
        command_list = command.split(' ')
    else:
        command_list = sys.argv[1:]

    def _first_value(prefix):
        """Return the text after ``prefix`` in the first matching argument, else None."""
        for arg in command_list:
            if arg.startswith(prefix):
                # Strip only the leading flag so values containing the flag
                # text are kept intact.
                return arg[len(prefix):]
        return None

    base_dir = _first_value("-b=")
    if base_dir is None:
        raise ValueError('No logs base dir provided!')
    os.makedirs(base_dir, exist_ok=True)

    # The remaining options are optional. They are parsed, so malformed values
    # are still reported, but the evaluation below does not use them: it
    # iterates over the fixed problem and method lists.
    requested_problem = _first_value("-p=")
    method_arg = _first_value("-m=")
    if method_arg is not None:
        requested_method = cst.Method[method_arg.upper()]
    distance_measure_arg = _first_value("-dist=")
    if distance_measure_arg is not None:
        requested_distance_measure = cst.DistanceMeasure[distance_measure_arg.upper()]
    threshold_arg = _first_value("-t=")
    if threshold_arg is not None:
        requested_threshold = float(threshold_arg)

    problems = [
        "2d_3_cubes_aligned_xor",
        "2d_2_cubes_aligned",
        # "2d_2_cubes_xor",
        # "3d_2_cubes_aligned",
        # "3d_2_cubes_xor",
        # "3d_3_cubes_aligned",
        # "3d_3_cubes_aligned_xor",
        # "3d_3_cubes_xor",
        # "3d_4_cubes_1_aligned_xor",
        # "3d_4_cubes_2_aligned",
        # "3d_4_cubes_xor",
        # "4d_2_cubes_aligned",
        # "4d_3_cubes_aligned_xor",
        # "4d_3_cubes_xor",
        # "4d_4_cubes_aligned_xor",
        # "4d_4_cubes_2_aligned",
        # "4d_4_cubes_xor",
    ]

    cols = ['run-dim', 'precision', 'recall']
    runtime_cols = ['run', 'subspace mining runtime', 'full runtime']

    # Accumulate rows over ALL problems. Resetting these lists per problem
    # while writing to fixed file names kept only the last problem's results.
    runtime = []
    perf = []
    for problem in problems:
        for method in [cst.Method.TRIVIAL, cst.Method.ORIGINAL, cst.Method.PREDEFINED]:
            data = compute_problem_quality_measure(base_dir, problem, method=method)
            if not data:
                continue
            runtime.extend(data[0])
            perf.extend(data[1])

    pd.DataFrame(perf, columns=cols).to_csv(
        base_dir + "/Precision_recall.csv")
    pd.DataFrame(runtime, columns=runtime_cols).to_csv(
        base_dir + "/Runtimes.csv")
# print(str(compute_problem_quality_measure("2d_2_cubes_aligned", method=cst.Method.TRIVIAL))) | ||
# print(str(compute_problem_quality_measure("2d_2_cubes_aligned", method=cst.Method.ORIGINAL, | ||
# threshold_range=[0.8], | ||
# distances=['ID'], | ||
# irr_features_range=range(11)))) | ||
|
||
# expected = [10, 20, 30, 40] | ||
# print(expected) | ||
|
||
# # current = [0, 11.0, 12.0, 12.02, 12.03, 12.04, 12.05, 13.0, 14.0, 31.0, 32.0, 32.02, 32.03, 32.04, 32.05, 33.0, | ||
# # 34.0, 34.05, 34.06, 40] | ||
# current = [0, 12.0, 22.03, 31.0, 40] | ||
# current = [30, 40] | ||
# print(current) | ||
# similarity = disc_similarity(expected, current) | ||
# print("similarity:", str(similarity)) | ||
# recall = similarity[0] / (len(expected) + 1) | ||
# print('disc recall:', str(recall)) | ||
# precision = similarity[0] / (len(current) + 1) | ||
# print('disc precision:', str(precision)) | ||
# print("F:", (2 * precision * recall) / (precision + recall)) | ||
# print() | ||
# | ||
# current = [0, 12, 23, 31, 40] | ||
# print(current) | ||
# similarity = disc_similarity(expected, current) | ||
# print("similarity:", str(similarity)) | ||
# recall = similarity[0] / len(expected) | ||
# print('disc recall:', str(recall)) | ||
# precision = similarity[0] / similarity[1] | ||
# print('disc precision:', str(precision)) | ||
# print("F:", (2 * precision * recall) / (precision + recall)) | ||
# print() |
Oops, something went wrong.