Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
added discretization quality measures
  • Loading branch information
Tatiana Dembelova committed Sep 14, 2017
1 parent e0d51dc commit f063946
Show file tree
Hide file tree
Showing 145 changed files with 645,371 additions and 107 deletions.
10 changes: 7 additions & 3 deletions commands.txt
Expand Up @@ -2,9 +2,11 @@ ssh tdembelo@contact.mmci.uni-saarland.de

ssh tdembelo@push.mmci.uni-saarland.de

rsync -av --exclude '.idea/' --exclude '.git' --exclude='data/' --exclude='logs3/' /Users/tatyanadembelova/Documents/study/thesis/ipd_extended/ tdembelo@badr.mpi-inf.mpg.de:/home/tdembelo/ipd_extended/
rsync -av --exclude '.idea/' --exclude '.git' --exclude='logs*' /Users/tatyanadembelova/Documents/study/thesis/ipd_extended/ tdembelo@badr.mpi-inf.mpg.de:/home/tdembelo/ipd_extended/

rsync -av tdembelo@badr.mpi-inf.mpg.de:/home/tdembelo/ipd_extended/logs4/ /Users/tatyanadembelova/Documents/study/thesis/ipd_extended/logs4/
rsync -av tdembelo@badr.mpi-inf.mpg.de:/home/tdembelo/ipd_extended/logs_trivial/ /Users/tatyanadembelova/Documents/study/thesis/ipd_extended/logs_trivial/

rsync -av --include='*20170904_*0.5*/' --exclude='*/' tdembelo@badr.mpi-inf.mpg.de:/home/tdembelo/ipd_extended/logs4/ /Users/tatyanadembelova/Documents/study/thesis/ipd_extended/logs4/

source ipd_ext/bin/activate

Expand Down Expand Up @@ -35,4 +37,6 @@ for pid in $(ps aux | grep 'CJS' | grep -v '\-m=' | grep -v grep | grep -v USER
for pid in $(ps aux | grep 'python' | grep -v grep | grep -v USER | awk '{print $2}'); do kill -9 $pid; done

# renaming a scope of log directories - removing a date part
for f in *CJS*.csv; do mv $f "${f/*_*_C/C}" ;done
for f in *_*_T*.csv; do mv $f "${f/*_*_T/T}" ;done
for f in *_*_CJS*.csv; do mv $f "${f/*_*_C/C}" ;done
for f in *_*_CJS*.csv; do echo mv $f "${f/*_*_C/C}" ;done
2 changes: 2 additions & 0 deletions constants.py
@@ -1,12 +1,14 @@
from enum import Enum

class Method(Enum):
    """Identifiers of the methods whose runs are distinguished in this project.

    In discretization_quality_measure.py a member is looked up by the
    upper-cased value of the ``-m=`` command-line argument, and ``member.name``
    is embedded in the names of the log directories that are evaluated.
    """
    # The integer values are consecutive ids.
    # NOTE(review): whether the numeric values are persisted anywhere is not
    # visible here -- confirm before renumbering.
    TRIVIAL = 0
    ORIGINAL = 1
    GREEDY_TOPK = 2
    HET_GREEDY_TOPK = 3
    BEST_FIRST = 4
    BEAM_SEARCH = 5
    HET_BEAM_SEARCH = 6
    PREDEFINED = 7


class CorrelationMeasure(Enum):
Expand Down
2 changes: 1 addition & 1 deletion data_generation.py
Expand Up @@ -186,7 +186,7 @@ def generate():
"4d_4_cubes_aligned_xor.csv",
"4d_4_cubes_xor.csv",
]:
for i in [1,2,3]:
for i in [4,5,6,7,8,9,10]:
# file = 'synthetic_cases/synthetic_cube_in_cube_10.csv'
source = 'synthetic_cases/cubes/' + source_name
file = 'synthetic_cases/cubes/' + source_name.replace(".csv", "") + '_' + str(i) + '.csv'
Expand Down
352 changes: 352 additions & 0 deletions discretization_quality_measure.py
@@ -0,0 +1,352 @@
import pandas as pd
import os
import sys
import constants as cst
import re

# Sentinel cut that disc_similarity inserts at position 0 of both the expected
# and the produced cut list before comparing them.
# NOTE(review): presumably the lower bound of the data domain -- confirm.
global_min = -2
# Upper bound on the number of dimensions compute_measures evaluates per run.
MAX_DIM_COUNT = 4


def parse_cuts(name):
    """Read the per-dimension cut points stored in a cut file.

    The file is processed line by line:

    * lines starting with ``dimension`` are headers and are skipped,
    * a line starting with ``---`` closes the current dimension,
    * blank lines are ignored,
    * every other line is parsed as one float cut point.

    A last dimension that is not closed by a ``---`` line is still returned
    instead of being silently dropped.

    :param name: path of the cut file
    :return: list holding one list of floats per dimension, or ``None`` when
        the file does not exist
    :raises ValueError: if a data line cannot be parsed as a float
    """
    try:
        with open(name, "r") as f:
            cuts = []
            cut = []
            for raw_line in f:
                line = raw_line.strip()
                # Blank lines (e.g. a trailing newline) carry no information;
                # feeding them to float() would raise ValueError.
                if not line or line.startswith("dimension"):
                    continue
                if line.startswith("---"):
                    cuts.append(cut)
                    cut = []
                    continue
                cut.append(float(line))
            # Keep the cuts of a final dimension that has no closing separator.
            if cut:
                cuts.append(cut)
            return cuts
    except FileNotFoundError:
        return None


def _find_min_dist_cut(cut, cuts, start_id=0):
min_dist = float('Inf')
min_cut_id = None
for i, c in enumerate(cuts[start_id:], start=start_id):
if i > 0 and cuts[i - 1] > c:
raise ValueError("cuts" + str(cuts) + " is not ordered!")

temp_dist = abs(c - cut)
if min_dist > temp_dist:
min_dist = temp_dist
min_cut_id = i
# elif min_cut_id:
# break
return min_dist, min_cut_id


def _find_max_sim_cut(cut, cuts, start_id=0):
max_sim = -float("Inf")
max_sim_cut_id = None
for i, c in enumerate(cuts[start_id:], start=start_id):
if i > 0 and cuts[i - 1] > c:
raise ValueError("cuts" + str(cuts) + " is not ordered!")

if c == cut:
temp_sim = 1
else:
if c > cut:
temp_sim = ((c - cuts[i - 1]) / 2 - (c - cut)) / ((c - cuts[i - 1]) / 2)
else:
# if (cuts[i + 1] - c) / 2 > (cut - c):
temp_sim = ((cuts[i + 1] - c) / 2 - (cut - c)) / ((cuts[i + 1] - c) / 2)
if max_sim < temp_sim:
max_sim = temp_sim
max_sim_cut_id = i
elif max_sim_cut_id is not None and cut < c:
break
return max_sim, max_sim_cut_id


def disc_similarity(expected_cuts, cuts):
    """Accumulate the similarity between produced cuts and expected cuts.

    Both lists are copied and prefixed with ``global_min``. Every produced cut
    is then matched to its most similar expected cut, searching from the
    previously matched expected cut onwards. Consecutive produced cuts matched
    to the same expected cut form a group whose similarities are multiplied;
    the group products are summed up.

    :param expected_cuts: ascending list of expected cuts (not modified)
    :param cuts: ascending list of produced cuts (not modified)
    :return: ``(similarity, group_count)`` -- the summed group similarities
        and the number of groups
    :raises ValueError: if ``cuts`` is not in ascending order
    """
    produced = cuts.copy()
    produced.insert(0, global_min)
    reference = expected_cuts.copy()
    reference.insert(0, global_min)

    # The first produced cut opens the first group.
    group_sim, matched_id = _find_max_sim_cut(produced[0], reference)
    total = 0
    closed_groups = 0

    for previous, current in zip(produced, produced[1:]):
        if previous > current:
            raise ValueError("cuts" + str(produced) + " is not ordered!")

        sim, ref_id = _find_max_sim_cut(current, reference, matched_id)
        if ref_id == matched_id:
            # same expected cut again: extend the running group
            group_sim *= sim
        else:
            # a different expected cut: close the running group, open a new one
            total += group_sim
            closed_groups += 1
            group_sim = sim
        matched_id = ref_id

    # Close the group that is still open after the last produced cut.
    total += group_sim
    return total, closed_groups + 1


def disc_distance(expected_cuts, cuts):
    """Sum the distances between produced cuts and their closest expected cuts.

    Every produced cut is matched to the closest expected cut, searching from
    the previously matched expected cut onwards, and the distances of all
    matches are added up.

    :param expected_cuts: ascending list of expected cuts
    :param cuts: ascending, non-empty list of produced cuts
    :return: the accumulated distance
    :raises ValueError: if ``cuts`` is not in ascending order
    """
    group_dist, matched_id = _find_min_dist_cut(cuts[0], expected_cuts)
    total = 0

    for previous, current in zip(cuts, cuts[1:]):
        if previous > current:
            raise ValueError("cuts" + str(cuts) + " is not ordered!")

        dist, ref_id = _find_min_dist_cut(current, expected_cuts, matched_id)
        if ref_id == matched_id:
            # same expected cut again: keep adding to the running group
            group_dist += dist
        else:
            # a different expected cut: flush the running group
            total += group_dist
            group_dist = dist
        matched_id = ref_id

    # Flush the group that is still open after the last produced cut.
    total += group_dist
    return total


def compute_problem_quality_measure(directory,
                                    problem,
                                    method,
                                    distances=('ID', 'CJS'),
                                    threshold_range=(0.3, 0.5, 0.8),
                                    irr_features_range=range(11)):
    """Collect runtime and precision/recall rows for all runs of one problem.

    The expected cuts are read from ``ideal_disc/cut_<problem>.txt``. Which
    runs are evaluated depends on ``method``:

    * ``TRIVIAL``: the single run ``TRIVIAL-<problem>``;
    * ``PREDEFINED``: for each distance and threshold the runs ``s1`` .. ``s10``,
      stopping at the first run without results;
    * ``ORIGINAL``: for each distance and threshold one run per entry of
      ``irr_features_range`` (entry 0 has no name suffix); runs without
      results are skipped;
    * any other method: no run.

    :param directory: base directory holding the run directories
    :param problem: problem name
    :param method: member of ``cst.Method``
    :param distances: distance measure names used in the run names
    :param threshold_range: thresholds used in the run names
    :param irr_features_range: suffix numbers used for ``ORIGINAL`` runs
    :return: ``(runtime_rows, quality_rows)``; for ``TRIVIAL`` ``None`` when
        the run has no results
    """
    ideal = parse_cuts("ideal_disc/cut_" + problem + ".txt")

    if method == cst.Method.TRIVIAL:
        measured = compute_measures(ideal, directory, "TRIVIAL-" + problem)
        if not measured:
            return None
        return [measured[0]], measured[1]

    runtime_rows = []
    quality_rows = []

    def collect(run_name):
        # Evaluate one run; store its rows and report whether it had results.
        measured = compute_measures(ideal, directory, run_name)
        if not measured:
            return False
        runtime_rows.append(measured[0])
        quality_rows.extend(measured[1])
        return True

    for dist in distances:
        for raw_threshold in threshold_range:
            threshold = str(raw_threshold)
            if method == cst.Method.PREDEFINED:
                for step in range(1, 11):
                    run_name = "-".join((dist, method.name, "s" + str(step), threshold, problem))
                    if not collect(run_name):
                        break
            elif method == cst.Method.ORIGINAL:
                for irr_feat in irr_features_range:
                    suffix = "" if irr_feat == 0 else "-" + str(irr_feat)
                    collect("-".join((dist, method.name, threshold, problem)) + suffix)

    return runtime_rows, quality_rows


def parse_runtimes(name):
    """Extract the runtimes reported in a log file.

    Every line starting with ``subspace mining runtime:`` or ``full runtime:``
    contributes the float found between that prefix and `` seconds``. When a
    ``full runtime:`` line is met before any other runtime was collected, a 0
    is recorded first.

    :param name: path of the log file
    :return: list of the collected runtimes in file order, ``[0, 0]`` when the
        log reports no runtime, or ``None`` when the file does not exist
    """
    collected = []
    try:
        with open(name, "r") as log:
            for line in log:
                if line.startswith("subspace mining runtime:"):
                    found = re.search("(?:subspace mining runtime:) (.*)(?: seconds)", line)
                    collected.append(float(found.group(1)))
                elif line.startswith("full runtime:"):
                    if not collected:
                        # no subspace mining runtime was reported before
                        collected.append(0)
                    found = re.search("(?:full runtime:) (.*)(?: seconds)", line)
                    collected.append(float(found.group(1)))
    except FileNotFoundError:
        return None
    return collected if collected else [0, 0]


def compute_measures(ideal_cuts, directory, name):
    """Evaluate one run: collect its runtimes and per-dimension precision/recall.

    The run directory is ``<directory>/<name with '-' replaced by '_'>.csv``;
    its ``cut.txt`` holds the produced cuts and its ``log.txt`` the runtimes.
    At most ``MAX_DIM_COUNT`` dimensions of ``ideal_cuts`` are evaluated.

    :param ideal_cuts: list with one list of expected cuts per dimension
    :param directory: base directory holding the run directories
    :param name: run name, also used as row label
    :return: ``(runtime_row, quality_rows)`` where ``runtime_row`` is
        ``[name, *runtimes]`` and each quality row is
        ``[name + "-dim<k>", precision, recall]``; ``None`` when the run has
        no (or an empty) ``cut.txt``
    """
    run_dir = directory + "/" + name.replace("-", "_") + ".csv"
    cuts = parse_cuts(run_dir + "/cut.txt")
    if not cuts:
        return None

    runtimes = parse_runtimes(run_dir + "/log.txt")
    if runtimes is None:
        # log.txt is missing although cut.txt exists: extending the row with
        # None used to raise TypeError. Use the same [0, 0] placeholder that
        # parse_runtimes returns for a log without runtime lines.
        runtimes = [0, 0]
    runtime_values = [name]
    runtime_values.extend(runtimes)

    values = []
    for i, ideal in enumerate(ideal_cuts[:MAX_DIM_COUNT]):
        values.append(
            [name + "-dim" + str(i + 1), disc_precision(ideal, cuts[i]), disc_recall(ideal, cuts[i])])
    return runtime_values, values


def disc_precision(expected, current):
    """Return the similarity of ``current`` to ``expected`` divided by the number of produced cuts plus one.

    :param expected: ascending list of expected cuts
    :param current: ascending list of produced cuts
    """
    total_similarity, _ = disc_similarity(expected, current)
    produced_count = len(current) + 1
    return total_similarity / produced_count


def disc_recall(expected, current):
    """Return the similarity of ``current`` to ``expected`` divided by the number of expected cuts plus one.

    :param expected: ascending list of expected cuts
    :param current: ascending list of produced cuts
    """
    total_similarity, _ = disc_similarity(expected, current)
    expected_count = len(expected) + 1
    return total_similarity / expected_count


def disc_f1(expected, current):
    """Return the harmonic mean of discretization precision and recall.

    Precision and recall are the total similarity divided by the number of
    produced cuts plus one and by the number of expected cuts plus one.

    :param expected: ascending list of expected cuts
    :param current: ascending list of produced cuts
    :return: ``2 * precision * recall / (precision + recall)``, or 0.0 when
        precision and recall sum to zero
    """
    similarity = disc_similarity(expected, current)
    recall = similarity[0] / (len(expected) + 1)
    precision = similarity[0] / (len(current) + 1)
    denominator = precision + recall
    if denominator == 0:
        # A total similarity of 0 used to raise ZeroDivisionError here.
        return 0.0
    return (2 * precision * recall) / denominator


if __name__ == '__main__':
    # Script entry point: evaluates the runs of every listed problem found
    # under the logs base directory and writes two CSV files into it:
    # Precision_recall.csv (one row per run and dimension) and Runtimes.csv
    # (one row per run).
    if len(sys.argv) == 1:
        print(
            'Usage: discretization_quality_measure.py '
            '-b=<logs base dir> '
            '-p=<problem> '
            '-m=<[original|greedy_topk|trivial|...]> '
            '-cor=<[uds]> '
            '-dist=<[id, cjs]> '
            '-t=<threshold float> '
            '-r=<number of rows> ')
        command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID'
        print('Running default: ', command)
        command_list = command.split(' ')
    else:
        command_list = sys.argv[1:]

    problem_arg = list(filter(lambda x: x.startswith("-p="), command_list))
    # if not problem_arg:
    #     raise ValueError('No problem provided!')
    base_dir_arg = list(filter(lambda x: x.startswith("-b="), command_list))
    if not base_dir_arg:
        raise ValueError('No logs base dir provided!')
    method_arg = list(filter(lambda x: x.startswith("-m="), command_list))
    # if not method_arg:
    #     raise ValueError('No method provided!')
    distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list))
    # if not distance_measure_arg:
    #     raise ValueError('No distance measure provided!')
    threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list))
    # if not threshold_arg:
    #     raise ValueError('No threshold provided!')
    # irr_feat_start_arg = list(filter(lambda x: x.startswith("-is="), command_list))
    # irr_feat_end_arg = list(filter(lambda x: x.startswith("-ie="), command_list))

    base_dir = base_dir_arg[0].replace('-b=', '')
    os.makedirs(base_dir, exist_ok=True)
    # The optional arguments below are parsed (and thereby validated) but the
    # evaluation loop iterates over the fixed problem and method lists.
    if problem_arg:
        problem = problem_arg[0].replace('-p=', '')
    if method_arg:
        method = cst.Method[method_arg[0].replace('-m=', '').upper()]
    if distance_measure_arg:
        distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()]
    if threshold_arg:
        threshold = float(threshold_arg[0].replace('-t=', ''))

    problems = [
        "2d_3_cubes_aligned_xor",
        "2d_2_cubes_aligned",
        # "2d_2_cubes_xor",
        # "3d_2_cubes_aligned",
        # "3d_2_cubes_xor",
        # "3d_3_cubes_aligned",
        # "3d_3_cubes_aligned_xor",
        # "3d_3_cubes_xor",
        # "3d_4_cubes_1_aligned_xor",
        # "3d_4_cubes_2_aligned",
        # "3d_4_cubes_xor",
        # "4d_2_cubes_aligned",
        # "4d_3_cubes_aligned_xor",
        # "4d_3_cubes_xor",
        # "4d_4_cubes_aligned_xor",
        # "4d_4_cubes_2_aligned",
        # "4d_4_cubes_xor",
    ]

    # The accumulators live outside the problem loop: both output files have
    # fixed names, so resetting the lists per problem left only the rows of
    # the last problem in the CSV files.
    runtime = []
    perf = []
    for problem_name in problems:
        for eval_method in [cst.Method.TRIVIAL, cst.Method.ORIGINAL, cst.Method.PREDEFINED]:
            data = compute_problem_quality_measure(base_dir, problem_name, method=eval_method)
            if not data:
                continue
            runtime.extend(data[0])
            perf.extend(data[1])

    cols = ['run-dim', 'precision', 'recall']
    runtime_cols = ['run', 'subspace mining runtime', 'full runtime']

    pd.DataFrame(perf, columns=cols).to_csv(
        base_dir + "/Precision_recall.csv")
    pd.DataFrame(runtime, columns=runtime_cols).to_csv(
        base_dir + "/Runtimes.csv")
# print(str(compute_problem_quality_measure("2d_2_cubes_aligned", method=cst.Method.TRIVIAL)))
# print(str(compute_problem_quality_measure("2d_2_cubes_aligned", method=cst.Method.ORIGINAL,
# threshold_range=[0.8],
# distances=['ID'],
# irr_features_range=range(11))))

# expected = [10, 20, 30, 40]
# print(expected)

# # current = [0, 11.0, 12.0, 12.02, 12.03, 12.04, 12.05, 13.0, 14.0, 31.0, 32.0, 32.02, 32.03, 32.04, 32.05, 33.0,
# # 34.0, 34.05, 34.06, 40]
# current = [0, 12.0, 22.03, 31.0, 40]
# current = [30, 40]
# print(current)
# similarity = disc_similarity(expected, current)
# print("similarity:", str(similarity))
# recall = similarity[0] / (len(expected) + 1)
# print('disc recall:', str(recall))
# precision = similarity[0] / (len(current) + 1)
# print('disc precision:', str(precision))
# print("F:", (2 * precision * recall) / (precision + recall))
# print()
#
# current = [0, 12, 23, 31, 40]
# print(current)
# similarity = disc_similarity(expected, current)
# print("similarity:", str(similarity))
# recall = similarity[0] / len(expected)
# print('disc recall:', str(recall))
# precision = similarity[0] / similarity[1]
# print('disc precision:', str(precision))
# print("F:", (2 * precision * recall) / (precision + recall))
# print()

0 comments on commit f063946

Please sign in to comment.