ipd_extended/main.py — 864 lines (733 sloc), 34.6 KB.
(Captured from a GitHub file view. The page noted that this file contains
bidirectional Unicode text that may be interpreted or compiled differently
than it appears, and recommended reviewing it in an editor that reveals
hidden Unicode characters.)
import math | |
import sys | |
import time | |
# todo fix for server push | |
import matplotlib | |
import shutil | |
matplotlib.use('Agg') | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
import os | |
import re | |
import constants as cst | |
import interaction_distance as id | |
import subspace_mining as sm | |
import util | |
from correlation_measures.binning import Binning | |
import experiments_logging as el | |
from merging import dynamic_merging | |
import cjs | |
import discretization_quality_measure as dqm | |
import json | |
import random | |
import traceback | |
# ------------------------------------------------------ | |
def find_disc_macro_id(disc_macro_intervals, point):
    """Return the id of the first macro interval [low, high] containing *point*.

    disc_macro_intervals: dict mapping macro id -> [lower, upper] bounds.
    Raises ValueError when no interval covers the point.
    """
    for macro_id, (lower, upper) in disc_macro_intervals.items():
        if lower <= point <= upper:
            return macro_id
    raise ValueError("Micro bin is not covered by any of the macro intervals!", disc_macro_intervals, point)
def write(log, *args):
    """Write *args*, space-joined, as one line to *log*; no-op when log is falsy."""
    line = ' '.join(str(arg) for arg in args)
    if log:
        log.write(line)
        log.write('\n')
def plot_distances(dir, distances, disc_intervals):
    """Plot the interaction-distance curve of every dimension to <dir>distances.png.

    dir: output directory prefix (assumed to end with a path separator).
    distances: per-dimension pairs [x-values, distance-values].
    disc_intervals: per-dimension {macro_id: [left, right]} interval maps.
    """
    dim_count = len(distances)
    plt.figure(1)
    # arrange one subplot per dimension in a near-square grid
    height = int(math.sqrt(dim_count))
    width = int(math.ceil((dim_count) / height))
    fig, axes = plt.subplots(nrows=height, ncols=width, squeeze=False)
    for curr, dist in enumerate(distances):
        ID_threshold = id.compute_ID_threshold(dist[1])
        ax1 = axes[int(curr / width), int(curr % width)]
        ax1.set_ylim([0, 0.1])
        # ax1.hist(distances, bins=100, color='c')
        ax1.plot(dist[0], dist[1])
        # dashed horizontal line: the ID threshold used by the merging step
        ax1.axhline(ID_threshold, color='b', linestyle='dashed', linewidth=1)
        curr_macro_intervals = disc_intervals[curr]
        # red vertical lines: macro-bin cut points (the left edge of every
        # interval except the first)
        for macro_id in range(1, len(curr_macro_intervals)):
            ax1.axvline(curr_macro_intervals[macro_id][0], color='r')
        ax1.set_title('dimension ' + str(curr))
    plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25, wspace=0.35)
    plt.savefig(dir + 'distances.png', format='png')
def compute_distances(bin_map, curr, data, dim_maxes,
                      cor_measure, method, distance_measure,
                      k=cst.MAX_SUBSPACE_SIZE,
                      delta=cst.HETEROGENEOUS_THRESHOLD,
                      beam_width=cst.BEAM_WIDTH,
                      subspace_map=None,
                      log=None):
    """Compute the distances between adjacent micro bins of dimension *curr*.

    Returns (distances, sm_runtime): sm_runtime is the seconds spent in
    subspace mining and is 0 for all non-SM methods.
    """
    # FULL: measure against all other dimensions, no subspace selection
    if method == cst.Method.FULL:
        return (id.compute_IDs(bin_map, curr, data, dim_maxes) if distance_measure == cst.DistanceMeasure.ID
                else cjs.compute_CJSs(bin_map, curr, data, dim_maxes)), 0

    if method.name.startswith("SM"):
        # mine the subspace relevant for dimension curr with the chosen strategy
        subspace_mining_start = time.time()
        if method == cst.Method.GREEDY_TOPK:
            curr_subspace = sm.greedy_topk(data, curr, k, cor_measure)
        elif method == cst.Method.HET_GREEDY_TOPK:
            curr_subspace = sm.het_greedy_topk(data, curr, k, delta, cor_measure)
        elif method == cst.Method.BEST_FIRST:
            curr_subspace = sm.best_first(data, curr, k, cor_measure)
        elif method == cst.Method.BEAM_SEARCH:
            curr_subspace = sm.beam_search(data, curr, k, beam_width, cor_measure)
        elif method == cst.Method.HET_BEAM_SEARCH:
            curr_subspace = sm.het_beam_search(data, curr, k, beam_width, delta, cor_measure)
        else:
            raise ValueError("there is no such method!")
        subspace_mining_end = time.time()
        sm_runtime = subspace_mining_end - subspace_mining_start
        write(log, 'subspace mining runtime:', sm_runtime, 'seconds')
    else:
        # PREDEFINED_* methods: look the subspace up in the precomputed map.
        # NOTE(review): subspace_map must not be None on this path — confirm
        # that all non-FULL, non-SM callers pass a subspace_set.
        sm_runtime = 0
        if curr in subspace_map:
            curr_subspace = subspace_map[curr]
        else:
            curr_subspace = []
    # restrict data to the selected subspace before measuring distances
    data = data.copy().loc[:, curr_subspace]
    dim_maxes = dim_maxes[curr_subspace]
    return (id.compute_IDs1(bin_map, data, dim_maxes) if distance_measure == cst.DistanceMeasure.ID
            else cjs.compute_CJSs1(bin_map, data, dim_maxes)), sm_runtime
def compute_IPD(data, method=cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET, cor_measure=None,
                distance_measure=cst.DistanceMeasure.ID,
                subspace_set=None,
                log=None):
    """Run IPD discretization over every dimension of *data*.

    For each dimension: equal-frequency micro-binning, interaction-distance
    computation (restricted to a mined or predefined subspace), then optimal
    merging of micro bins into macro bins.

    Returns (disc_macro_intervals, disc_points, distancez, init_bins_count,
    runtime, full_sm_runtime).
    """
    start = time.time()
    dim_count = data.shape[1]
    # number of initial dist_bins
    # todo old remove later
    # init_bins_count = 20 # ceil in original ipd...
    init_bins_count = int(math.ceil(math.sqrt(data.shape[0])))  # ceil in original ipd...
    write(log, 'row count:', data.shape[0])
    write(log, 'init_bins_count:', init_bins_count)
    write(log, 'ID_THRESHOLD_QUANTILE:', cst.ID_THRESHOLD_QUANTILE)

    # normalization step todo old (optional)
    # todo old by default the normalization is optional as it does not influence on the results
    # norm_data = data.apply(lambda x: 2 * cst.NORMALIZATION_RADIUS * (x - x.min()) / (
    #     x.max() - x.min()) - cst.NORMALIZATION_RADIUS if x.max() != x.min() else pd.Series(-np.ones(x.shape)))
    norm_data = data

    # dimension maximums
    dim_maxes = norm_data.max(0)
    disc_macro_intervals = []
    disc_points = []
    # dim -> correlated-dims map derived from the predefined subspace set
    subspace_map = get_map_from_subspace_set(subspace_set) if subspace_set else None
    distancez = []
    # iterate over all the dimensions
    full_sm_runtime = 0
    for curr in range(dim_count):
        binning = Binning(norm_data, curr, init_bins_count)
        bin_map = binning.equal_frequency_binning_by_rank()
        dist_bins = bin_map.cat.categories

        # -----------------------------INTERACTION DISTANCES----------------------------------
        distances, sm_runtime = compute_distances(bin_map, curr, norm_data, dim_maxes, cor_measure, method,
                                                  distance_measure, subspace_map=subspace_map, log=log)
        # todo python361
        # distancez.append([[data.loc[rank_data[rank_data[curr] == math.floor(dist_bins[i].right)].index.tolist()[0], curr] for i in
        #                    range(len(distances))], distances])
        # todo python342
        # Recover the data value at each bin's right edge: the bin label is a
        # string interval, so the right bound is parsed out with a regex and
        # mapped back to the data point holding that rank.
        distancez.append([[data.loc[binning.rank_data[binning.rank_data[curr]
                                                      == math.floor(float(re.search(', (-*\d+\.*\d*e*[+-]*\d*)',
                                                                                    dist_bins[i]).group(1)))]
                           .index.tolist()[0], curr] for i in range(len(distances))], distances])
        full_sm_runtime += sm_runtime
        ID_threshold = id.compute_ID_threshold(distances)
        # todo ext compute sliding average and count ID peaks above the avg (in a sliding window)
        # ID_peaks = id.compute_sliding_count(distances, ID_threshold)
        # pd.DataFrame(distances).to_csv(prefix + "_IDs_" + str(curr) + ".csv")
        # -----------------------------OPTIMAL MERGE STRATEGY----------------------------------
        # table with costs, the value in i-th row and j-th column means cost of (i+1) micro bins merged into (j+1)
        # macro bins
        F, discretizations = dynamic_merging(ID_threshold, distances, init_bins_count)
        # pd.DataFrame(F).to_csv(prefix + "_F_" + str(curr) + ".csv")
        # pd.DataFrame([[[b[-1] for b in k[:-1]] for k in c] for c in discretizations]).to_csv(prefix + "_bp_" + str(curr) + ".csv")
        # pick the macro-bin count with the minimal merging cost
        min_id = np.argmin(F[-1])
        discretization = discretizations[-1][min_id]
        (curr_macro_intervals, curr_macro_points) = get_discretized_points(curr, data, discretization,
                                                                           dist_bins, binning.rank_data)
        # todo uncomment if detailed log is necessary
        # write(log, '-------------------------')
        # write(log, 'dimension:', curr)
        # write(log, 'ID_threshold:', ID_threshold)
        # write(log, 'cost:', F[-1, min_id])
        # write(log, 'number of macrobins:', len(curr_macro_intervals))
        #
        # # write(log, 'distances', distances)
        # write(log, '\ndistances between the macrobins:')
        # for macro_id, macro_bin in enumerate(discretization[:-1]):
        #     write(log, "{0:.2f}".format(curr_macro_intervals[macro_id][1]) + " -", distances[macro_bin[-1]], '[q=' +
        #           str((sorted(distances).index(distances[macro_bin[-1]]) + 1) / len(distances)) + ']')
        #
        # write(log, '\nnumber of points per macrobin:')
        # for macro_id in curr_macro_intervals:
        #     write(log, "[" + "{0:.2f}".format(curr_macro_intervals[macro_id][0]) + ",",
        #           "{0:.2f}".format(curr_macro_intervals[macro_id][1]) + "]",
        #           sum([1 for p in curr_macro_points if p == macro_id]))
        # write(log, '\n')
        disc_macro_intervals.append(curr_macro_intervals)
        disc_points.append(curr_macro_points)
    end = time.time()
    runtime = end - start
    write(log, 'full runtime:', runtime, 'seconds')
    # todo sm_runtime!
    return disc_macro_intervals, disc_points, distancez, init_bins_count, runtime, full_sm_runtime
def compute_trivial_discretization(data, trivial_bins_count=None, log=None):
    """Baseline discretization: equal-frequency binning, one macro bin per micro bin.

    trivial_bins_count: number of bins; defaults to ceil(sqrt(row count)),
    matching compute_IPD's initial binning.
    Returns (disc_macro_intervals, disc_points, distancez, bins_count, runtime);
    distancez is always empty here.
    """
    start = time.time()
    dim_count = data.shape[1]
    bins_count = int(math.ceil(math.sqrt(data.shape[0]))) if not trivial_bins_count else trivial_bins_count
    write(log, 'row count:', data.shape[0])
    write(log, 'bins_count:', bins_count)
    norm_data = data
    # dimension maximums
    disc_macro_intervals = []
    disc_points = []
    distancez = []
    # iterate over all the dimensions
    for curr in range(dim_count):
        binning = Binning(norm_data, curr, bins_count)
        bin_map = binning.equal_frequency_binning_by_rank()
        dist_bins = bin_map.cat.categories
        # trivial merge strategy: every micro bin becomes its own macro bin
        discretization = [[i] for i in range(bins_count)]
        (curr_macro_intervals, curr_macro_points) = get_discretized_points(curr, data, discretization,
                                                                           dist_bins, binning.rank_data)
        disc_macro_intervals.append(curr_macro_intervals)
        disc_points.append(curr_macro_points)
    end = time.time()
    runtime = end - start
    write(log, runtime, 'seconds')
    return disc_macro_intervals, disc_points, distancez, bins_count, runtime
def read_discretization(disc_file):
    """Parse an ideal-discretization cut file into per-dimension interval maps.

    File format: a "dimension <i>" header starts each dimension, followed by
    one cut point per line; a line starting with "---" closes the dimension.
    Returns a list with one dict per dimension mapping bin id -> [lower, upper].
    The first interval's lower bound is the sentinel -2 (presumably below any
    data value — TODO confirm against the cut-file generator).

    NOTE(review): a trailing dimension not terminated by "---" is silently
    dropped — confirm every cut file ends with the separator line.
    """
    disc = []
    intervals = {}
    # renamed from `bin`/`next`, which shadowed the builtins
    lower = -2  # sentinel lower bound for a dimension's first interval
    bin_id = 0
    with open(disc_file, "r") as f:
        for line in f:
            if line.startswith("---"):
                # end of the current dimension: store its intervals
                disc.append(intervals)
                continue
            if line.startswith("dimension"):
                # header: reset parser state for the next dimension
                lower = -2
                bin_id = 0
                intervals = {}
                continue
            if len(line.strip()) == 0:
                continue
            upper = float(line.strip())
            intervals[bin_id] = [lower, upper]
            bin_id += 1
            lower = upper
    return disc
def compute_perfect_discretization(problem, data, log=None):
    """Discretize *data* using the precomputed ideal cut file for *problem*.

    Reads 'ideal_disc/cut_<problem>.txt' and maps every data point of every
    listed dimension onto its ideal macro interval.
    Returns (all_intervals, disc_points, elapsed_seconds).
    """
    start = time.time()
    dim_count = data.shape[1]  # kept for parity with the other compute_* functions; unused below
    disc_points = []
    all_intervals = read_discretization('ideal_disc/cut_' + problem + ".txt")
    # map each point of each dimension onto its ideal macro interval id
    for curr, intervals in enumerate(all_intervals):
        macro_points = [find_disc_macro_id(intervals, row[1][curr]) for row in data.iterrows()]
        disc_points.append(macro_points)
    end = time.time()
    write(log, end - start, 'seconds')
    return all_intervals, disc_points, end - start
def get_discretized_points(curr, data, discretization, dist_bins, rank_data):
    """Convert a merge result into value-space intervals and per-point labels.

    curr: dimension index being discretized.
    discretization: list of macro bins, each a list of micro-bin ids.
    dist_bins: micro-bin labels (string intervals "(left, right]" — parsed
    below with regexes, todo python342 path).
    rank_data: per-dimension rank frame used to map ranks back to data rows.
    Returns (disc_macro_intervals, macro_points): {macro id: [left, right]}
    in original data values, and the macro id assigned to every data point.
    """
    disc_macro_intervals = dict()
    for i, macro_bin in enumerate(discretization):
        macro_interval = []
        for micro_bin_id in macro_bin:
            # todo python361
            # right = \
            #     data.loc[rank_data[rank_data[curr] == math.floor(dist_bins[micro_bin_id].right)][curr].index[0]][curr]
            # if not len(macro_interval):
            #     macro_interval.append(
            #         data.loc[rank_data[rank_data[curr] == math.ceil(dist_bins[micro_bin_id].left)][curr].index[0]][
            #             curr])
            # macro_interval.append(right)
            # todo python342
            # parse the micro bin's right bound out of its string label and map
            # the (floored) rank back to the actual data value
            right = \
                data.loc[rank_data[rank_data[curr] == math.floor(float(re.search(', (-*\d+\.*\d*e*[+-]*\d*)',
                                                                                 dist_bins[micro_bin_id]).group(1)))][
                    curr].index[0]][curr]
            if not len(macro_interval):
                # first micro bin of this macro bin: also parse the left bound
                macro_interval.append(
                    data.loc[rank_data[rank_data[curr] == math.ceil(float(re.search('(-*\d+\.*\d*e*[+-]*\d*),',
                                                                                    dist_bins[micro_bin_id]).group(
                        1)))][curr].index[0]][
                        curr])
                macro_interval.append(right)
            else:
                # later micro bins only push the right edge outwards
                macro_interval[1] = right
        disc_macro_intervals[i] = macro_interval
    # label every data point with the macro bin that covers its value
    macro_points = []
    for point in data.iterrows():
        macro_points.append(find_disc_macro_id(disc_macro_intervals, point[1][curr]))
    return disc_macro_intervals, macro_points
def _compute_subspaces(dims, sets): | |
count = len(dims) | |
if count > 0: | |
_compute_subspaces(dims[1:], sets) | |
elif count == 0: | |
sets.append([]) | |
return | |
new_set = [] | |
for s in sets: | |
sp = list(s) | |
sp.append(dims[0]) | |
new_set.append(sp) | |
sets.extend(new_set) | |
# # todo return list of dictionaries | |
# def compute_subspace_sets(data_file_name, method): | |
# dims_count = util.parse_relevant_features(data_file_name) | |
# if dims_count == 2: | |
# return [[0, 1]] | |
# | |
# dims = [i for i in range(dims_count)] | |
# | |
# sets = [] | |
# | |
# _compute_subspaces(dims, sets) | |
# | |
# result = [s for s in sets if len(s) > 1] | |
# | |
# return result | |
# Module-level cache of the "perfect" subspace sets, keyed by dataset name.
# Loaded once at import time from the JSON file configured in constants;
# consumed by get_ideal_subspace_set below.
ideal = None
with open(cst.PERFECT_SUBSPACES_JSON, "r") as f:
    ideal = json.load(f)
# todo return list of dictionaries
def get_ideal_subspace_set(data_file_name):
    """Return the perfect subspace set for a dataset file name, or None if unknown.

    The lookup key is the file name with its ".csv" suffix removed.
    """
    # todo naive implementation
    dataset_key = data_file_name.replace(".csv", "")
    return ideal.get(dataset_key)
def get_map_from_subspace_set(subspace_set):
    """Build a dimension -> correlated-dimensions map from a subspace set.

    Each dimension is mapped to the union of all other dimensions that share
    at least one subspace with it.

    subspace_set: iterable of subspaces, each an iterable of dimension indices.
    Returns a dict {dim: set of other dims}.
    """
    # renamed from `map`, which shadowed the builtin
    mapping = dict()
    for subspace in subspace_set:
        members = set(subspace)
        for dim in subspace:
            # setdefault replaces the explicit membership check of the original
            mapping.setdefault(dim, set())
            mapping[dim] |= members - {dim}
    return mapping
def compute_predefined_subspace_sets(rel_features, ideal_subspace_set):
    """Generate subspace sets growing randomly from minimal 2-dim cores toward the ideal.

    Starting from the first two dims of every ideal subspace, one randomly
    chosen remaining dimension is added per step; every SUBSPACE_SET_STEP-th
    intermediate set (and the last one) is recorded.
    Returns the list of subspace sets (empty when the ideal set is already
    minimal, i.e. all its subspaces have exactly 2 dims).
    """
    subspace_sets = []
    init_subset = []
    # dim -> indices of the ideal subspaces the dim still has to be added to
    dim_map = {i: [] for i in range(rel_features)}
    dims = []
    # if the ideal_subspace_set is already minimal there are no other subspace sets
    if sum([len(s) == 2 for s in ideal_subspace_set]) == len(ideal_subspace_set):
        return subspace_sets
    # NOTE(review): max_subspace_size is computed but never used in this
    # function (only its _synchronous_greedy sibling uses it) — confirm.
    max_subspace_size = 0
    for e, ideal_subspace in enumerate(ideal_subspace_set):
        if len(ideal_subspace) > max_subspace_size:
            max_subspace_size = len(ideal_subspace)
        # every subspace consists of 2 dims
        init_subset.append(ideal_subspace[:2])
        # dims left for considering
        for i in ideal_subspace[2:]:
            dim_map[i].append(e)
            dims.append(i)
    subspace_sets.append(init_subset)
    last = init_subset
    # 2 is minimal number of interacting dimensions
    for i in range(rel_features - len(ideal_subspace_set) * 2 - 1):
        d = random.choice(dims)
        dims.remove(d)
        subspace = dim_map[d][0]
        # NOTE(review): the target subspace is taken from the front of
        # dim_map[d] but pop() removes from the back — only equivalent while a
        # dim belongs to at most one remaining subspace; confirm.
        if len(dim_map[d]) > 1:
            dim_map[d].pop()
        else:
            del dim_map[d]
        subset = [ss.copy() for ss in last]
        subset[subspace].append(d)
        if i % cst.SUBSPACE_SET_STEP == 0 or i == rel_features - len(ideal_subspace_set) * 2 - 1:
            subspace_sets.append(subset)
        last = subset
    return subspace_sets
def compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subspace_set, greedy):
    """Grow all subspaces synchronously, one dimension index per step.

    greedy: when True, subspaces that have no more ideal dimensions to add are
    padded with fresh irrelevant feature indices (>= rel_features) instead of
    being left out.
    Returns the intermediate subspace sets (with the ideal set itself removed
    if it appears); empty when the ideal set is already minimal.

    NOTE(review): `last` is assigned once and never advanced inside the loop
    (its sibling compute_predefined_subspace_sets does `last = subset`), so
    step i always extends the initial 2-dim subsets with the i-th dimension
    instead of accumulating dims 2..i — confirm whether `last = subset` is
    missing at the end of the loop body.
    NOTE(review): when greedy is False, subspaces shorter than the current
    step contribute nothing to `subset`, shrinking the set — confirm intended.
    """
    subspace_sets = []
    init_subset = []
    # if the ideal_subspace_set is already minimal there are no other subspace sets
    if sum([len(s) == 2 for s in ideal_subspace_set]) == len(ideal_subspace_set):
        return subspace_sets
    max_subspace_size = 0
    for e, ideal_subspace in enumerate(ideal_subspace_set):
        if len(ideal_subspace) > max_subspace_size:
            max_subspace_size = len(ideal_subspace)
        # every subspace consists of 2 dims
        init_subset.append(ideal_subspace[:2])
    subspace_sets.append(init_subset)
    last = init_subset
    irr_counter = 0  # number of irrelevant padding dims handed out so far
    for i in range(2, max_subspace_size):
        subset = []
        for j, ss in enumerate(last):
            if len(ideal_subspace_set[j]) > i:
                subset.append(ss.copy() + [ideal_subspace_set[j][i]])
            elif greedy:
                subset.append(ss.copy() + [rel_features + irr_counter])
                irr_counter += 1
        subspace_sets.append(subset)
    if ideal_subspace_set in subspace_sets:
        subspace_sets.remove(ideal_subspace_set)
    return subspace_sets
def compute_predefined_subspace_sets_naive(rel_features):
    """Randomly partition all features (relevant + irrelevant) into chunked subspace sets.

    One subspace set is produced per chunk size in
    cst.NAIVE_CHUNKS_NUMBER_RANGE_LIST, all over the same shuffled order.
    """
    dims = list(range(rel_features + cst.IRRELEVANT_FEATURES))
    random.shuffle(dims)
    subspace_sets = []
    for chunks in cst.NAIVE_CHUNKS_NUMBER_RANGE_LIST:
        ss = list(util.chunks(dims, chunks))
        if len(ss[-1]) == 1:
            # a trailing singleton chunk is folded into its predecessor,
            # since a subspace needs at least 2 dimensions
            ss[-2].extend(ss[-1])
            del ss[-1]
        subspace_sets.append(ss)
    return subspace_sets
def compute_subspace_sets(data_file_name, method):
    """Dispatch to the subspace-set construction strategy selected by *method*.

    Returns a list of subspace sets for the dataset named *data_file_name*;
    raises ValueError for methods without an implementation here.
    """
    rel_features = util.parse_relevant_features(data_file_name)
    ideal_subspace_set = get_ideal_subspace_set(data_file_name)
    if method is cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET:
        return [ideal_subspace_set]
    if method is cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT:
        # append growing blocks of irrelevant features to every ideal subspace
        redundant_subspace_sets = []
        # for irr in range(rel_features + 1, cst.IRRELEVANT_FEATURES + rel_features + 1, cst.SUBSPACE_SET_STEP):
        for irr in [i + rel_features + 1 for i in cst.IRRELEVANT_FEATURES_RANGE_LIST]:
            rss = [ideal_subspace + [rf for rf in range(rel_features, irr)]
                   for ideal_subspace in ideal_subspace_set]
            redundant_subspace_sets.append(rss)
        return redundant_subspace_sets
    if method is cst.Method.PREDEFINED_SUBSPACESETS:
        return compute_predefined_subspace_sets(rel_features, ideal_subspace_set)
    if method is cst.Method.PREDEFINED_SUBSPACESETS_SYNCHRONOUS_GREEDY:
        return compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subspace_set, True)
    if method is cst.Method.PREDEFINED_SUBSPACESETS_SYNCHRONOUS_OPTIMAL:
        return compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subspace_set, False)
    if method is cst.Method.PREDEFINED_SUBSPACESETS_NAIVE:
        return compute_predefined_subspace_sets_naive(rel_features)
    raise ValueError("the method has not been implemented yet! " + method)
def execute(param, loader=None):
    """Run one discretization experiment described by *param* (a RunParams).

    loader: optional object providing load_dataset(data_file, delim); when
    None the CSV is read directly with pandas.
    Returns a Result on success, or None when anything fails (the error is
    printed, not raised).
    """
    assert type(param) == RunParams
    base_dir = param.base_dir
    experiment_name = param.experiment_name
    method = param.method
    data_file = param.data_file
    delim = param.delim
    rows = param.rows
    columns = param.columns
    subspace_set = param.subspace_set
    distance_measure = param.distance_measure
    cor_measure = param.cor_measure
    trivial_bins_count = param.trivial_bins_count
    # todo should not change the constant! fix later
    # threshold = param.threshold
    # cst.ID_THRESHOLD_QUANTILE = threshold

    # reading data from the file with delimiter and NaN values as "?"
    data = None
    if loader == None:
        data = pd.read_csv(data_file, delimiter=delim, header=None, na_values='?')
    else:
        data = loader.load_dataset(data_file, delim)
    # drop a data point if it contains inconsistent data
    data = data.dropna(axis=0, how='any')
    if columns:
        # keep the first `columns` features plus the class-label column (last)
        cols = [i for i in range(columns)]
        cols.append(data.shape[1] - 1)
        data = data.loc[:, cols]
    if rows:
        data = data[:rows]
    # the last column holds the class labels
    class_labels = data.pop(data.shape[1] - 1)
    print('executing', experiment_name)
    # log_file = dir + "log.txt"
    try:
        relevant_features = util.parse_relevant_features(experiment_name)
        # with open(log_file, 'w') as log:
        if method is cst.Method.TRIVIAL:
            disc_intervals, disc_points, distances, init_bins_count, runtime = compute_trivial_discretization(data, trivial_bins_count)
            sm_runtime = 0
        elif method is cst.Method.PERFECT:
            # the problem name is the first four underscore-separated tokens
            disc_intervals, disc_points, runtime = compute_perfect_discretization(
                re.search('(.+?_.+?_.+?_.+?)_', experiment_name).group(1), data)
            sm_runtime = 0
            init_bins_count = 0
        else:
            # if subspace_set:
            #     write(log, 'subspace set:', str(subspace_set))
            disc_intervals, disc_points, distances, init_bins_count, runtime, sm_runtime = compute_IPD(data,
                                                                                                       method,
                                                                                                       cor_measure,
                                                                                                       distance_measure,
                                                                                                       subspace_set)
        # plot_distances(dir, distances, disc_intervals)

        # output file for classification measurements
        outdat, outarff = el.get_out_files(experiment_name, disc_intervals, disc_points, class_labels,
                                           relevant_features)
        # output file for compression measurements
        # slim_dat_content = dq.prepare_slim_dat(base_dir, experiment_name)
        cut = el.get_cuts(disc_intervals)
        cut_file_content = el.get_cut_file(disc_intervals)
        return Result(base_dir, experiment_name, outdat, outarff, cut, cut_file_content, runtime, sm_runtime,
                      init_bins_count, relevant_features)
        # return Result(base_dir, experiment_name, None, None, None, None, None, None, relevant_features)
    except:
        # NOTE(review): bare except catches everything (incl. SystemExit /
        # KeyboardInterrupt); the failure is printed and None is returned so a
        # batch run keeps going.
        print("Error in " + experiment_name + ":", sys.exc_info()[0], sys.exc_info()[1])
        traceback.print_exception(sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2],
                                  limit=2, file=sys.stdout)
        # print("deleting the directory of the failed experiment")
        # shutil.rmtree(dir)
        # raise
        return None
class Result:
    """Value object bundling every artifact produced by one experiment run."""

    def __init__(self, base_dir, experiment_name, outdat_file_content, outarff_file_content, cut, cut_file_content, runtime, sm_runtime,
                 initial_bin_count, rel_feature_count):
        self.base_dir = base_dir
        self.experiment_name = experiment_name
        self.outdat_file_content = outdat_file_content
        self.outarff_file_content = outarff_file_content
        self.cut = cut
        self.cut_file_content = cut_file_content
        self.runtime = runtime
        self.sm_runtime = sm_runtime
        self.initial_bin_count = initial_bin_count
        self.rel_feature_count = rel_feature_count
        # output directory of this experiment: "<base_dir><experiment_name>/"
        self.dir = base_dir + experiment_name + "/"

    def __repr__(self):
        return "Result(experiment_name=" + self.experiment_name + ")"
def append_to_quality_measure_files(result, loader):
    """Append one precision/recall CSV row per relevant feature of *result*.

    Row columns: "<experiment>-dim<i>", precision, recall, sm_runtime, runtime.
    Ideal cuts come from *loader* when given, otherwise from the cut files
    parsed by dqm.
    """
    assert type(result) is Result
    measure_file = result.base_dir + cst.PRECISION_RECALL_FILENAME
    ideal_cuts = loader.load_ideal_disc(result.experiment_name) if loader else dqm.parse_cuts(result.experiment_name)
    with open(measure_file, "a") as f:
        for i in range(result.rel_feature_count):
            f.write(",".join([result.experiment_name + "-dim" + str(i + 1),
                              str(dqm.disc_precision(ideal_cuts[i], result.cut[i])),
                              str(dqm.disc_recall(ideal_cuts[i], result.cut[i])),
                              str(result.sm_runtime),
                              str(result.runtime)]))
            f.write("\n")
    return
def append_to_compression_files(result):
    """Append the compression measurements for *result* to the shared CSV file."""
    measure_file = result.base_dir + cst.COMPRESSION_FILENAME
    with open(measure_file, "a") as f:
        f.write(",".join(dqm.run_compression1(result.experiment_name)))
        f.write("\n")
def store(result, loader=None):
    """Persist all artifacts of a finished experiment; no-op when result is falsy.

    Writes the run log, the .dat file for SLIM compression, the discretized
    data output, and the cut file, and appends to the shared quality and
    compression measurement files.
    """
    if not result:
        return
    assert type(result) is Result
    print('storing experiment', result.experiment_name)
    if not os.path.exists(result.dir):
        os.makedirs(result.dir)
    # run log: bin count and runtimes
    with open(result.dir + "log.txt", "w") as f:
        f.write("initial bins count: " + str(result.initial_bin_count) + "\n")
        f.write("runtime " + str(result.runtime) + " seconds\n")
        f.write("sm runtime " + str(result.sm_runtime) + " seconds\n")
    append_to_quality_measure_files(result, loader)
    # .dat output consumed by the SLIM compression step
    if not os.path.exists(cst.SLIM_DATA_DIR + result.experiment_name):
        os.makedirs(cst.SLIM_DATA_DIR + result.experiment_name)
    with open(cst.SLIM_DATA_DIR + result.experiment_name + "/" + result.experiment_name + ".dat", "w") as f:
        f.writelines(result.outdat_file_content)
    append_to_compression_files(result)
    with open(result.dir + cst.FILE_DATA_OUTPUT, "w") as f:
        f.writelines(result.outarff_file_content)
    with open(result.dir + cst.FILE_DATA_CUTS, "w") as f:
        f.writelines(result.cut_file_content)
class RunParams:
    """Bag of settings fully describing a single experiment run."""

    def __init__(self, base_dir, experiment_name, method, data_file, delim, columns, rows, distance_measure, threshold,
                 cor_measure, subspace_set, trivial_bins_count):
        self.base_dir = base_dir
        self.experiment_name = experiment_name
        self.method = method
        self.data_file = data_file
        self.delim = delim
        self.columns = columns
        self.rows = rows
        self.distance_measure = distance_measure
        self.threshold = threshold
        self.cor_measure = cor_measure
        self.subspace_set = subspace_set
        self.trivial_bins_count = trivial_bins_count

    def __repr__(self):
        return ("RunParams(experiment_name=" + self.experiment_name
                + ", subspace_set=" + str(self.subspace_set) + ")")
def prepare(base_dir, data_file, method, time_mark=False, delim=";", columns=None, rows=None,
            distance_measure=cst.DistanceMeasure.ID,
            cor_measure=cst.CorrelationMeasure.UDS,
            threshold=cst.ID_THRESHOLD_QUANTILE,
            trivial_bins_count=None):
    """Build RunParams for one data file and method.

    PREDEFINED_* methods produce one RunParams per computed subspace set; all
    other methods produce at most one. Experiments whose output directory
    already exists under cst.BASE are skipped.
    time_mark: when True, the output directory name is prefixed with a timestamp.
    Returns the (possibly empty) list of RunParams.
    """
    params = []
    # # defining prefix for the output files
    data_file_name = util.get_file_name(data_file)
    if method.name.startswith("PREDEFINED"):
        subspace_sets = compute_subspace_sets(data_file_name, method)
        if not subspace_sets:
            return params
    else:
        subspace_sets = None
    base_dir = cst.BASE + base_dir + "/"
    # full, trivial, SM methods
    if not method.name.startswith("PREDEFINED"):
        # experiment name encodes dataset, column/row limits, method and options
        experiment_name = data_file_name.replace(".csv", "") + ("_" + str(columns) + "c" if columns else "") + (
            "_" + str(rows) + "r" if rows else "") + "_" \
                          + method.name.replace("_", "") \
                          + ("_" + cor_measure.name if method.name.startswith("SM") else "") \
                          + ("_" + str(trivial_bins_count) if method is cst.Method.TRIVIAL and trivial_bins_count else "")
        timed_name = (util.now() if time_mark else "") + ("_" if time_mark else "") + experiment_name
        if not os.path.exists(base_dir + timed_name):
            params.append(
                RunParams(base_dir, timed_name, method, data_file, delim, columns, rows, distance_measure, threshold,
                          cor_measure, None, trivial_bins_count))
            print("prepared parameters for", experiment_name)
        else:
            print("experiment", experiment_name, "has already been processed")
    # predefined subspace sets
    else:
        assert subspace_sets is not None
        counter = 1
        for subspace_set in subspace_sets:
            # suffix _s<N> / _i<N> numbers the subspace set variant
            experiment_name = data_file_name.replace(".csv", "") + ("_" + str(columns) + "c" if columns else "") + (
                "_" + str(rows) + "r" if rows else "") + "_" \
                              + method.name.replace("_", "") \
                              + ("_s" + str(counter) if method.name.startswith("PREDEFINED_SUBSPACESETS") else "") \
                              + ("_i" + str(counter)
                                 if method is cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT else "")
            counter += 1
            timed_name = (util.now() + "_" if time_mark else "") + experiment_name
            if os.path.exists(base_dir + timed_name):
                print("experiment", experiment_name, "has already been processed")
                continue
            params.append(
                RunParams(base_dir, timed_name, method, data_file, delim, columns, rows, distance_measure, threshold,
                          cor_measure, subspace_set, None))
            print("prepared parameters for", experiment_name)
    return params
def collect_experiment_params(base_dir):
    """Collect RunParams for every dataset via util.collect_params.

    The nested callback is invoked per dataset and prepares parameters for
    every method of interest, grouped into priority "fronts".
    """
    def collect(name, it, rf, i, type, c):
        # per-dataset callback; only `name` is used here, the remaining
        # arguments are dictated by util.collect_params' callback signature
        params = []
        file_path = cst.DATA_DIR + name + ".csv"
        # additional type of trivial discretization
        print("preparing", name, cst.Method.TRIVIAL, cst.TRIVIAL_BINS_COUNT)
        params.extend(prepare(base_dir, file_path, cst.Method.TRIVIAL, trivial_bins_count=cst.TRIVIAL_BINS_COUNT))
        # first front of important methods
        for method in [
            cst.Method.PREDEFINED_SUBSPACESETS_SYNCHRONOUS_GREEDY,
            cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT,
            cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET
        ]:
            print("preparing", name, method)
            params.extend(prepare(base_dir, file_path, method))
        # second front of less important methods
        for method in [
            cst.Method.PREDEFINED_SUBSPACESETS_SYNCHRONOUS_OPTIMAL,
            cst.Method.FULL
        ]:
            print("preparing", name, method)
            params.extend(prepare(base_dir, file_path, method))
        # third front of less important methods
        for method in [
            cst.Method.PREDEFINED_SUBSPACESETS,
            cst.Method.TRIVIAL,
        ]:
            print("preparing", name, method)
            params.extend(prepare(base_dir, file_path, method))
        return params

    return util.collect_params(collect)
if __name__ == "__main__":
    # Entry point: only parameter collection for the test log directory is
    # currently active; the command-line interface below is commented out.
    # print(compute_predefined_subspace_sets_naive(5))
    # exit(1)
    # cubes_03_10_c
    # print(compute_predefined_subspace_sets(3, [[0,1,2]]))
    # exit(1)
    params = collect_experiment_params("logs_test")
    # print(params)
    # print(compute_subspace_sets("cubes_10_03_i.csv", cst.Method.PREDEFINED_SUBSPACESETS))
    # exit(1)
# if len(sys.argv) == 1: | |
# # print( | |
# # 'Usage: main.py ' | |
# # '-b=<logs base dir> ' | |
# # '-f=<data_file> ' | |
# # '-d=<delimiter> ' | |
# # '-c=<number of columns> ' | |
# # '-m=<[original|greedy_topk|trivial|...]> ' | |
# # '-cor=<[uds]> ' | |
# # '-dist=<[id, cjs]> ' | |
# # '-t=<threshold float> ' | |
# # '-s[=<subspace>] ' | |
# # '-r=<number of rows> ') | |
# # command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID' | |
# # print('Running default: ', command) | |
# # command_list = command.split(' ') | |
# raise ValueError("no arguments passed!") | |
# else: | |
# command_list = sys.argv[1:] | |
# | |
# file_arg = list(filter(lambda x: x.startswith("-f="), command_list)) | |
# if not file_arg: | |
# raise ValueError('No data file provided!') | |
# base_dir_arg = list(filter(lambda x: x.startswith("-b="), command_list)) | |
# if not base_dir_arg: | |
# raise ValueError('No logs base dir provided!') | |
# time_mark = len(list(filter(lambda x: x.startswith("-time"), command_list))) != 0 | |
# delim_arg = list(filter(lambda x: x.startswith("-d="), command_list)) | |
# columns_arg = list(filter(lambda x: x.startswith("-c="), command_list)) | |
# rows_arg = list(filter(lambda x: x.startswith("-r="), command_list)) | |
# method_arg = list(filter(lambda x: x.startswith("-m="), command_list)) | |
# corr_measure_arg = list(filter(lambda x: x.startswith("-cor="), command_list)) | |
# distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list)) | |
# threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list)) | |
# trivial_bins_count_arg = list(filter(lambda x: x.startswith("-tb="), command_list)) | |
# | |
# data_file = file_arg[0].replace('-f=', '') | |
# base_dir = base_dir_arg[0].replace('-b=', '') | |
# | |
# if delim_arg: | |
# delimiter = delim_arg[0].replace('-d=', '') | |
# else: | |
# print('using default delimiter ;') | |
# delimiter = ';' | |
# columns = int(columns_arg[0].replace('-c=', '')) if columns_arg else None | |
# rows = int(rows_arg[0].replace('-r=', '')) if rows_arg else None | |
# if method_arg: | |
# method = cst.Method[method_arg[0].replace('-m=', '').upper()] | |
# else: | |
# print('using default method PREDEFINED_OPTIMAL_SUBSPACESET') | |
# method = cst.Method.PREDEFINED_OPTIMAL_SUBSPACESET | |
# | |
# cor_measure = cst.CorrelationMeasure[corr_measure_arg[0].replace('-cor=', '').upper()] if corr_measure_arg \ | |
# else None | |
# if method.name.startswith("SM") and cor_measure is None: | |
# raise ValueError('A correlation measure should be given!') | |
# | |
# if distance_measure_arg: | |
# distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()] | |
# print('using distance measure ' + distance_measure.name) | |
# else: | |
# distance_measure = cst.DistanceMeasure.ID | |
# print('using default distance measure ID') | |
# if threshold_arg: | |
# threshold = float(threshold_arg[0].replace('-t=', '')) | |
# | |
# print('using ID_THRESHOLD_QUANTILE = ', str(threshold)) | |
# else: | |
# threshold = cst.ID_THRESHOLD_QUANTILE | |
# print('using default ID_THRESHOLD_QUANTILE = ', str(threshold)) | |
# trivial_bins_count = None | |
# if trivial_bins_count_arg: | |
# trivial_bins_count = float(trivial_bins_count_arg[0].replace('-tb=', '')) | |
# | |
# print('trivial bins count = ', str(trivial_bins_count)) | |
# else: | |
# print('trivial bins count is default (will be computed automatically)') | |
# | |
# params = prepare(base_dir, data_file, method, time_mark, delimiter, columns, rows, distance_measure, cor_measure, | |
# threshold, trivial_bins_count) | |
# print(params) | |
# for p in params: | |
# result = execute(p) | |
# store(result) |