# ipd_extended/main.py
import math
import sys
import time

# todo fix for server push
import matplotlib

matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import re

import constants as cst
import objects as o
from methods import Method
import interaction_distance as id
import footprint as f
import subspace_mining as sm
import util
from correlation_measures.binning import Binning
import experiments_logging as el
import merging as m
import cjs
import discretization_quality_measure as dqm
import traceback
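
# ------------------------------------------------------------------------
# Entry point of the extended IPD pipeline: each relevant dimension of a
# dataset is discretized by merging equal-frequency micro bins into macro
# bins, guided by a distance measure (ID, CJS or footprint-based FID) that
# is computed over a mined or predefined subspace of interacting dimensions.
# ("IPD" presumably stands for Interaction-Preserving Discretization; the
# repository name ipd_extended suggests this code extends that method.)
# ------------------------------------------------------------------------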
# ------------------------------------------------------
def find_disc_macro_id(disc_macro_intervals, point):
    # accepts either a list of [left, right] intervals or a
    # {bin_id: [left, right]} dict (as produced by read_discretization)
    items = disc_macro_intervals.items() if isinstance(disc_macro_intervals, dict) \
        else enumerate(disc_macro_intervals)
    for macro_id, macro in items:
        if macro[0] <= point <= macro[1]:
            return macro_id
    raise ValueError("Micro bin is not covered by any of the macro intervals!", disc_macro_intervals, point)


def write(log, *args):
    if log is not None:
        log.append(' '.join(str(a) for a in args) + '\n')

def plot_distances(dir, distances, disc_intervals, distance_measure=None):
    # `distance_measure` was previously read from an undefined global;
    # it is now an explicit (optional) parameter
    distances = distances[:8]
    dim_count = len(distances)
    height = int(math.sqrt(dim_count))
    width = int(math.ceil(dim_count / height))
    fig, axes = plt.subplots(nrows=height, ncols=width, squeeze=False)
    for curr, dist in enumerate(distances):
        ax1 = axes[int(curr / width), int(curr % width)]
        # ax1.set_ylim([0, 0.1])
        # ax1.hist(distances, bins=100, color='c')
        ax1.plot(dist[0], dist[1])
        if distance_measure is not None and distance_measure is not cst.DistanceMeasure.FID:
            ID_threshold = id.compute_ID_threshold(dist[0], dist[1])
            ax1.axhline(ID_threshold, color='b', linestyle='dashed', linewidth=1)
        curr_macro_intervals = disc_intervals[curr]
        for macro_id in range(1, len(curr_macro_intervals)):
            ax1.axvline(curr_macro_intervals[macro_id][0], color='r')
        ax1.set_title('dimension ' + str(curr))
    plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25, wspace=0.35)
    if not os.path.exists(dir):
        os.makedirs(dir)
    plt.savefig(dir + 'distances.png', format='png')

def compute_distances(bin_map, curr, data, dim_maxes,
                      cor_measure, method, distance_measure, dist_attr,
                      curr_points,
                      k,
                      delta=cst.HETEROGENEOUS_THRESHOLD,
                      beam_width=cst.BEAM_WIDTH,
                      subspace_map=None,
                      log=None):
    curr_subspace = None
    sm_runtime = 0
    if method.name.startswith("SM"):
        dists = None
        t = time.time()
        if method == Method.SM_GREEDY_TOPK:
            curr_subspace, dists = sm.greedy_topk(bin_map, curr, data, curr_points, dim_maxes, k, cor_measure)
        elif method == Method.SM_HET_GREEDY_TOPK:
            curr_subspace, dists = sm.het_greedy_topk(bin_map, curr, data, curr_points, dim_maxes, k, delta, cor_measure)
        elif method == Method.SM_BEST_FIRST:
            curr_subspace, dists = sm.best_first(bin_map, curr, data, curr_points, dim_maxes, k, cor_measure)
        elif method == Method.SM_BEAM_SEARCH:
            curr_subspace, dists = sm.beam_search(bin_map, curr, data, curr_points, dim_maxes, k, beam_width, cor_measure)
        elif method == Method.SM_HET_BEAM_SEARCH:
            curr_subspace, dists = sm.het_beam_search(bin_map, curr, data, curr_points, dim_maxes, k, delta, beam_width, cor_measure)
        else:
            raise ValueError("Unsupported subspace mining method!", method)
        sm_runtime = time.time() - t
        write(log, method.name, 'for dim', curr, ":", curr_subspace)
        # write(log, 'subspace mining runtime:', sm_runtime, 'seconds')
        if dists is not None and cor_measure.name == distance_measure.name:
            # the miner already computed the distances with the same measure;
            # reuse them (distance runtime equals the mining runtime)
            return dists, sm_runtime, sm_runtime, curr_subspace
    elif method.name.startswith("PREDEFINED"):
        curr_subspace = subspace_map[curr] if subspace_map and curr in subspace_map else set()
    if curr_subspace is not None:
        curr_subspace = set(curr_subspace).union({curr})
        data = data.copy().loc[:, curr_subspace]
        dim_maxes = dim_maxes[curr_subspace]
    t = time.time()
    if distance_measure == cst.DistanceMeasure.ID:
        distances = id.compute_IDs(bin_map, curr, data, dim_maxes)
    elif distance_measure == cst.DistanceMeasure.CJS:
        distances = cjs.compute_CJSs(bin_map, curr, data, dim_maxes)
    elif distance_measure == cst.DistanceMeasure.FID:
        distances = f.compute_ID_footprints(bin_map, curr, data, dim_maxes, dist_attr, curr_points)
    else:
        raise ValueError("Distance measure is not supported!", distance_measure)
    dist_runtime = time.time() - t
    return distances, dist_runtime, sm_runtime, curr_subspace
    # return fr.compute_fractalIDs1(bin_map, data, dim_maxes), sm_runtime
    # return (id.compute_IDs1(bin_map, data, dim_maxes, curr_points) if distance_measure == cst.DistanceMeasure.ID
    #         else cjs.compute_CJSs1(bin_map, data, dim_maxes)), sm_runtime

# def log_distances(data, method=Method.PREDEFINED_OPTIMAL_SUBSPACESET, cor_measure=None,
#                   distance_measure=cst.DistanceMeasure.ID,
#                   subspace_set=None,
#                   log=None):
#     start = time.time()
#     dim_count = data.shape[1]
#
#     # number of initial dist_bins
#     # todo old remove later
#     # init_bins_count = 20  # ceil in original ipd...
#     init_bins_count = int(math.ceil(math.sqrt(data.shape[0])))  # ceil in original ipd...
#     write(log, 'row count:', data.shape[0])
#     # write(log, 'init_bins_count:', init_bins_count)
#     write(log, 'ID_THRESHOLD_QUANTILE:', cst.ID_THRESHOLD_QUANTILE)
#
#     # normalization step todo old (optional)
#     # todo old by default the normalization is optional as it does not influence the results
#
#     # norm_data = data.apply(lambda x: 2 * cst.NORMALIZATION_RADIUS * (x - x.min()) / (
#     #     x.max() - x.min()) - cst.NORMALIZATION_RADIUS if x.max() != x.min() else pd.Series(-np.ones(x.shape)))
#
#     norm_data = data
#
#     # dimension maximums
#     dim_maxes = norm_data.max(0)
#     disc_macro_intervals = []
#     disc_points = []
#
#     subspace_map = get_map_from_subspace_set(subspace_set) if subspace_set else None
#     distancez = []
#     # iterate over all the dimensions
#     full_sm_runtime = 0
#     for curr in range(dim_count):
#         binning = Binning(norm_data, curr, init_bins_count)
#         bin_map = binning.equal_frequency_binning_by_rank()
#         dist_bins = bin_map.cat.categories
#
#         # -----------------------------INTERACTION DISTANCES----------------------------------
#
#         curr_points = [data.loc[binning.rank_data[binning.rank_data[curr] == math.floor(
#             float(re.search(r', (-*\d+\.*\d*e*[+-]*\d*)', dist_bins[i]).group(1)))].index.tolist()[0], curr] for i in
#             range(len(dist_bins) - 1)]
#         distances, sm_runtime = compute_distances(bin_map, curr, norm_data, dim_maxes, cor_measure, method,
#                                                   distance_measure, curr_points, subspace_map=subspace_map, log=log)

def compute_IPD(data, relevant_features, method, distance_measure, dist_attr,
                cor_measure, subspace_set, sm_k,
                log):
    # dim_count = data.shape[1]
    # compute only for the relevant features (only these dims matter for the quality measurements)
    dim_count = relevant_features
    # number of initial dist_bins
    # todo old remove later
    # init_bins_count = 20  # ceil in original ipd...
    init_bins_count = int(math.ceil(math.sqrt(data.shape[0])))  # ceil in original ipd...
    # write(log, 'row count:', data.shape[0])
    # write(log, 'init_bins_count:', init_bins_count)
    # write(log, 'ID_THRESHOLD_QUANTILE:', cst.ID_THRESHOLD_QUANTILE)

    # normalization step todo old (optional)
    # todo old by default the normalization is optional as it does not influence the results
    norm_data = data.apply(lambda x: 2 * cst.NORMALIZATION_RADIUS * (x - x.min()) / (
        x.max() - x.min()) - cst.NORMALIZATION_RADIUS if x.max() != x.min() else pd.Series(-np.ones(x.shape)))
    # norm_data = data

    # dimension maximums
    dim_maxes = norm_data.max(0)
    disc_macro_intervals = []
    disc_points = []
    predefined_subspace_map = get_map_from_subspace_set(subspace_set) if subspace_set else None
    subspace_map = dict()
    distancez = []
    # iterate over all the relevant dimensions
    sm_runtimes = []
    dist_runtimes = []
    disc_runtimes = []
    for curr in range(dim_count):
        binning = Binning(norm_data, curr, init_bins_count)
        bin_map = binning.equal_frequency_binning_by_rank()
        dist_bins = bin_map.cat.categories

        # -----------------------------INTERACTION DISTANCES----------------------------------
        # raw strings below avoid invalid-escape warnings in the regexes
        curr_points = [norm_data.loc[binning.rank_data[binning.rank_data[curr] == math.floor(
            float(re.search(r', (-*\d+\.*\d*e*[+-]*\d*)', dist_bins[i]).group(1)))].index.tolist()[0], curr]
            for i in range(len(dist_bins))]
        distances, dist_runtime, sm_runtime, subspace = compute_distances(bin_map, curr, norm_data, dim_maxes,
                                                                          cor_measure, method, distance_measure,
                                                                          dist_attr, curr_points, sm_k,
                                                                          subspace_map=predefined_subspace_map,
                                                                          log=log)
        # `subspace` is None for methods that use the full space
        subspace_map[curr] = (subspace - {curr}) if subspace is not None else set()
        # todo python361
        # distancez.append([[data.loc[rank_data[rank_data[curr] == math.floor(dist_bins[i].right)].index.tolist()[0], curr]
        #                    for i in range(len(distances))], distances])
        # todo python342
        distancez.append([curr_points, distances[1] if distance_measure is cst.DistanceMeasure.FID else distances])
        dist_runtimes.append(dist_runtime)
        sm_runtimes.append(sm_runtime)
        # todo ext compute sliding average and count ID peaks above the avg (in a sliding window)
        # ID_peaks = id.compute_sliding_count(distances, ID_threshold)
        # pd.DataFrame(distances).to_csv(prefix + "_IDs_" + str(curr) + ".csv")

        # -----------------------------OPTIMAL MERGE STRATEGY----------------------------------
        # table of costs: the value in the i-th row and j-th column is the cost of
        # merging (i+1) micro bins into (j+1) macro bins
        t = time.time()
        if distance_measure is cst.DistanceMeasure.FID:
            F, discretizations = m.dynamic_merging_footprints(distances[0], init_bins_count, distances[2])
        else:
            ID_threshold = id.compute_ID_threshold(distances, dist_attr)
            F, discretizations = m.dynamic_merging(ID_threshold, distances, init_bins_count)
        disc_runtimes.append(time.time() - t)
        # pd.DataFrame(F).to_csv(prefix + "_F_" + str(curr) + ".csv")
        # pd.DataFrame([[[b[-1] for b in k[:-1]] for k in c] for c in discretizations]).to_csv(prefix + "_bp_" + str(curr) + ".csv")

        min_id = np.argmin(F[-1])
        discretization = discretizations[-1][min_id]
        write(log, 'dim', curr, '# bins:', len(discretization))
        (curr_macro_intervals, curr_macro_points) = get_discretized_points(curr, data, discretization,
                                                                           dist_bins, binning.rank_data)
        # todo uncomment if a detailed log is necessary
        # write(log, '-------------------------')
        # write(log, 'dimension:', curr)
        # write(log, 'ID_threshold:', ID_threshold)
        # write(log, 'cost:', F[-1, min_id])
        # write(log, 'number of macrobins:', len(curr_macro_intervals))
        #
        # # write(log, 'distances', distances)
        # write(log, '\ndistances between the macrobins:')
        # for macro_id, macro_bin in enumerate(discretization[:-1]):
        #     write(log, "{0:.2f}".format(curr_macro_intervals[macro_id][1]) + " -", distances[macro_bin[-1]], '[q=' +
        #           str((sorted(distances).index(distances[macro_bin[-1]]) + 1) / len(distances)) + ']')
        #
        # write(log, '\nnumber of points per macrobin:')
        # for macro_id in curr_macro_intervals:
        #     write(log, "[" + "{0:.2f}".format(curr_macro_intervals[macro_id][0]) + ",",
        #           "{0:.2f}".format(curr_macro_intervals[macro_id][1]) + "]",
        #           sum([1 for p in curr_macro_points if p == macro_id]))
        # write(log, '\n')
        disc_macro_intervals.append(curr_macro_intervals)
        disc_points.append(curr_macro_points)
    return disc_macro_intervals, disc_points, distancez, init_bins_count, dist_runtimes, sm_runtimes, disc_runtimes, subspace_map

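# compute_IPD returns, in order (per relevant dimension where applicable):
#   disc_macro_intervals - [left, right] bounds of each macro bin
#   disc_points          - macro-bin id of every data point
#   distancez            - (x, distance) curves, e.g. for plot_distances
#   init_bins_count      - number of initial equal-frequency micro bins
#   dist_runtimes, sm_runtimes, disc_runtimes - per-dimension runtimes of the
#       distance, subspace-mining and dynamic-merging stages
#   subspace_map         - dimension -> set of partner dimensions used for it
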
def compute_trivial_discretization(data, relevant_features, trivial_bins_count=None, log=None):
    dim_count = relevant_features
    bins_count = int(math.ceil(math.sqrt(data.shape[0]))) if not trivial_bins_count else trivial_bins_count
    write(log, 'row count:', data.shape[0])
    write(log, 'bins_count:', bins_count)
    norm_data = data
    disc_macro_intervals = []
    disc_points = []
    distancez = []
    runtimes = []
    # iterate over all the dimensions
    for curr in range(dim_count):
        start = time.time()
        binning = Binning(norm_data, curr, bins_count)
        bin_map = binning.equal_frequency_binning_by_rank()
        dist_bins = bin_map.cat.categories
        discretization = [[i] for i in range(bins_count)]
        (curr_macro_intervals, curr_macro_points) = get_discretized_points(curr, data, discretization,
                                                                           dist_bins, binning.rank_data)
        disc_macro_intervals.append(curr_macro_intervals)
        disc_points.append(curr_macro_points)
        runtimes.append(time.time() - start)
    return disc_macro_intervals, disc_points, distancez, bins_count, runtimes

def read_discretization(disc_file):
    # `bin` and `next` renamed to avoid shadowing the builtins
    disc = []
    d = {}
    last = -2
    bin_id = 0
    with open(disc_file, "r") as f:
        for line in f:
            if line.startswith("---"):
                disc.append(d)
                continue
            if line.startswith("dimension"):
                last = -2
                bin_id = 0
                d = {}
                continue
            if len(line.strip()) == 0:
                continue
            right = float(line.strip())
            d[bin_id] = [last, right]
            bin_id += 1
            last = right
    return disc

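# Cut-file format expected above (inferred from the parser): for each
# dimension, a header line "dimension <i>", then one right-edge value per
# macro bin, terminated by a line of dashes, e.g.:
#   dimension 0
#   -0.5
#   0.5
#   2.0
#   -------------------------------------
# The left edge of the first bin is the sentinel -2, so this example yields
# {0: [-2, -0.5], 1: [-0.5, 0.5], 2: [0.5, 2.0]}.
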
def compute_perfect_discretization(problem, data, log=None):
    start = time.time()
    disc_points = []
    all_intervals = read_discretization('ideal_disc/cut_' + problem + ".txt")
    # iterate over all the dimensions
    for curr in range(len(all_intervals)):
        intervals = all_intervals[curr]
        macro_points = []
        for point in data.iterrows():
            macro_points.append(find_disc_macro_id(intervals, point[1][curr]))
        disc_points.append(macro_points)
    end = time.time()
    write(log, end - start, 'seconds')
    return all_intervals, disc_points, end - start

def get_discretized_points(curr, data, discretization, dist_bins, rank_data):
    disc_macro_intervals = []
    for i, macro_bin in enumerate(discretization):
        macro_interval = []
        for micro_bin_id in macro_bin:
            # todo python361
            # right = \
            #     data.loc[rank_data[rank_data[curr] == math.floor(dist_bins[micro_bin_id].right)][curr].index[0]][curr]
            # if not len(macro_interval):
            #     macro_interval.append(
            #         data.loc[rank_data[rank_data[curr] == math.ceil(dist_bins[micro_bin_id].left)][curr].index[0]][curr])
            #     macro_interval.append(right)
            # todo python342 (categories are strings like "(left, right]"; parse the
            # edges with raw-string regexes to avoid invalid-escape warnings)
            right_rank = math.floor(float(re.search(r', (-*\d+\.*\d*e*[+-]*\d*)', dist_bins[micro_bin_id]).group(1)))
            right = data.loc[rank_data[rank_data[curr] == right_rank][curr].index[0]][curr]
            if not len(macro_interval):
                left_rank = math.ceil(float(re.search(r'(-*\d+\.*\d*e*[+-]*\d*),', dist_bins[micro_bin_id]).group(1)))
                macro_interval.append(data.loc[rank_data[rank_data[curr] == left_rank][curr].index[0]][curr])
                macro_interval.append(right)
            else:
                macro_interval[1] = right
        disc_macro_intervals.append(macro_interval)
    macro_points = []
    for point in data.iterrows():
        macro_points.append(find_disc_macro_id(disc_macro_intervals, point[1][curr]))
    return disc_macro_intervals, macro_points

def _compute_subspaces(dims, sets):
    count = len(dims)
    if count > 0:
        _compute_subspaces(dims[1:], sets)
    elif count == 0:
        sets.append([])
        return
    new_set = []
    for s in sets:
        sp = list(s)
        sp.append(dims[0])
        new_set.append(sp)
    sets.extend(new_set)

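# _compute_subspaces enumerates the power set of `dims` into `sets`, e.g.:
#   sets = []
#   _compute_subspaces([0, 1, 2], sets)
#   # sets == [[], [2], [1], [1, 2], [0], [0, 2], [0, 1], [0, 1, 2]]
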
def get_map_from_subspace_set(subspace_set):
    # local variable renamed from `map` to avoid shadowing the builtin
    subspace_map = dict()
    for subspace in subspace_set:
        for dim in subspace:
            if dim not in subspace_map:
                subspace_map[dim] = set()
            subspace_map[dim] = subspace_map[dim].union(set(subspace) - {dim})
    return subspace_map

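# get_map_from_subspace_set maps every dimension to the union of its
# partners over all subspaces, e.g.:
#   get_map_from_subspace_set([[0, 1], [1, 2]])
#   # -> {0: {1}, 1: {0, 2}, 2: {1}}
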
def execute(param, loader=None):
    assert type(param) == o.RunParams
    base_dir = param.base_dir
    experiment_name = param.experiment_name
    method = param.method
    data_file = param.data_file
    delim = param.delim
    subspace_set = param.subspace_set
    distance_measure = param.distance_measure
    cor_measure = param.cor_measure
    trivial_bins_count = param.trivial_bins_count
    sm_k = param.sm_k
    dist_attr = param.dist_attr

    # read the data file with the given delimiter, treating "?" as NaN
    if loader is None:
        data = pd.read_csv(data_file, delimiter=delim, header=None, na_values='?')
    else:
        data = loader.load_dataset(data_file, delim)
    # drop a data point if it contains inconsistent data
    data = data.dropna(axis=0, how='any')
    class_labels = data.pop(data.shape[1] - 1)
    print('executing', experiment_name)
    # log_file = dir + "log.txt"
    log = []
    try:
        write(log, "experiment_name", experiment_name)
        write(log, "base_dir", base_dir)
        write(log, "data_file", data_file)
        write(log, "delim", delim)
        write(log, "method", method)
        write(log, "distance_measure", distance_measure)
        write(log, "dist_attr", dist_attr)
        write(log, "subspace_set", subspace_set)
        write(log, "cor_measure", cor_measure)
        write(log, "sm_k", sm_k)
        write(log, "trivial_bins_count", trivial_bins_count)
        relevant_features = util.parse_relevant_features(experiment_name)
        if relevant_features is None:
            write(log, "could not parse relevant features from", experiment_name)
            relevant_features = data.shape[1]
        write(log, "relevant features", relevant_features)
        # with open(log_file, 'w') as log:
        if method is Method.TRIVIAL:
            disc_intervals, disc_points, distances, init_bins_count, runtimes = compute_trivial_discretization(
                data, relevant_features, trivial_bins_count)
            disc_runtimes = runtimes
            sm_runtimes = [0] * relevant_features
            dist_runtimes = [0] * relevant_features
            subspace_map = None
        # elif method is Method.PERFECT:
        #     raise ValueError("not implemented yet!")
        #     search = re.search('(.+?_.+?_.+?_.+?)_', experiment_name)
        #     if not search:
        #         raise ValueError('no perfect discretization for ' + experiment_name + ' is stored')
        #     disc_intervals, disc_points, runtime = compute_perfect_discretization(
        #         search.group(1), data)
        #     sm_runtime = 0
        #     init_bins_count = 0
        else:
            # if subspace_set:
            #     write(log, 'subspace set:', str(subspace_set))
            (disc_intervals, disc_points, distances, init_bins_count, dist_runtimes, sm_runtimes, disc_runtimes,
             subspace_map) = compute_IPD(data, relevant_features, method, distance_measure, dist_attr,
                                         cor_measure, subspace_set, sm_k, log)
        # todo when only distances info is necessary
        # log_distances(data,
        #               method,
        #               cor_measure,
        #               distance_measure,
        #               subspace_set,
        #               log)
        # for l in log:
        #     print(l)
        # plot_distances(base_dir + experiment_name + "/", distances, disc_intervals)

        # output file for classification measurements
        outdat, outarff = el.get_out_files(experiment_name, disc_intervals, disc_points, class_labels,
                                           relevant_features)
        cut = el.get_cuts(disc_intervals)
        cut_file_content = el.get_cut_file(disc_intervals)
        return Result(base_dir, experiment_name, outdat, outarff, cut, cut_file_content, dist_runtimes, sm_runtimes,
                      disc_runtimes, init_bins_count, relevant_features, subspace_map, log)
        # return Result(base_dir, experiment_name, None, None, None, None, None, None, relevant_features)
    except Exception:
        print("Error in " + experiment_name + ":", sys.exc_info()[0], sys.exc_info()[1])
        traceback.print_exception(sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2], file=sys.stdout)
        print('---LOG---')
        for l in log:
            print(l.strip())
        # print("deleting the directory of the failed experiment")
        # shutil.rmtree(dir)
        # raise
        return None

class Result:
    def __init__(self, base_dir, experiment_name, outdat_file_content, outarff_file_content, cut, cut_file_content,
                 dist_runtimes, sm_runtimes, disc_runtimes,
                 initial_bin_count, rel_feature_count, subspace_map, log):
        self.disc_runtimes = disc_runtimes
        self.sm_runtimes = sm_runtimes
        self.dist_runtimes = dist_runtimes
        self.subspace_map = subspace_map
        self.log = log
        self.cut_file_content = cut_file_content
        self.outarff_file_content = outarff_file_content
        self.rel_feature_count = rel_feature_count
        self.initial_bin_count = initial_bin_count
        self.base_dir = base_dir
        self.experiment_name = experiment_name
        self.cut = cut
        self.outdat_file_content = outdat_file_content
        self.dir = base_dir + experiment_name + "/"

    def __repr__(self):
        return "Result(experiment_name=" + self.experiment_name + ")"

def append_to_quality_measure_file(result, loader):
    assert type(result) is Result
    measure_file = result.base_dir + cst.PRECISION_RECALL_FILENAME
    ideal_cuts = loader.load_ideal_disc(result.experiment_name) if loader else dqm.parse_ideal_cuts(result.experiment_name)
    with open(measure_file, "a") as f:
        for i in range(result.rel_feature_count):
            f.write(";".join([result.experiment_name + "-dim" + str(i),
                              str(dqm.disc_precision(ideal_cuts[i], result.cut[i])) if ideal_cuts else "",
                              str(dqm.disc_recall(ideal_cuts[i], result.cut[i])) if ideal_cuts else "",
                              str(result.sm_runtimes[i]),
                              str(result.disc_runtimes[i]),
                              str(result.dist_runtimes[i]),
                              str(len(ideal_cuts[i])) if ideal_cuts else "",
                              str(len(result.cut[i])),
                              str(result.subspace_map[i] if result.subspace_map is not None
                                  and len(result.subspace_map[i]) > 0 else "{}")]))
            f.write("\n")

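# Row layout written above (semicolon-separated, one row per relevant dim):
#   <experiment>-dim<i>; precision; recall; sm_runtime; disc_runtime;
#   dist_runtime; #ideal_cuts; #cuts; subspace
# precision, recall and #ideal_cuts stay empty when no ideal discretization
# is available for the experiment.
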
def append_to_compression_files(result):
    measure_file = result.base_dir + cst.COMPRESSION_FILENAME
    with open(measure_file, "a") as f:
        f.write(",".join(dqm.run_compression1(result.experiment_name)))
        f.write("\n")

def store(result, loader=None):
    if result is None:
        return
    assert type(result) is Result
    print('storing experiment', result.experiment_name)
    if not os.path.exists(result.dir):
        os.makedirs(result.dir)
    with open(result.dir + "log.txt", "w") as f:
        f.write("initial bins count: " + str(result.initial_bin_count) + "\n")
        f.write("total distance runtime " + str(sum(result.dist_runtimes)) + " seconds\n")
        f.write("total sm runtime " + str(sum(result.sm_runtimes)) + " seconds\n")
        f.write("total dynamic merging runtime " + str(sum(result.disc_runtimes)) + " seconds\n")
        f.write("\n--LOG--\n")
        f.writelines(result.log)
    append_to_quality_measure_file(result, loader)
    if not os.path.exists(cst.SLIM_DATA_DIR + result.experiment_name):
        os.makedirs(cst.SLIM_DATA_DIR + result.experiment_name)
    with open(cst.SLIM_DATA_DIR + result.experiment_name + "/" + result.experiment_name + ".dat", "w") as f:
        f.writelines(result.outdat_file_content)
    # if 'xor' not in result.experiment_name:
    #     append_to_compression_files(result)
    with open(result.dir + cst.FILE_DATA_OUTPUT, "w") as f:
        f.writelines(result.outarff_file_content)
    with open(result.dir + cst.FILE_DATA_CUTS, "w") as f:
        f.writelines(result.cut_file_content)

def compute_distance_attrs(data_file_name, distance_measure, threshold, footprints_number):
    if distance_measure is not cst.DistanceMeasure.FID:
        return [threshold]
    if footprints_number is not None:
        return [footprints_number]
    # if "XOR" in data_file_name:
    return list(cst.FOOTPRINTS_NUMBER_RANGE_LIST)
    # else:
    #     cubes_number = util.parse_cubes_number(data_file_name)
    #     return [int(cubes_number / 2), cubes_number, cubes_number * 2]

def construct_distance_name(distance_measure, distance_attr, method):
    if method is Method.TRIVIAL:
        return ""
    return distance_measure.name + "_" + (("fn" + str(distance_attr)) if distance_measure is cst.DistanceMeasure.FID
                                          else ("t" + str(distance_attr).replace(".", "")))

def prepare(base_dir, data_file, method, delim=";",
            distance_measure=cst.DistanceMeasure.ID,
            cor_measure=cst.CorrelationMeasure.ID,
            threshold=cst.ID_THRESHOLD_QUANTILE,
            footprints_number=None,
            experiment_names=None):
    params = []
    # define the prefix for the output files
    data_file_name = util.get_file_name(data_file)
    data_name = data_file_name.replace(".csv", "")
    method_attrs = method.compute_attrs(data_file_name)
    distance_attrs = compute_distance_attrs(data_file_name, distance_measure, threshold, footprints_number)
    base_dir = cst.BASE + base_dir + "/"
    for method_attr in method_attrs:
        method_name = method.construct_method_name(cor_measure, method_attr)
        for distance_attr in distance_attrs:
            distance_name = construct_distance_name(distance_measure, distance_attr, method)
            experiment_name = data_name + "__" + distance_name + "__" + method_name
            if os.path.exists(base_dir + experiment_name) or (
                    experiment_names is not None and experiment_name in experiment_names):
                print("experiment", experiment_name, "has already been processed")
                continue
            if experiment_names is not None:
                experiment_names.add(experiment_name)
            run_params = method.construct_run_params(base_dir, experiment_name, data_file, delim, cor_measure,
                                                     distance_measure, distance_attr, method_attr)
            params.append(run_params)
            print("prepared parameters for", experiment_name)
    return params

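# Experiment names produced above follow the pattern
# <data_name>__<distance_name>__<method_name>, where method_name is built by
# Method.construct_method_name (defined in methods.py), e.g. a hypothetical
# "cubes_10_03_i__ID_t03__<method_name>".
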
def collect_experiment_params(base_dir):
    # `collect` is passed to util.datasets_iterator; only `name` is used here,
    # the remaining arguments describe the dataset and are ignored
    def collect(name, interaction_type, rows, rf, i, c, offset):
        params = []
        experiment_names = set()
        file_path = cst.DATA_DIR + name + ".csv"
        for method in [
            Method.SM_GREEDY_TOPK,
            Method.SM_BEST_FIRST,
            Method.PREDEFINED_GREEDY_OPTIMAL,
            Method.PREDEFINED_OPTIMAL,
            Method.PREDEFINED_FULL,
            Method.PREDEFINED_NAIVE,
            Method.TRIVIAL,
        ]:
            for distance_measure in [cst.DistanceMeasure.FID]:
                print("preparing", name, method, distance_measure)
                # cor_measure = cst.CorrelationMeasure.ID
                for cor_measure in [cst.CorrelationMeasure.FID, cst.CorrelationMeasure.ID]:
                    params.extend(prepare(base_dir, file_path, method, cor_measure=cor_measure,
                                          distance_measure=distance_measure, experiment_names=experiment_names))
        return params

    return util.datasets_iterator(collect)

# def collect_real_experiment_params(base_dir):
#
#     def collect(name):
#         params = []
#         file_path = cst.DATA_DIR + name
#
#         for method in [
#             # Method.SM_GREEDY_TOPK,
#             # Method.SM_HET_GREEDY_TOPK,
#             # Method.SM_BEST_FIRST,
#             # Method.SM_BEAM_SEARCH,
#             # Method.SM_HET_BEAM_SEARCH,
#             Method.PREDEFINED_SUBSPACESETS_SYNCHRONOUS_GREEDY,
#             # Method.PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT,
#             # Method.PREDEFINED_OPTIMAL_SUBSPACESET,
#             Method.PREDEFINED_SUBSPACESETS_NAIVE,
#             # Method.PREDEFINED_SUBSPACESETS_SYNCHRONOUS_OPTIMAL,
#             # Method.FULL,
#             # Method.PREDEFINED_SUBSPACESETS,
#             # Method.TRIVIAL,
#         ]:
#             print("preparing", name, method)
#             params.extend(prepare(base_dir, file_path, method, cor_measure=cst.CorrelationMeasure.ID))
#
#         return params
#
#     all_params = []
#     for f in os.listdir(cst.DATA_DIR):
#         param = collect(f)
#         if type(param) == list:
#             all_params.extend(param)
#         else:
#             all_params.append(param)
#     return all_params

if __name__ == "__main__":
    # params = collect_experiment_params("")
    # exit(1)
    # with open(cst.BASE + 'logs_naive/experiments_list.txt', 'w') as f:
    #     for p in params:
    #         f.write(p.experiment_name + "\n")
    # exit()

    # params = collect_experiment_params("logs_test")
    # sm_runtime = []
    #
    # for p in params:
    #     if p.method.name.startswith('SM'):
    #         if not os.path.exists(p.base_dir + p.experiment_name + "/log.txt"):
    #             print(p.base_dir + p.experiment_name + "/log.txt", 'does not exist')
    #             continue
    #         with open(p.base_dir + p.experiment_name + "/log.txt", 'r') as f:
    #             for l in f.readlines():
    #                 # capture group added so that search.group(1) works
    #                 search = re.search(r'sm runtime (\d+\.\d+) seconds', l)
    #                 if search:
    #                     sm_runtime.append(",".join([p.experiment_name, search.group(1) + "\n"]))
    #                     print('search', p.base_dir + p.experiment_name + "/log.txt")
    #                     break
    # with open(cst.BASE + "logs_test/sm_runtimes.txt", 'w') as f:
    #     f.writelines(sm_runtime)
    #
    # exit(0)
    # print(compute_predefined_subspace_sets_naive(5))
    # exit(1)
    # cubes_03_10_c
    # print(compute_predefined_subspace_sets(3, [[0,1,2]]))
    # exit(1)
    # params = collect_experiment_params("logs_test")
    # print(params)
    # print(compute_subspace_sets("cubes_10_03_i.csv", Method.PREDEFINED_SUBSPACESETS))
    # exit(1)
    if len(sys.argv) == 1:
        # print(
        #     'Usage: main.py '
        #     '-b=<logs base dir> '
        #     '-f=<data_file> '
        #     '-d=<delimiter> '
        #     '-c=<number of columns> '
        #     '-m=<[original|greedy_topk|trivial|...]> '
        #     '-cor=<[uds]> '
        #     '-dist=<[id, cjs]> '
        #     '-t=<threshold float> '
        #     '-s[=<subspace>] '
        #     '-r=<number of rows> ')
        # command = '-b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID'
        # print('Running default: ', command)
        # command_list = command.split(' ')
        raise ValueError("no arguments passed!")
    else:
        command_list = sys.argv[1:]

    file_arg = list(filter(lambda x: x.startswith("-f="), command_list))
    if not file_arg:
        raise ValueError('No data file provided!')
    base_dir_arg = list(filter(lambda x: x.startswith("-b="), command_list))
    if not base_dir_arg:
        raise ValueError('No logs base dir provided!')
    delim_arg = list(filter(lambda x: x.startswith("-d="), command_list))
    method_arg = list(filter(lambda x: x.startswith("-m="), command_list))
    corr_measure_arg = list(filter(lambda x: x.startswith("-cor="), command_list))
    distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list))
    footprints_number_arg = list(filter(lambda x: x.startswith("-fn="), command_list))
    threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list))
    trivial_bins_count_arg = list(filter(lambda x: x.startswith("-tb="), command_list))

    data_file = file_arg[0].replace('-f=', '')
    base_dir = base_dir_arg[0].replace('-b=', '')
    if delim_arg:
        delimiter = delim_arg[0].replace('-d=', '')
    else:
        print('using default delimiter ;')
        delimiter = ';'
    if method_arg:
        method = Method[method_arg[0].replace('-m=', '').upper()]
    else:
        raise ValueError("no method is passed!")
    cor_measure = cst.CorrelationMeasure[corr_measure_arg[0].replace('-cor=', '').upper()] if corr_measure_arg \
        else None
    if method.name.startswith("SM") and cor_measure is None:
        raise ValueError('A correlation measure should be given!')
    if distance_measure_arg:
        distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()]
        print('using distance measure ' + distance_measure.name)
    else:
        distance_measure = cst.DistanceMeasure.ID
        print('using default distance measure ID')
    if threshold_arg:
        threshold = float(threshold_arg[0].replace('-t=', ''))
        print('using ID_THRESHOLD_QUANTILE =', str(threshold))
    else:
        threshold = cst.ID_THRESHOLD_QUANTILE
        print('using default ID_THRESHOLD_QUANTILE =', str(threshold))
    footprints_number = int(footprints_number_arg[0].replace('-fn=', '')) if footprints_number_arg else None
    trivial_bins_count = None
    if method is Method.TRIVIAL:
        if trivial_bins_count_arg:
            trivial_bins_count = int(trivial_bins_count_arg[0].replace('-tb=', ''))
            print('trivial bins count =', str(trivial_bins_count))
        else:
            print('trivial bins count is default (will be computed automatically)')
    params = prepare(base_dir, data_file, method, delimiter, distance_measure, cor_measure,
                     threshold, footprints_number)
    print(params)
    for p in params:
        result = execute(p)
        store(result)
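
# Illustrative invocation (flags as parsed above; values are examples only):
#   python main.py -b=logs -f=synthetic_cases/synthetic_3d_parity_problem.csv \
#       -d=";" -m=trivial -dist=ID -tb=20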