Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ipd_extended/methods.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
306 lines (242 sloc)
12.7 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import os | |
from enum import Enum | |
import objects as o | |
import constants as cst | |
import util | |
import random | |
def compute_predefined_subspace_sets(rel_features, ideal_subspace_set):
    """Grow the two-dimensional cores of the ideal subspaces one random dimension at a time.

    Every ideal subspace is first cut down to its first two dimensions. The
    remaining dimensions are then handed back, one per step and in random
    order (``random.choice``), to a subspace they were taken from. A snapshot
    of the growing subspace set is recorded for every step whose index is a
    multiple of ``cst.SUBSPACE_SET_STEP``.

    :param rel_features: number used to size the dimension bookkeeping and the
        number of growth steps (``rel_features - 2 * len(ideal_subspace_set) - 1``)
    :param ideal_subspace_set: list of subspaces, each a list of dimension indices
    :return: list of subspace sets, starting with the set of two-dimensional
        cores; an empty list if every ideal subspace already has exactly two dimensions
    """
    # For every dimension: indices of the ideal subspaces it was trimmed from.
    owners = {dim: [] for dim in range(rel_features)}

    # Nothing to grow when all ideal subspaces are already minimal.
    if all(len(subspace) == 2 for subspace in ideal_subspace_set):
        return []

    cores = []
    remaining = []
    for index, ideal_subspace in enumerate(ideal_subspace_set):
        # two is the minimal number of interacting dimensions
        cores.append(ideal_subspace[:2])
        for dim in ideal_subspace[2:]:
            owners[dim].append(index)
            remaining.append(dim)

    snapshots = [cores]
    current = cores
    steps = rel_features - len(ideal_subspace_set) * 2 - 1
    for step in range(steps):
        dim = random.choice(remaining)
        remaining.remove(dim)
        target = owners[dim][0]
        if len(owners[dim]) > 1:
            owners[dim].pop()
        else:
            del owners[dim]

        # Work on copies so previously recorded snapshots stay untouched.
        current = [subspace.copy() for subspace in current]
        current[target].append(dim)
        # NOTE(review): inside range(steps) the test `step == steps` never holds,
        # so only the modulo test selects snapshots.
        if step % cst.SUBSPACE_SET_STEP == 0 or step == steps:
            snapshots.append(current)
    return snapshots
def compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subspace_set, greedy):
    """Grow all ideal subspaces in lockstep, one dimension per round.

    The first subspace set holds the first two dimensions of every ideal
    subspace. In each following round every subspace receives its next ideal
    dimension. A subspace whose ideal dimensions are exhausted is

    * padded with a fresh index ``rel_features + k`` (``k`` counts the paddings
      handed out so far) when ``greedy`` is true, or
    * carried over unchanged when ``greedy`` is false.

    Carrying the subspace over keeps position ``j`` of every subspace set
    aligned with ``ideal_subspace_set[j]``; dropping it (the previous
    behaviour) shifted later subspaces onto the wrong ideal subspace.

    :param rel_features: first index used for padding dimensions
    :param ideal_subspace_set: list of subspaces, each a list of dimension indices
    :param greedy: whether exhausted subspaces are padded with extra dimensions
    :return: list of subspace sets without the ideal subspace set itself; an
        empty list if every ideal subspace already has exactly two dimensions
    """
    # If the ideal subspace set is already minimal there are no other subspace sets.
    if all(len(subspace) == 2 for subspace in ideal_subspace_set):
        return []

    max_subspace_size = max(len(subspace) for subspace in ideal_subspace_set)
    # every subspace starts with 2 dims
    subspace_sets = [[subspace[:2] for subspace in ideal_subspace_set]]

    irr_counter = 0
    for position in range(2, max_subspace_size):
        grown = []
        for ideal_subspace, current in zip(ideal_subspace_set, subspace_sets[-1]):
            if len(ideal_subspace) > position:
                grown.append(current + [ideal_subspace[position]])
            elif greedy:
                grown.append(current + [rel_features + irr_counter])
                irr_counter += 1
            else:
                # Exhausted subspace: keep it so indices stay aligned.
                grown.append(current.copy())
        subspace_sets.append(grown)

    # The ideal set itself is not part of the result.
    if ideal_subspace_set in subspace_sets:
        subspace_sets.remove(ideal_subspace_set)
    return subspace_sets
def compute_predefined_subspace_sets_naive(rel_features):
    """Partition a random ordering of all dimensions into equally sized chunks.

    The dimensions ``0 .. rel_features + cst.TOTAL_IRRELEVANT_FEATURES - 1``
    are shuffled once; the same shuffled order is then split with
    ``util.chunks`` for every chunk size in ``cst.NAIVE_CHUNK_SIZE_RANGE_LIST``.

    :param rel_features: count added to ``cst.TOTAL_IRRELEVANT_FEATURES`` to get
        the total number of dimensions
    :return: one subspace set (list of chunks) per configured chunk size
    """
    total_dims = rel_features + cst.TOTAL_IRRELEVANT_FEATURES
    shuffled = list(range(total_dims))
    random.shuffle(shuffled)

    partitions = []
    for chunk_size in cst.NAIVE_CHUNK_SIZE_RANGE_LIST:
        partition = list(util.chunks(shuffled, chunk_size))
        # A trailing chunk with a single dimension is merged into its predecessor.
        if len(partition[-1]) == 1:
            partition[-2].extend(partition[-1])
            partition.pop()
        partitions.append(partition)
    return partitions
# Ideal subspace sets, loaded once at import time from the JSON file named by
# cst.PERFECT_SUBSPACES_JSON. Stays None when that file does not exist.
# Presumably a dict keyed by data file name without ".csv" (see
# get_ideal_subspace_set) -- TODO confirm against the JSON file.
ideal = None
if os.path.exists(cst.PERFECT_SUBSPACES_JSON):
    with open(cst.PERFECT_SUBSPACES_JSON, "r") as f:
        ideal = json.load(f)
def get_ideal_subspace_set(data_file_name):
    """Look up the ideal subspace set registered for a data file.

    The lookup key is ``data_file_name`` with every occurrence of ``".csv"``
    removed.

    :param data_file_name: name of the data file, e.g. ``"name.csv"``
    :return: the result of ``ideal.get(key)``
    :raises AttributeError: if the module-level ``ideal`` is still ``None``
    """
    lookup_key = data_file_name.replace(".csv", "")
    return ideal.get(lookup_key)
def compute_subspace_sets(data_file_name, method):
    """Compute the predefined subspace sets of a data file for the given method.

    NOTE(review): none of the ``Method`` members compared against below are
    among the members defined in the ``Method`` enum of this file -- confirm
    whether this function is still in use.

    :param data_file_name: name of the data file; passed to
        ``util.parse_relevant_features`` and ``get_ideal_subspace_set``
    :param method: ``Method`` member whose name starts with ``"PREDEFINED"``
    :return: list of subspace sets
    :raises ValueError: if the method is not handled by any branch
    """
    assert method.name.startswith("PREDEFINED")
    rel_features = util.parse_relevant_features(data_file_name)
    ideal_subspace_set = get_ideal_subspace_set(data_file_name)

    if method is Method.PREDEFINED_OPTIMAL_SUBSPACESET:
        return [ideal_subspace_set]

    if method is Method.PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT:
        redundant_subspace_sets = []
        for irr in [i + rel_features for i in cst.IRRELEVANT_FEATURES_RANGE_LIST]:
            # Indices rel_features .. irr - 1 are appended to every ideal subspace.
            irr_subspace = list(range(rel_features, irr))
            rss = [ideal_subspace + irr_subspace for ideal_subspace in ideal_subspace_set]
            redundant_subspace_sets.append(rss)
        return redundant_subspace_sets

    if method is Method.PREDEFINED_SUBSPACESETS:
        return compute_predefined_subspace_sets(rel_features, ideal_subspace_set)
    elif method is Method.PREDEFINED_SUBSPACESETS_SYNCHRONOUS_GREEDY:
        return compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subspace_set, True)
    elif method is Method.PREDEFINED_SUBSPACESETS_SYNCHRONOUS_OPTIMAL:
        return compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subspace_set, False)
    elif method is Method.PREDEFINED_SUBSPACESETS_NAIVE:
        return compute_predefined_subspace_sets_naive(rel_features)
    else:
        # str() is required: concatenating an Enum member to a str raises TypeError.
        raise ValueError("the method has not been implemented yet! " + str(method))
def _compute_predefined_naive_attrs(data_file_name):
    """Pair every feature of the ideal subspaces with its own extra feature index.

    The features of all ideal subspaces are visited in order; the n-th one
    (counting from 0) is paired with index ``rf + n``, where ``rf`` is the
    value returned by ``util.parse_relevant_features`` (presumably the first
    irrelevant feature index -- TODO confirm).

    :param data_file_name: name of the data file
    :return: a one-element list ``[(subspace_set, 2)]`` of two-dimensional subspaces
    """
    first_extra = util.parse_relevant_features(data_file_name)
    ideal_subspace_set = get_ideal_subspace_set(data_file_name)
    features = (feature for subspace in ideal_subspace_set for feature in subspace)
    pairs = [[feature, first_extra + offset] for offset, feature in enumerate(features)]
    return [(pairs, 2)]
def _construct_trivial_method_name(cor_measure, tb): | |
return "_tb" + str(tb) if tb is not None else "" | |
def _compute_trivial_attrs(data_file_name):
    """Return the configured attributes for the trivial method.

    :param data_file_name: ignored; present for the shared attrs-computer signature
    :return: ``cst.TRIVIAL_BINS_COUNT_LIST`` itself (not a copy)
    """
    bins_counts = cst.TRIVIAL_BINS_COUNT_LIST
    return bins_counts
def _construct_sm_method_name(cor_measure, k): | |
return "_cor" + cor_measure.name + "_s" + str(k) | |
def _compute_sm_attrs(data_file_name):
    """Compute the candidate ``k`` values for the SM_* methods.

    :param data_file_name: name of the data file, parsed with
        ``util.parse_relevant_features``
    :return: ``[int(r / 2), r, 2 * r]`` for the parsed value ``r``, or
        ``cst.DEFAULT_SM_K_RANGE`` when nothing could be parsed (``None``)
    """
    rel_features = util.parse_relevant_features(data_file_name)
    if rel_features is not None:
        half = int(rel_features / 2)
        double = rel_features * 2
        return [half, rel_features, double]
    # No relevant-feature count in the name: the data is real.
    return cst.DEFAULT_SM_K_RANGE
def _construct_predefined_method_name(cor_measure, attr): | |
return "_s" + str(attr[1]) | |
def _compute_predefined_optimal_attrs(data_file_name):
    """Return the ideal subspace set as the only attribute.

    :param data_file_name: name of the data file
    :return: a one-element list ``[(ideal_subspace_set, None)]``
    """
    return [(get_ideal_subspace_set(data_file_name), None)]
def _compute_predefined_full_attrs(data_file_name):
    """Build single-subspace sets holding all ideal features plus growing padding.

    The first attribute contains one subspace with every feature of every
    ideal subspace. For each ``irr`` in ``cst.IRRELEVANT_FEATURES_RANGE_LIST``
    a further attribute follows whose subspace additionally contains the
    indices ``rf .. rf + irr - 1``, where ``rf`` is the value returned by
    ``util.parse_relevant_features``.

    :param data_file_name: name of the data file
    :return: list of ``(subspace_set, subspace_size)`` tuples
    """
    rf = util.parse_relevant_features(data_file_name)
    ideal_subspace_set = get_ideal_subspace_set(data_file_name)

    all_features = []
    for subspace in ideal_subspace_set:
        all_features.extend(subspace)
    base_size = len(all_features)

    attrs = [([all_features], base_size)]
    for irr in cst.IRRELEVANT_FEATURES_RANGE_LIST:
        padded = all_features + [rf + offset for offset in range(irr)]
        attrs.append(([padded], base_size + irr))
    return attrs
def _compute_predefined_greedy_optimal_attrs(data_file_name):
    """Grow the ideal subspaces in lockstep, then pad all of them with extra indices.

    Phase 1 starts from the first two dimensions of every ideal subspace and
    adds one dimension per round. A subspace takes its next ideal dimension,
    or, once exhausted, a fresh index ``rf + k`` (``k`` counts the fresh
    indices handed out so far), until the largest ideal subspace size is reached.

    Phase 2 walks ``cst.IRRELEVANT_FEATURES_RANGE_LIST``: for each entry every
    subspace of the latest set is extended by the indices
    ``rf + k + j`` for ``j`` from the previous entry (0 at the start) up to,
    but excluding, the current entry.

    :param data_file_name: name of the data file
    :return: list of ``(subspace_set, size_label)`` tuples; the label is ``2``
        for the initial set, the round's subspace size in phase 1 and
        ``largest ideal size + entry`` in phase 2
    """
    ideal_subspace_set = get_ideal_subspace_set(data_file_name)
    rf = util.parse_relevant_features(data_file_name)

    # every subspace starts with 2 dims
    cores = [subspace[:2] for subspace in ideal_subspace_set]
    largest = max((len(subspace) for subspace in ideal_subspace_set), default=0)
    attrs = [(cores, 2)]

    # Phase 1: one dimension per round for every subspace.
    extra_used = 0
    for position in range(2, largest):
        grown = []
        for ideal_subspace, current in zip(ideal_subspace_set, attrs[-1][0]):
            if len(ideal_subspace) > position:
                grown.append(current + [ideal_subspace[position]])
            else:
                grown.append(current + [rf + extra_used])
                extra_used += 1
        attrs.append((grown, position + 1))

    # Phase 2: the same block of extra indices is appended to every subspace.
    lower = 0
    for upper in cst.IRRELEVANT_FEATURES_RANGE_LIST:
        widened = [
            current + [rf + extra_used + offset for offset in range(lower, upper)]
            for current in attrs[-1][0]
        ]
        attrs.append((widened, largest + upper))
        lower = upper
    return attrs
def _construct_default_method_name(cor_measure, attr): | |
return "" | |
def _compute_default_attrs(data_file_name): | |
return [None] | |
class Method(Enum):
    """Enumeration of the available methods.

    The value of every member is a 4-tuple
    ``(construct_method_name0, compute_attrs0, construct_run_params0, id)``;
    ``__init__`` stores the four elements as attributes of the same names.
    The three callables are plain functions held as instance attributes, so
    they are not bound to the member when called.
    """

    def __init__(self, construct_method_name0, compute_attrs0, construct_run_params0, id):
        # Enum passes the elements of the member's tuple value as positional arguments.
        self.construct_run_params0 = construct_run_params0
        self.id = id
        self.compute_attrs0 = compute_attrs0
        self.construct_method_name0 = construct_method_name0

    def __getstate__(self):
        """Return a state dict containing only the member's name."""
        return {'name': self.name}

    def construct_method_name(self, cor_measure, attr):
        """Return the member name without underscores plus the member-specific suffix.

        The suffix is produced by ``construct_method_name0(cor_measure, attr)``.
        """
        return self.name.replace("_", "") \
               + self.construct_method_name0(cor_measure, attr)

    def compute_attrs(self, data_file_name):
        """Return the result of the member's ``compute_attrs0`` for ``data_file_name``."""
        return self.compute_attrs0(data_file_name)

    def construct_run_params(self, base_dir, experiment_name, data_file, delim, cor_measure,
                             distance_measure, dist_attr,
                             method_attr):
        """Delegate to the member's ``construct_run_params0``.

        Note the argument order: the stored function receives this member and
        ``method_attr`` first, followed by the remaining arguments.
        """
        # The member is passed explicitly because construct_run_params0 is an unbound plain function.
        return self.construct_run_params0(self, method_attr, base_dir, experiment_name, data_file, delim, cor_measure,
                                          distance_measure, dist_attr)

    # The helpers below are defined inside the class body before the members so
    # that the member tuples can reference them by name.

    def _construct_default_run_params(self, method_attr, base_dir, experiment_name, data_file, delim, cor_measure,
                                      distance_measure, dist_attr):
        """Build ``o.RunParams`` without method-specific keyword arguments.

        ``method_attr`` and ``cor_measure`` are not used.
        """
        return o.RunParams(base_dir, experiment_name, self, data_file, delim, distance_measure, dist_attr)

    def _construct_predefined_run_params(self, method_attr, base_dir, experiment_name, data_file, delim, cor_measure,
                                         distance_measure, dist_attr):
        """Build ``o.RunParams`` with ``subspace_set`` set to ``method_attr[0]``.

        ``cor_measure`` is not used.
        """
        return o.RunParams(base_dir, experiment_name, self, data_file, delim, distance_measure, dist_attr,
                           subspace_set=method_attr[0])

    def _construct_sm_run_params(self, method_attr, base_dir, experiment_name, data_file, delim, cor_measure,
                                 distance_measure, dist_attr):
        """Build ``o.RunParams`` with ``cor_measure`` and ``sm_k=method_attr``."""
        return o.RunParams(base_dir, experiment_name, self, data_file, delim, distance_measure, dist_attr,
                           cor_measure=cor_measure, sm_k=method_attr)

    def _construct_trivial_run_params(self, method_attr, base_dir, experiment_name, data_file, delim, cor_measure,
                                      distance_measure, dist_attr):
        """Build ``o.RunParams`` with ``trivial_bins_count=method_attr``.

        ``cor_measure`` is not used.
        """
        return o.RunParams(base_dir, experiment_name, self, data_file, delim, distance_measure, dist_attr,
                           trivial_bins_count=method_attr)

    # Members: (name-suffix constructor, attrs computer, run-params constructor, id).
    # The ids are distinct, so no two members share a value.
    TRIVIAL = (_construct_trivial_method_name, _compute_trivial_attrs, _construct_trivial_run_params, 0)
    SM_GREEDY_TOPK = (_construct_sm_method_name, _compute_sm_attrs, _construct_sm_run_params, 1)
    SM_HET_GREEDY_TOPK = (_construct_sm_method_name, _compute_sm_attrs, _construct_sm_run_params, 2)
    SM_BEST_FIRST = (_construct_sm_method_name, _compute_sm_attrs, _construct_sm_run_params, 3)
    SM_BEAM_SEARCH = (_construct_sm_method_name, _compute_sm_attrs, _construct_sm_run_params, 4)
    SM_HET_BEAM_SEARCH = (_construct_sm_method_name, _compute_sm_attrs, _construct_sm_run_params, 5)
    PREDEFINED_FULL = (_construct_predefined_method_name, _compute_predefined_full_attrs, _construct_predefined_run_params, 6)
    PREDEFINED_GREEDY_OPTIMAL = (_construct_predefined_method_name, _compute_predefined_greedy_optimal_attrs, _construct_predefined_run_params, 7)
    PREDEFINED_NAIVE = (_construct_default_method_name, _compute_predefined_naive_attrs, _construct_predefined_run_params, 8)
    PREDEFINED_OPTIMAL = (_construct_default_method_name, _compute_predefined_optimal_attrs, _construct_predefined_run_params, 9)
if __name__ == '__main__':
    # Manual smoke check: print the attributes every PREDEFINED_* method
    # derives for one sample data file.
    sample_file = 'cubes_n1000_r4_i1_c1.csv'
    predefined_methods = (
        Method.PREDEFINED_GREEDY_OPTIMAL,
        Method.PREDEFINED_FULL,
        Method.PREDEFINED_NAIVE,
        Method.PREDEFINED_OPTIMAL,
    )
    for predefined_method in predefined_methods:
        print(predefined_method.compute_attrs(sample_file))