Skip to content
Permalink
b1a05852fb
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
306 lines (242 sloc) 12.7 KB
import json
import os
from enum import Enum
import objects as o
import constants as cst
import util
import random
def compute_predefined_subspace_sets(rel_features, ideal_subspace_set):
    """Build a sequence of subspace sets that grow randomly towards the ideal one.

    Every ideal subspace is first cut down to its first two dimensions. The
    remaining dimensions are then handed back one at a time, in random order,
    each to a subspace it belongs to in ``ideal_subspace_set``. A snapshot of
    the current subspace set is recorded every ``cst.SUBSPACE_SET_STEP`` steps.

    :param rel_features: number of relevant features; dimensions in
        ``ideal_subspace_set`` are expected to lie in ``range(rel_features)``.
    :param ideal_subspace_set: list of subspaces, each a list of dimension indices.
    :return: list of subspace sets; empty when every ideal subspace already
        has exactly two dimensions.
    """
    # if the ideal_subspace_set is already minimal there are no other subspace sets
    if all(len(s) == 2 for s in ideal_subspace_set):
        return []

    init_subset = []
    # dimension -> indices of the subspaces that still have to receive it
    dim_map = {i: [] for i in range(rel_features)}
    # dimensions left for considering (one entry per pending assignment)
    dims = []
    for e, ideal_subspace in enumerate(ideal_subspace_set):
        # every subspace consists of at least 2 dims
        init_subset.append(ideal_subspace[:2])
        for i in ideal_subspace[2:]:
            dim_map[i].append(e)
            dims.append(i)

    subspace_sets = [init_subset]
    last = init_subset
    # 2 is the minimal number of interacting dimensions; the "- 1" leaves one
    # dimension unassigned so the loop stops one step short of the full set.
    steps = rel_features - len(ideal_subspace_set) * 2 - 1
    for i in range(steps):
        if not dims:
            # Nothing left to hand out (the ideal subspaces do not cover all
            # relevant features); stop instead of failing in random.choice.
            break
        d = random.choice(dims)
        dims.remove(d)
        # Consume the pending subspace that is actually used. Reading index 0
        # while popping the last entry would serve the same subspace twice.
        subspace = dim_map[d].pop(0)
        if not dim_map[d]:
            del dim_map[d]
        subset = [ss.copy() for ss in last]
        subset[subspace].append(d)
        # NOTE(review): the original condition also had "or i == steps", which
        # can never hold inside range(steps). If the final step was meant to
        # be recorded unconditionally, compare against steps - 1 instead.
        if i % cst.SUBSPACE_SET_STEP == 0:
            subspace_sets.append(subset)
        last = subset
    return subspace_sets
def compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subspace_set, greedy):
    """Grow all subspaces in lock-step, one dimension per round.

    The first subspace set holds the first two dimensions of every ideal
    subspace. In round ``i`` each subspace receives its ``i``-th ideal
    dimension. A subspace that is already complete is either padded with a
    fresh dimension index ``>= rel_features`` (``greedy=True``) or carried
    over unchanged (``greedy=False``).

    :param rel_features: number of relevant features; padding indices start here.
    :param ideal_subspace_set: list of subspaces, each a list of dimension indices.
    :param greedy: whether complete subspaces keep growing with padding dimensions.
    :return: list of subspace sets, never containing ``ideal_subspace_set``
        itself; empty when every ideal subspace has exactly two dimensions.
    """
    # if the ideal_subspace_set is already minimal there are no other subspace sets
    if all(len(s) == 2 for s in ideal_subspace_set):
        return []

    max_subspace_size = max(len(s) for s in ideal_subspace_set)
    # every subspace starts with its first 2 dims
    subspace_sets = [[s[:2] for s in ideal_subspace_set]]
    irr_counter = 0
    for i in range(2, max_subspace_size):
        subset = []
        for j, ss in enumerate(subspace_sets[-1]):
            if len(ideal_subspace_set[j]) > i:
                subset.append(ss + [ideal_subspace_set[j][i]])
            elif greedy:
                subset.append(ss + [rel_features + irr_counter])
                irr_counter += 1
            else:
                # Keep the finished subspace. Dropping it would shift the
                # positions of the remaining subspaces against
                # ideal_subspace_set in the following rounds.
                subset.append(list(ss))
        subspace_sets.append(subset)

    # the ideal set itself is not one of the "other" subspace sets
    if ideal_subspace_set in subspace_sets:
        subspace_sets.remove(ideal_subspace_set)
    return subspace_sets
def compute_predefined_subspace_sets_naive(rel_features):
    """Partition a random permutation of all dimensions into fixed-size chunks.

    One subspace set is produced per chunk size in
    ``cst.NAIVE_CHUNK_SIZE_RANGE_LIST``; all of them are cut from the same
    shuffled list of ``rel_features + cst.TOTAL_IRRELEVANT_FEATURES`` dimensions.

    :param rel_features: number of relevant features.
    :return: list of subspace sets, one per configured chunk size.
    """
    dims = list(range(rel_features + cst.TOTAL_IRRELEVANT_FEATURES))
    random.shuffle(dims)
    subspace_sets = []
    for chunk in cst.NAIVE_CHUNK_SIZE_RANGE_LIST:
        ss = list(util.chunks(dims, chunk))
        # merge the last with the previous subspace, if the last consists only of 1 dimension;
        # with fewer than two chunks there is nothing to merge into
        if len(ss) > 1 and len(ss[-1]) == 1:
            ss[-2].extend(ss[-1])
            del ss[-1]
        subspace_sets.append(ss)
    return subspace_sets
# Ideal subspace sets loaded from cst.PERFECT_SUBSPACES_JSON at import time.
# The name stays bound to None when that path does not exist.
ideal = None
if os.path.exists(cst.PERFECT_SUBSPACES_JSON):
    with open(cst.PERFECT_SUBSPACES_JSON, mode="r") as f:
        ideal = json.loads(f.read())
def get_ideal_subspace_set(data_file_name):
    """Look up the ideal subspace set recorded for a data file.

    The lookup key is ``data_file_name`` with every ".csv" occurrence removed.

    :param data_file_name: name of the data file, e.g. "cubes_n1000_r4_i1_c1.csv".
    :return: the stored subspace set, or None when there is no entry for the
        file or when no ideal subspaces were loaded at import time.
    """
    # `ideal` remains None when the JSON file was missing at import time;
    # treat that like a missing key instead of raising AttributeError.
    if ideal is None:
        return None
    return ideal.get(data_file_name.replace(".csv", ""))
def compute_subspace_sets(data_file_name, method):
    """Compute the predefined subspace sets for a data file and a PREDEFINED_* method.

    Dispatch is done on ``method.name``. The method names handled here are not
    members of the ``Method`` enum visible in this file, so comparing against
    ``Method.<NAME>`` attributes would fail with AttributeError.

    :param data_file_name: name of the data file.
    :param method: enum member whose name starts with "PREDEFINED".
    :return: list of subspace sets for the requested method.
    :raises ValueError: if the method is not handled here.
    """
    assert method.name.startswith("PREDEFINED")
    rel_features = util.parse_relevant_features(data_file_name)
    ideal_subspace_set = get_ideal_subspace_set(data_file_name)
    method_name = method.name

    if method_name == "PREDEFINED_OPTIMAL_SUBSPACESET":
        return [ideal_subspace_set]

    if method_name == "PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT":
        redundant_subspace_sets = []
        for irr_count in cst.IRRELEVANT_FEATURES_RANGE_LIST:
            # irrelevant dimensions are numbered right after the relevant ones
            irr_subspace = list(range(rel_features, rel_features + irr_count))
            redundant_subspace_sets.append(
                [ideal_subspace + irr_subspace for ideal_subspace in ideal_subspace_set]
            )
        return redundant_subspace_sets

    if method_name == "PREDEFINED_SUBSPACESETS":
        return compute_predefined_subspace_sets(rel_features, ideal_subspace_set)
    if method_name == "PREDEFINED_SUBSPACESETS_SYNCHRONOUS_GREEDY":
        return compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subspace_set, True)
    if method_name == "PREDEFINED_SUBSPACESETS_SYNCHRONOUS_OPTIMAL":
        return compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subspace_set, False)
    if method_name == "PREDEFINED_SUBSPACESETS_NAIVE":
        return compute_predefined_subspace_sets_naive(rel_features)

    # str() is required: concatenating the enum member itself raises TypeError
    raise ValueError("the method has not been implemented yet! " + str(method))
def _compute_predefined_naive_attrs(data_file_name):
    """Pair every dimension of the ideal subspaces with its own extra dimension.

    The extra dimensions are numbered consecutively, starting at the value
    returned by ``util.parse_relevant_features``.

    :return: a single ``(subspace_set, 2)`` attribute tuple in a list.
    """
    first_extra = util.parse_relevant_features(data_file_name)
    ideal_subspace_set = get_ideal_subspace_set(data_file_name)
    flat_features = (feature for subspace in ideal_subspace_set for feature in subspace)
    pairs = [[feature, first_extra + offset] for offset, feature in enumerate(flat_features)]
    return [(pairs, 2)]
def _construct_trivial_method_name(cor_measure, tb):
    """Return the "_tb<tb>" name suffix, or an empty string when ``tb`` is None.

    ``cor_measure`` is accepted but not used.
    """
    if tb is None:
        return ""
    return "_tb" + str(tb)
def _compute_trivial_attrs(data_file_name):
    """Return the configured bin counts; ``data_file_name`` is not used."""
    bin_counts = cst.TRIVIAL_BINS_COUNT_LIST
    return bin_counts
def _construct_sm_method_name(cor_measure, k):
    """Return the "_cor<measure name>_s<k>" name suffix."""
    return "".join(["_cor", cor_measure.name, "_s", str(k)])
def _compute_sm_attrs(data_file_name):
    """Return the k values to try for a data file.

    When the file name yields a relevant-feature count, the values are half of
    it (truncated), the count itself and twice the count. Otherwise the
    configured default range is returned.
    """
    rel_features = util.parse_relevant_features(data_file_name)
    if rel_features is not None:
        half = int(rel_features / 2)
        double = rel_features * 2
        return [half, rel_features, double]
    # if the data is real there is no relevant-feature count to derive k from
    return cst.DEFAULT_SM_K_RANGE
def _construct_predefined_method_name(cor_measure, attr):
    """Return the "_s<n>" name suffix, where n is the second item of ``attr``.

    ``cor_measure`` is accepted but not used.
    """
    size = attr[1]
    return f"_s{size!s}"
def _compute_predefined_optimal_attrs(data_file_name):
    """Return the ideal subspace set as the only attribute, with no size tag."""
    optimal_attr = (get_ideal_subspace_set(data_file_name), None)
    return [optimal_attr]
def _compute_predefined_full_attrs(data_file_name):
    """Build single-subspace sets from all ideal dimensions plus extra ones.

    The first attribute holds one subspace made of every dimension of the
    ideal subspaces. Each following attribute adds ``irr`` extra dimensions,
    numbered from the relevant-feature count, for every ``irr`` in
    ``cst.IRRELEVANT_FEATURES_RANGE_LIST``.

    :return: list of ``(subspace_set, subspace_size)`` tuples.
    """
    rf = util.parse_relevant_features(data_file_name)
    ideal_subspace_set = get_ideal_subspace_set(data_file_name)

    relevant = []
    for subspace in ideal_subspace_set:
        relevant.extend(subspace)
    base_size = len(relevant)

    attrs = [([relevant], base_size)]
    for irr in cst.IRRELEVANT_FEATURES_RANGE_LIST:
        padded = relevant + [rf + ir for ir in range(irr)]
        attrs.append(([padded], base_size + irr))
    return attrs
def _compute_predefined_greedy_optimal_attrs(data_file_name):
    """Build subspace sets that grow in lock-step and are then padded.

    Phase 1 starts from the first two dimensions of every ideal subspace and
    adds one dimension per round. A subspace takes its next ideal dimension
    while it has one, otherwise a fresh index counted up from the
    relevant-feature count. Phase 2 appends the same block of further indices
    to every subspace, one block per entry of ``cst.IRRELEVANT_FEATURES_RANGE_LIST``.

    :return: list of ``(subspace_set, subspace_size)`` tuples.
    """
    ideal_subspace_set = get_ideal_subspace_set(data_file_name)
    rf = util.parse_relevant_features(data_file_name)

    # every subspace starts with its first 2 dims
    seed = [subspace[:2] for subspace in ideal_subspace_set]
    max_subspace_size = max((len(subspace) for subspace in ideal_subspace_set), default=0)
    subspace_sets = [(seed, 2)]

    # Phase 1: one extra dimension per subspace and round.
    irr_counter = 0
    for size in range(2, max_subspace_size):
        previous = subspace_sets[-1][0]
        grown = []
        for ideal_subspace, current in zip(ideal_subspace_set, previous):
            if len(ideal_subspace) > size:
                grown.append(current + [ideal_subspace[size]])
            else:
                grown.append(current + [rf + irr_counter])
                irr_counter += 1
        subspace_sets.append((grown, size + 1))

    # Phase 2: irr_counter is frozen here; each step adds the offsets between
    # the previous range entry (0 for the first step) and the current one.
    lower = 0
    for ir in cst.IRRELEVANT_FEATURES_RANGE_LIST:
        previous = subspace_sets[-1][0]
        padded = [
            current + [rf + irr_counter + j for j in range(lower, ir)]
            for current in previous
        ]
        subspace_sets.append((padded, max_subspace_size + ir))
        lower = ir
    return subspace_sets
def _construct_default_method_name(cor_measure, attr):
    """Return an empty name suffix; both arguments are ignored."""
    no_suffix = ""
    return no_suffix
def _compute_default_attrs(data_file_name):
    """Return a fresh one-element list holding None; the argument is ignored."""
    attrs = [None]
    return attrs
class Method(Enum):
    """Methods that can be run, each bundling three callables and an id.

    A member value is the tuple ``(construct_method_name0, compute_attrs0,
    construct_run_params0, id)``, which ``__init__`` unpacks into attributes
    of the same names.
    """

    def __init__(self, construct_method_name0, compute_attrs0, construct_run_params0, id):
        self.construct_method_name0 = construct_method_name0
        self.compute_attrs0 = compute_attrs0
        self.construct_run_params0 = construct_run_params0
        self.id = id

    def __getstate__(self):
        """Return a state dict that carries only the member name."""
        return dict(name=self.name)

    def construct_method_name(self, cor_measure, attr):
        """Return the member name without underscores plus the method-specific suffix."""
        prefix = self.name.replace("_", "")
        suffix = self.construct_method_name0(cor_measure, attr)
        return prefix + suffix

    def compute_attrs(self, data_file_name):
        """Return the attributes this method should be run with for the data file."""
        compute = self.compute_attrs0
        return compute(data_file_name)

    def construct_run_params(self, base_dir, experiment_name, data_file, delim, cor_measure,
                             distance_measure, dist_attr,
                             method_attr):
        """Build the run parameters by delegating to this member's builder."""
        # The builder is stored as a plain function on the member, so the
        # member itself has to be passed explicitly as the first argument.
        build = self.construct_run_params0
        return build(self, method_attr, base_dir, experiment_name, data_file, delim, cor_measure,
                     distance_measure, dist_attr)

    def _construct_default_run_params(self, method_attr, base_dir, experiment_name, data_file, delim, cor_measure,
                                      distance_measure, dist_attr):
        """Build run parameters without any method-specific keyword."""
        common = (base_dir, experiment_name, self, data_file, delim, distance_measure, dist_attr)
        return o.RunParams(*common)

    def _construct_predefined_run_params(self, method_attr, base_dir, experiment_name, data_file, delim, cor_measure,
                                         distance_measure, dist_attr):
        """Build run parameters carrying the subspace set from ``method_attr[0]``."""
        common = (base_dir, experiment_name, self, data_file, delim, distance_measure, dist_attr)
        return o.RunParams(*common, subspace_set=method_attr[0])

    def _construct_sm_run_params(self, method_attr, base_dir, experiment_name, data_file, delim, cor_measure,
                                 distance_measure, dist_attr):
        """Build run parameters carrying the correlation measure and ``sm_k``."""
        common = (base_dir, experiment_name, self, data_file, delim, distance_measure, dist_attr)
        return o.RunParams(*common, cor_measure=cor_measure, sm_k=method_attr)

    def _construct_trivial_run_params(self, method_attr, base_dir, experiment_name, data_file, delim, cor_measure,
                                      distance_measure, dist_attr):
        """Build run parameters carrying the trivial bins count."""
        common = (base_dir, experiment_name, self, data_file, delim, distance_measure, dist_attr)
        return o.RunParams(*common, trivial_bins_count=method_attr)

    # Members: (name suffix builder, attribute provider, run-params builder, id)
    TRIVIAL = (_construct_trivial_method_name, _compute_trivial_attrs, _construct_trivial_run_params, 0)
    SM_GREEDY_TOPK = (_construct_sm_method_name, _compute_sm_attrs, _construct_sm_run_params, 1)
    SM_HET_GREEDY_TOPK = (_construct_sm_method_name, _compute_sm_attrs, _construct_sm_run_params, 2)
    SM_BEST_FIRST = (_construct_sm_method_name, _compute_sm_attrs, _construct_sm_run_params, 3)
    SM_BEAM_SEARCH = (_construct_sm_method_name, _compute_sm_attrs, _construct_sm_run_params, 4)
    SM_HET_BEAM_SEARCH = (_construct_sm_method_name, _compute_sm_attrs, _construct_sm_run_params, 5)
    PREDEFINED_FULL = (
        _construct_predefined_method_name, _compute_predefined_full_attrs, _construct_predefined_run_params, 6)
    PREDEFINED_GREEDY_OPTIMAL = (
        _construct_predefined_method_name, _compute_predefined_greedy_optimal_attrs,
        _construct_predefined_run_params, 7)
    PREDEFINED_NAIVE = (
        _construct_default_method_name, _compute_predefined_naive_attrs, _construct_predefined_run_params, 8)
    PREDEFINED_OPTIMAL = (
        _construct_default_method_name, _compute_predefined_optimal_attrs, _construct_predefined_run_params, 9)
if __name__ == '__main__':
    # Manual smoke run: print the attributes of every PREDEFINED_* method
    # for one sample data file name.
    sample_file = 'cubes_n1000_r4_i1_c1.csv'
    for method in (
        Method.PREDEFINED_GREEDY_OPTIMAL,
        Method.PREDEFINED_FULL,
        Method.PREDEFINED_NAIVE,
        Method.PREDEFINED_OPTIMAL,
    ):
        print(method.compute_attrs(sample_file))