Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import numpy as np
import pandas as pd
import ID_sm as idsm
from constants import CorrelationMeasure
from uds import compute_uds
import footprint as f
#
# def compute_measure(data, curr, cor_measure):
# if cor_measure is CorrelationMeasure.UDS:
# return compute_uds(data)
# elif cor_measure is CorrelationMeasure.ID:
# return idsm.compute_subspace_interaction_measure(curr, data)
# else:
# ValueError('No implementation!')
def compute_measure_value(bin_map, curr, data, curr_points, dim_maxes, cor_measure):
if cor_measure is CorrelationMeasure.UDS:
return compute_uds(data), None
elif cor_measure is CorrelationMeasure.ID:
return idsm.compute_subspace_interaction_measure(bin_map, curr, data, curr_points, dim_maxes)
elif cor_measure is CorrelationMeasure.FID:
return f.compute_subspace_interaction_measure(bin_map, curr, data, curr_points, dim_maxes)
else:
ValueError('No implementation!')
def greedy_topk(bin_map, curr, data, curr_points, dim_maxes, k, cor_measure):
'''
returns a list of correlated dimensions of size no more than k using the given correlation measure
:param cor_measure:
:param curr:
:param data:
:param k:
:return: list of dimensions
'''
dims = data.columns.tolist()
dims.remove(curr)
if data.shape[1] - 1 <= k:
return dims, None
scores = []
for dim in dims:
score, dists = compute_measure_value(bin_map, curr, data.loc[:, [curr, dim]], curr_points, dim_maxes, cor_measure)
# print(dim, uds)
scores.append(score)
order = np.argsort(scores).tolist()
order.reverse()
return [dims[o] for o in order[:k]], None
def het_greedy_topk(bin_map, curr, data, curr_points, dim_maxes, k, delta, cor_measure):
dims = data.columns.tolist()
dims.remove(curr)
if data.shape[1] - 1 <= k:
return dims, None
scores = []
for dim in dims:
score, dists = compute_measure_value(bin_map, curr, data.loc[:, [curr, dim]], curr_points, dim_maxes, cor_measure)
# print(dim, uds)
scores.append(score)
order = np.argsort(scores).tolist()
order.reverse()
diverse_dims_id = []
for i, o in enumerate(order):
delete_flag = False
for p in order[:i]:
if p not in diverse_dims_id:
continue
if compute_measure_value(bin_map, dims[o], data.loc[:, [dims[o], dims[p]]], curr_points, dim_maxes, cor_measure)[0] > delta:
delete_flag = True
continue
if not delete_flag:
diverse_dims_id.append(o)
if len(diverse_dims_id) == k:
return [dims[l] for l in diverse_dims_id], None
return [dims[o] for o in diverse_dims_id], None
class SubspaceOutcome:
def __init__(self, subspace, score, dists=None):
self.dists = dists
self.score = score
self.subspace = subspace
def __lt__(self, other):
return self.score < other.score
def __repr__(self):
return '(' + str(self.subspace) + ', ' + str(self.score) + ')'
def best_first(bin_map, curr, data, curr_points, dim_maxes, k, cor_measure):
dims = set(data.columns.tolist())
level = 1
best = [SubspaceOutcome({curr}, 0)]
while level <= k and level < len(dims):
outcomes = set()
for j in dims - best[level - 1].subspace:
s = best[level - 1].subspace.union({j})
score, dists = compute_measure_value(bin_map, curr, data.loc[:, s], curr_points, dim_maxes, cor_measure)
outcomes.add(SubspaceOutcome(s, score, dists))
best.append(sorted(outcomes, reverse=True)[0])
level += 1
result = sorted(best, reverse=True)[0]
return result.subspace - {curr}, result.dists
def beam_search(bin_map, curr, data, curr_points, dim_maxes, k, beam_width, cor_measure):
dims = set(data.columns.tolist())
level = 1
best = [[SubspaceOutcome({curr}, 0)]]
while level <= k and level < len(dims):
outcomes = set()
for b in best[level - 1]:
for j in dims - b.subspace:
s = b.subspace.union({j})
score, dists = compute_measure_value(bin_map, curr, data.loc[:, s], curr_points, dim_maxes, cor_measure)
outcomes.add(SubspaceOutcome(s, score, dists))
best.append(sorted(outcomes, reverse=True)[:beam_width])
level += 1
subspace_outcome = sorted([a[0] for a in best], reverse=True)[0]
return subspace_outcome.subspace - {curr}, subspace_outcome.dists
def het_beam_search(bin_map, curr, data, curr_points, dim_maxes, k, delta, beam_width, cor_measure):
dims = set(data.columns.tolist())
level = 1
best = [[SubspaceOutcome({curr}, 0)]]
while level <= k and level < len(dims):
outcomes = set()
for b in best[level - 1]:
for j in dims - b.subspace:
s = b.subspace.union({j})
score, dists = compute_measure_value(bin_map, curr, data.loc[:, s], curr_points, dim_maxes, cor_measure)
outcomes.add(SubspaceOutcome(s, score, dists))
outcomes = sorted(outcomes, reverse=True)
# checking if subspace of c does not correlate with the subspaces seen earlier
diverse_corr = []
for i, c in enumerate(outcomes):
delete_flag = False
for corr_l in outcomes[:i]:
if corr_l not in diverse_corr:
continue
if compute_measure_value(bin_map, curr, data.loc[:, c.subspace.union(corr_l.subspace)], curr_points, dim_maxes, cor_measure)[0] > delta:
delete_flag = True
continue
if not delete_flag:
diverse_corr.append(c)
if len(diverse_corr) == beam_width:
break
best.append(diverse_corr)
level += 1
result = sorted([a[0] for a in best], reverse=True)[0]
return result.subspace - {curr}, result.dists
if __name__ == '__main__':
data = pd.read_csv('synthetic_cases/synthetic_with_nearcopies_4_3.csv', delimiter=';', header=None, na_values='?')
# data = data.loc[:, :data.shape[1] - 2]
# print(str(beam_search(data, 4, 4, cst.BEAM_WIDTH, cst.CorrelationMeasure.UDS)))