Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ipd_extended/subspace_mining.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
170 lines (142 sloc)
6.16 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import ID_sm as idsm | |
from constants import CorrelationMeasure | |
from uds import compute_uds | |
import footprint as f | |
# | |
# def compute_measure(data, curr, cor_measure): | |
# if cor_measure is CorrelationMeasure.UDS: | |
# return compute_uds(data) | |
# elif cor_measure is CorrelationMeasure.ID: | |
# return idsm.compute_subspace_interaction_measure(curr, data) | |
# else: | |
# ValueError('No implementation!') | |
def compute_measure_value(bin_map, curr, data, curr_points, dim_maxes, cor_measure): | |
if cor_measure is CorrelationMeasure.UDS: | |
return compute_uds(data), None | |
elif cor_measure is CorrelationMeasure.ID: | |
return idsm.compute_subspace_interaction_measure(bin_map, curr, data, curr_points, dim_maxes) | |
elif cor_measure is CorrelationMeasure.FID: | |
return f.compute_subspace_interaction_measure(bin_map, curr, data, curr_points, dim_maxes) | |
else: | |
ValueError('No implementation!') | |
def greedy_topk(bin_map, curr, data, curr_points, dim_maxes, k, cor_measure): | |
''' | |
returns a list of correlated dimensions of size no more than k using the given correlation measure | |
:param cor_measure: | |
:param curr: | |
:param data: | |
:param k: | |
:return: list of dimensions | |
''' | |
dims = data.columns.tolist() | |
dims.remove(curr) | |
if data.shape[1] - 1 <= k: | |
return dims, None | |
scores = [] | |
for dim in dims: | |
score, dists = compute_measure_value(bin_map, curr, data.loc[:, [curr, dim]], curr_points, dim_maxes, cor_measure) | |
# print(dim, uds) | |
scores.append(score) | |
order = np.argsort(scores).tolist() | |
order.reverse() | |
return [dims[o] for o in order[:k]], None | |
def het_greedy_topk(bin_map, curr, data, curr_points, dim_maxes, k, delta, cor_measure): | |
dims = data.columns.tolist() | |
dims.remove(curr) | |
if data.shape[1] - 1 <= k: | |
return dims, None | |
scores = [] | |
for dim in dims: | |
score, dists = compute_measure_value(bin_map, curr, data.loc[:, [curr, dim]], curr_points, dim_maxes, cor_measure) | |
# print(dim, uds) | |
scores.append(score) | |
order = np.argsort(scores).tolist() | |
order.reverse() | |
diverse_dims_id = [] | |
for i, o in enumerate(order): | |
delete_flag = False | |
for p in order[:i]: | |
if p not in diverse_dims_id: | |
continue | |
if compute_measure_value(bin_map, dims[o], data.loc[:, [dims[o], dims[p]]], curr_points, dim_maxes, cor_measure)[0] > delta: | |
delete_flag = True | |
continue | |
if not delete_flag: | |
diverse_dims_id.append(o) | |
if len(diverse_dims_id) == k: | |
return [dims[l] for l in diverse_dims_id], None | |
return [dims[o] for o in diverse_dims_id], None | |
class SubspaceOutcome: | |
def __init__(self, subspace, score, dists=None): | |
self.dists = dists | |
self.score = score | |
self.subspace = subspace | |
def __lt__(self, other): | |
return self.score < other.score | |
def __repr__(self): | |
return '(' + str(self.subspace) + ', ' + str(self.score) + ')' | |
def best_first(bin_map, curr, data, curr_points, dim_maxes, k, cor_measure): | |
dims = set(data.columns.tolist()) | |
level = 1 | |
best = [SubspaceOutcome({curr}, 0)] | |
while level <= k and level < len(dims): | |
outcomes = set() | |
for j in dims - best[level - 1].subspace: | |
s = best[level - 1].subspace.union({j}) | |
score, dists = compute_measure_value(bin_map, curr, data.loc[:, s], curr_points, dim_maxes, cor_measure) | |
outcomes.add(SubspaceOutcome(s, score, dists)) | |
best.append(sorted(outcomes, reverse=True)[0]) | |
level += 1 | |
result = sorted(best, reverse=True)[0] | |
return result.subspace - {curr}, result.dists | |
def beam_search(bin_map, curr, data, curr_points, dim_maxes, k, beam_width, cor_measure): | |
dims = set(data.columns.tolist()) | |
level = 1 | |
best = [[SubspaceOutcome({curr}, 0)]] | |
while level <= k and level < len(dims): | |
outcomes = set() | |
for b in best[level - 1]: | |
for j in dims - b.subspace: | |
s = b.subspace.union({j}) | |
score, dists = compute_measure_value(bin_map, curr, data.loc[:, s], curr_points, dim_maxes, cor_measure) | |
outcomes.add(SubspaceOutcome(s, score, dists)) | |
best.append(sorted(outcomes, reverse=True)[:beam_width]) | |
level += 1 | |
subspace_outcome = sorted([a[0] for a in best], reverse=True)[0] | |
return subspace_outcome.subspace - {curr}, subspace_outcome.dists | |
def het_beam_search(bin_map, curr, data, curr_points, dim_maxes, k, delta, beam_width, cor_measure): | |
dims = set(data.columns.tolist()) | |
level = 1 | |
best = [[SubspaceOutcome({curr}, 0)]] | |
while level <= k and level < len(dims): | |
outcomes = set() | |
for b in best[level - 1]: | |
for j in dims - b.subspace: | |
s = b.subspace.union({j}) | |
score, dists = compute_measure_value(bin_map, curr, data.loc[:, s], curr_points, dim_maxes, cor_measure) | |
outcomes.add(SubspaceOutcome(s, score, dists)) | |
outcomes = sorted(outcomes, reverse=True) | |
# checking if subspace of c does not correlate with the subspaces seen earlier | |
diverse_corr = [] | |
for i, c in enumerate(outcomes): | |
delete_flag = False | |
for corr_l in outcomes[:i]: | |
if corr_l not in diverse_corr: | |
continue | |
if compute_measure_value(bin_map, curr, data.loc[:, c.subspace.union(corr_l.subspace)], curr_points, dim_maxes, cor_measure)[0] > delta: | |
delete_flag = True | |
continue | |
if not delete_flag: | |
diverse_corr.append(c) | |
if len(diverse_corr) == beam_width: | |
break | |
best.append(diverse_corr) | |
level += 1 | |
result = sorted([a[0] for a in best], reverse=True)[0] | |
return result.subspace - {curr}, result.dists | |
if __name__ == '__main__': | |
data = pd.read_csv('synthetic_cases/synthetic_with_nearcopies_4_3.csv', delimiter=';', header=None, na_values='?') | |
# data = data.loc[:, :data.shape[1] - 2] | |
# print(str(beam_search(data, 4, 4, cst.BEAM_WIDTH, cst.CorrelationMeasure.UDS))) |