Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ipd_extended/subspace_mining.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
149 lines (122 sloc)
4.57 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from constants import CorrelationMeasure | |
from uds import compute_uds | |
import constants as cst | |
def compute_measure(data, cor_measure):
    '''
    computes the correlation score of the given data using the given correlation measure
    :param data: the data to score; handed unchanged to the measure implementation
    :param cor_measure: CorrelationMeasure member selecting the implementation
    :return: the value returned by the selected implementation (compute_uds for UDS)
    :raises ValueError: if there is no implementation for cor_measure
    '''
    if cor_measure == CorrelationMeasure.UDS:
        return compute_uds(data)
    # The exception used to be created but never raised, so an unsupported
    # measure silently produced None and failed much later in the callers.
    raise ValueError('No implementation!')
def greedy_topk(data, curr, k, cor_measure):
    '''
    picks up to k dimensions that score highest when paired with curr
    :param data: data frame whose columns are the dimensions
    :param curr: label of the reference column; it is never part of the result
    :param k: maximal number of dimensions to return
    :param cor_measure: correlation measure handed to compute_measure
    :return: list of dimensions, highest pairwise score first; all other
             dimensions (in column order) if there are no more than k of them
    '''
    others = data.columns.tolist()
    others.remove(curr)
    # nothing to rank when every remaining dimension fits into the budget
    if k >= data.shape[1] - 1:
        return others

    # score every candidate against the reference column
    scores = [compute_measure(data.loc[:, [curr, other]], cor_measure)
              for other in others]

    # ascending argsort read backwards gives the descending ranking
    ranking = np.argsort(scores)[::-1].tolist()
    return [others[idx] for idx in ranking[:k]]
def het_greedy_topk(data, curr, k, delta, cor_measure):
    '''
    returns a list of at most k dimensions correlated with curr that are not
    too correlated with each other
    :param data: data frame whose columns are the dimensions
    :param curr: label of the reference column; it is never part of the result
    :param k: maximal number of dimensions to return
    :param delta: a candidate is rejected when its pairwise measure with any
                  already selected dimension is greater than delta
    :param cor_measure: correlation measure handed to compute_measure
    :return: list of dimensions in order of decreasing pairwise score with
             curr; all other dimensions (in column order, unfiltered) if there
             are no more than k of them
    '''
    dims = data.columns.tolist()
    dims.remove(curr)
    if data.shape[1] - 1 <= k:
        return dims

    corr = [compute_measure(data.loc[:, [curr, dim]], cor_measure) for dim in dims]
    # ascending argsort read backwards: best candidate first
    order = np.argsort(corr)[::-1].tolist()

    diverse_dims_id = []
    for o in order:
        # ">=" (instead of "== k" after appending) also caps the result for k <= 0
        if len(diverse_dims_id) >= k:
            break
        # Only the already selected dimensions matter, so walk them directly;
        # any() stops at the first conflict instead of scoring the remaining pairs.
        redundant = any(
            compute_measure(data.loc[:, [dims[o], dims[p]]], cor_measure) > delta
            for p in diverse_dims_id
        )
        if not redundant:
            diverse_dims_id.append(o)
    return [dims[o] for o in diverse_dims_id]
class Score:
    '''
    pairs a subspace with its score; instances are ordered by score only
    '''

    def __init__(self, subspace, score):
        self.subspace = subspace
        self.score = score

    def __lt__(self, other):
        # the only comparison defined here; sorted() needs nothing else
        return self.score < other.score

    def __repr__(self):
        return '({!s}, {!s})'.format(self.subspace, self.score)
def best_first(data, curr, k, cor_measure):
    '''
    grows a subspace around curr one dimension at a time, always keeping the
    best scoring extension, and returns the best subspace seen on any level
    :param data: data frame whose columns are the dimensions
    :param curr: label of the column the subspace is grown around
    :param k: maximal number of dimensions added to curr
    :param cor_measure: correlation measure handed to compute_measure
    :return: set of dimensions of the best scoring subspace, without curr
             (empty if no extension scores higher than the initial 0)
    '''
    # unique column labels in data-frame order, and their positions
    columns = list(dict.fromkeys(data.columns.tolist()))
    position = {c: i for i, c in enumerate(columns)}

    level = 1
    best = [Score({curr}, 0)]
    while level <= k and level < len(columns):
        parent = best[level - 1].subspace
        # A list (not a set of identity-hashed objects) keeps the candidates in
        # column order, which makes tie-breaking reproducible between runs.
        corr = []
        for j in columns:
            if j in parent:
                continue
            s = parent.union({j})
            # pandas >= 2.0 rejects a set as .loc indexer; select with a list in
            # column order (raises KeyError for a label that is not a column)
            cols = sorted(s, key=position.__getitem__)
            corr.append(Score(s, compute_measure(data.loc[:, cols], cor_measure)))
        # max() returns the first maximal candidate
        best.append(max(corr))
        level += 1
    return max(best).subspace - {curr}
def beam_search(data, curr, k, beam_width, cor_measure):
    '''
    grows subspaces around curr level by level, keeping the beam_width best
    scoring subspaces of each level, and returns the best subspace seen
    :param data: data frame whose columns are the dimensions
    :param curr: label of the column the subspaces are grown around
    :param k: maximal number of dimensions added to curr
    :param beam_width: number of subspaces kept per level
    :param cor_measure: correlation measure handed to compute_measure
    :return: set of dimensions of the best scoring subspace, without curr
             (empty if no extension scores higher than the initial 0)
    '''
    # unique column labels in data-frame order, and their positions
    columns = list(dict.fromkeys(data.columns.tolist()))
    position = {c: i for i, c in enumerate(columns)}

    level = 1
    best = [[Score({curr}, 0)]]
    while level <= k and level < len(columns):
        corr = []
        # The same subspace is reachable from several beam entries; score it
        # once so duplicates cannot take up slots of the beam.
        seen = set()
        for b in best[level - 1]:
            for j in columns:
                if j in b.subspace:
                    continue
                s = b.subspace.union({j})
                key = frozenset(s)
                if key in seen:
                    continue
                seen.add(key)
                # pandas >= 2.0 rejects a set as .loc indexer; select with a list
                # in column order (raises KeyError for a label that is not a column)
                cols = sorted(s, key=position.__getitem__)
                corr.append(Score(s, compute_measure(data.loc[:, cols], cor_measure)))
        beam = sorted(corr, reverse=True)[:beam_width]
        if not beam:
            # e.g. beam_width == 0: nothing left to extend, and an empty level
            # would break the a[0] lookup below
            break
        best.append(beam)
        level += 1
    # compare the top entry of every level
    return max(a[0] for a in best).subspace - {curr}
def het_beam_search(data, curr, k, beam_width, delta, cor_measure):
    '''
    beam search around curr in which the subspaces kept on one level must not
    be too correlated with each other
    :param data: data frame whose columns are the dimensions
    :param curr: label of the column the subspaces are grown around
    :param k: maximal number of dimensions added to curr
    :param beam_width: number of subspaces kept per level
    :param delta: a candidate is rejected when the measure of its union with
                  any subspace already kept on the level is greater than delta
    :param cor_measure: correlation measure handed to compute_measure
    :return: set of dimensions of the best scoring subspace, without curr
             (empty if no extension scores higher than the initial 0)
    '''
    # unique column labels in data-frame order, and their positions
    columns = list(dict.fromkeys(data.columns.tolist()))
    position = {c: i for i, c in enumerate(columns)}

    def measure(subspace):
        # pandas >= 2.0 rejects a set as .loc indexer; select with a list in
        # column order (raises KeyError for a label that is not a column)
        cols = sorted(subspace, key=position.__getitem__)
        return compute_measure(data.loc[:, cols], cor_measure)

    level = 1
    best = [[Score({curr}, 0)]]
    while level <= k and level < len(columns):
        corr = []
        # The same subspace is reachable from several beam entries; score it once.
        seen = set()
        for b in best[level - 1]:
            for j in columns:
                if j in b.subspace:
                    continue
                s = b.subspace.union({j})
                key = frozenset(s)
                if key in seen:
                    continue
                seen.add(key)
                corr.append(Score(s, measure(s)))
        corr = sorted(corr, reverse=True)

        # keep a candidate only if it does not correlate with the subspaces
        # already kept on this level
        diverse_corr = []
        for c in corr:
            # ">=" (instead of "==" after appending) also handles beam_width <= 0
            if len(diverse_corr) >= beam_width:
                break
            # any() stops at the first conflict instead of scoring the rest
            redundant = any(
                measure(c.subspace.union(kept.subspace)) > delta
                for kept in diverse_corr
            )
            if not redundant:
                diverse_corr.append(c)
        if not diverse_corr:
            # nothing left to extend, and an empty level would break the a[0]
            # lookup below
            break
        best.append(diverse_corr)
        level += 1
    # compare the top entry of every level
    return max(a[0] for a in best).subspace - {curr}
if __name__ == '__main__':
    # Script entry point: run a beam search on one synthetic data set and
    # print the resulting subspace. The file is ';'-separated, has no header
    # row (columns get integer labels) and uses '?' for missing values.
    data = pd.read_csv(
        'synthetic_cases/synthetic_with_nearcopies_4_3.csv',
        delimiter=';',
        header=None,
        na_values='?',
    )
    # data = data.loc[:, :data.shape[1] - 2]
    print(beam_search(
        data,
        curr=4,
        k=4,
        beam_width=cst.BEAM_WIDTH,
        cor_measure=cst.CorrelationMeasure.UDS,
    ))