# Skip to content
# Permalink
# f810d1f0cc
# Switch branches/tags
#
# Name already in use
#
# A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
# Go to file
#
#
# Cannot retrieve contributors at this time
# 149 lines (122 sloc) 4.57 KB
import numpy as np
import pandas as pd
from constants import CorrelationMeasure
from uds import compute_uds
import constants as cst
def compute_measure(data, cor_measure):
    '''
    Score ``data`` with the correlation measure selected by ``cor_measure``.

    :param data: the columns to score together; passed unchanged to the measure implementation
    :param cor_measure: member of CorrelationMeasure selecting the implementation
    :return: the value returned by the selected implementation (compute_uds for UDS)
    :raises ValueError: if no implementation exists for ``cor_measure``
    '''
    if cor_measure == CorrelationMeasure.UDS:
        return compute_uds(data)
    # The exception was previously constructed but never raised, which made
    # unsupported measures silently yield None.
    raise ValueError('No implementation!')
def greedy_topk(data, curr, k, cor_measure):
    '''
    Select up to k dimensions that score highest when paired with curr.

    :param data: DataFrame whose column labels are the dimensions
    :param curr: label of the reference dimension (must be one of the columns)
    :param k: maximum number of dimensions to return
    :param cor_measure: measure selector forwarded to compute_measure
    :return: list of dimensions, best-scoring first; when there are at most k
        other dimensions they are all returned in column order without scoring
    '''
    candidates = data.columns.tolist()
    candidates.remove(curr)

    # Nothing to rank when every remaining dimension fits into the budget.
    n_others = data.shape[1] - 1
    if n_others <= k:
        return candidates

    # Pairwise score of each candidate against the reference dimension.
    scores = [
        compute_measure(data.loc[:, [curr, col]], cor_measure)
        for col in candidates
    ]

    # argsort is ascending, so walk it backwards to get the best first.
    ranking = np.argsort(scores)[::-1].tolist()
    return [candidates[pos] for pos in ranking[:k]]
def het_greedy_topk(data, curr, k, delta, cor_measure):
    '''
    Select up to k dimensions that score high against curr but not against each other.

    Candidates are visited from the highest to the lowest pairwise score with
    curr. A candidate is kept only if its pairwise score with every dimension
    kept so far does not exceed delta.

    :param data: DataFrame whose column labels are the dimensions
    :param curr: label of the reference dimension (must be one of the columns)
    :param k: maximum number of dimensions to return
    :param delta: a candidate is rejected when its score with a kept dimension is above this value
    :param cor_measure: measure selector forwarded to compute_measure
    :return: list of kept dimensions in the order they were selected; when there
        are at most k other dimensions they are all returned unfiltered
    '''
    dims = data.columns.tolist()
    dims.remove(curr)
    if data.shape[1] - 1 <= k:
        return dims

    scores = [
        compute_measure(data.loc[:, [curr, dim]], cor_measure)
        for dim in dims
    ]
    # argsort is ascending; reverse it to visit the strongest candidates first.
    order = np.argsort(scores)[::-1].tolist()

    kept = []
    for cand in order:
        # Checked before adding so that k <= 0 yields an empty result instead
        # of an uncapped list.
        if len(kept) >= k:
            break
        # any() stops at the first conflict, so no further (expensive) measure
        # calls are made once the candidate is known to be redundant.
        redundant = any(
            compute_measure(data.loc[:, [dims[cand], dims[prev]]], cor_measure) > delta
            for prev in kept
        )
        if not redundant:
            kept.append(cand)
    return [dims[idx] for idx in kept]
class Score:
    '''A subspace together with its score; instances are ordered by score only.'''

    def __init__(self, subspace, score):
        self.subspace = subspace
        self.score = score

    def __lt__(self, other):
        # Only "<" is defined; sorting relies on nothing else.
        return self.score < other.score

    def __repr__(self):
        # !s keeps the exact str() rendering of both fields.
        return f'({self.subspace!s}, {self.score!s})'
def best_first(data, curr, k, cor_measure):
    '''
    Grow a subspace around curr one dimension at a time, keeping the best extension per level.

    :param data: DataFrame whose column labels are the dimensions
    :param curr: label of the dimension every subspace contains
    :param k: maximum number of dimensions added to curr
    :param cor_measure: measure selector forwarded to compute_measure
    :return: set of dimensions (curr excluded) of the highest-scoring subspace
        found on any level; the initial subspace {curr} takes part with score 0
    '''
    dims = set(data.columns.tolist())
    best = [Score({curr}, 0)]
    level = 1
    while level <= k and level < len(dims):
        parent = best[level - 1].subspace
        candidates = []
        for dim in dims - parent:
            subspace = parent | {dim}
            # pandas rejects set indexers (deprecated in 1.5, an error since
            # 2.0), so the column labels are passed as a list.
            score = compute_measure(data.loc[:, list(subspace)], cor_measure)
            candidates.append(Score(subspace, score))
        # Only the top candidate is needed; no reason to sort all of them.
        best.append(max(candidates))
        level += 1
    return max(best).subspace - {curr}
def beam_search(data, curr, k, beam_width, cor_measure):
    '''
    Beam search for a high-scoring subspace containing curr.

    Every level extends each subspace of the previous beam by one dimension
    and keeps the beam_width highest-scoring results.

    :param data: DataFrame whose column labels are the dimensions
    :param curr: label of the dimension every subspace contains
    :param k: maximum number of dimensions added to curr
    :param beam_width: number of subspaces kept per level
    :param cor_measure: measure selector forwarded to compute_measure
    :return: set of dimensions (curr excluded) of the best subspace among the
        top entries of all levels; the initial subspace {curr} takes part with score 0
    '''
    dims = set(data.columns.tolist())
    best = [[Score({curr}, 0)]]
    level = 1
    while level <= k and level < len(dims):
        candidates = []
        for parent in best[level - 1]:
            for dim in dims - parent.subspace:
                subspace = parent.subspace | {dim}
                # pandas rejects set indexers (deprecated in 1.5, an error
                # since 2.0), so the column labels are passed as a list.
                score = compute_measure(data.loc[:, list(subspace)], cor_measure)
                candidates.append(Score(subspace, score))
        best.append(sorted(candidates, reverse=True)[:beam_width])
        level += 1
    # Each beam is sorted best-first, so its head is the level's winner.
    return max(beam[0] for beam in best).subspace - {curr}
def het_beam_search(data, curr, k, beam_width, delta, cor_measure):
    '''
    Beam search that keeps only mutually diverse subspaces in each beam.

    Every level extends each subspace of the previous beam by one dimension.
    Candidates are then visited best-first, and one is kept only if the score
    of its union with every already kept candidate does not exceed delta.

    :param data: DataFrame whose column labels are the dimensions
    :param curr: label of the dimension every subspace contains
    :param k: maximum number of dimensions added to curr
    :param beam_width: number of subspaces kept per level
    :param delta: a candidate is rejected when the score of its union with a kept candidate is above this value
    :param cor_measure: measure selector forwarded to compute_measure
    :return: set of dimensions (curr excluded) of the best subspace among the
        top entries of all levels; the initial subspace {curr} takes part with score 0
    '''
    dims = set(data.columns.tolist())
    best = [[Score({curr}, 0)]]
    level = 1
    while level <= k and level < len(dims):
        candidates = []
        for parent in best[level - 1]:
            for dim in dims - parent.subspace:
                subspace = parent.subspace | {dim}
                # pandas rejects set indexers (deprecated in 1.5, an error
                # since 2.0), so the column labels are passed as a list.
                score = compute_measure(data.loc[:, list(subspace)], cor_measure)
                candidates.append(Score(subspace, score))
        ranked = sorted(candidates, reverse=True)

        # Keep a candidate only if it does not correlate with the ones kept so far.
        diverse = []
        for cand in ranked:
            # any() stops at the first conflict, avoiding further measure calls.
            redundant = any(
                compute_measure(
                    data.loc[:, list(cand.subspace | prev.subspace)], cor_measure
                ) > delta
                for prev in diverse
            )
            if redundant:
                continue
            diverse.append(cand)
            if len(diverse) == beam_width:
                break
        best.append(diverse)
        level += 1
    # Each beam is ordered best-first, so its head is the level's winner.
    return max(beam[0] for beam in best).subspace - {curr}
if __name__ == '__main__':
    # Demo run on a synthetic data set: semicolon-separated, no header row,
    # '?' read as a missing value.
    data = pd.read_csv(
        'synthetic_cases/synthetic_with_nearcopies_4_3.csv',
        delimiter=';',
        header=None,
        na_values='?',
    )
    # data = data.loc[:, :data.shape[1] - 2]
    # Beam search around column 4, adding at most 4 dimensions.
    selected = beam_search(data, 4, 4, cst.BEAM_WIDTH, cst.CorrelationMeasure.UDS)
    print(str(selected))