
Commit f810d1f
small fixes
Tatiana Dembelova committed Aug 29, 2017
1 parent c5ad302 commit f810d1f
Showing 5 changed files with 52 additions and 24 deletions.
50 changes: 36 additions & 14 deletions data_generation.py
@@ -111,6 +111,16 @@ def synthetic_cjs():
                            np.concatenate((np.random.normal(4, 1, (100, 1)), np.random.normal(5, 1, (100, 1))),
                                           axis=1)), axis=0)
 
+def blobs():
+    # 2d
+    n = 4000
+    l = int(n/2)
+    blob1 = np.random.normal(0, 1, (l, 2)) - np.concatenate((np.ones((l, 1)), np.zeros((l, 1))), axis=1)
+
+    blob2 = np.random.normal(0, 1, (l, 2)) + np.concatenate((np.ones((l, 1)), np.zeros((l, 1))), axis=1)
+
+    return np.concatenate((blob1, blob2), axis=0)
+
 
 def append_irrelevant_features(file, n):
     if n == 0:
@@ -127,28 +137,40 @@ def generate():
     # -------generating dataset
     # data = synthetic_cube_in_cube(rows, rel_features, irrel_features, 'l')
     # data__ = synthetic_cjs()
+    data = correlated_data(rows, rel_features + irrel_features, 1, func1)
     #
     # # add zeroes as default class
     # data = np.concatenate((data, np.zeros((rows, 1))), axis=1)
+    data = np.concatenate((data, np.zeros((rows, 1))), axis=1)
     # -------appending irrelevant features to existing dataset
-    data = append_irrelevant_features(source, irrel_features)
+    # data = append_irrelevant_features(source, irrel_features)
     # storing to disk
     pd.DataFrame(data).to_csv(file, sep=';', header=False, index=False, float_format='%.2f')
 
 
 if __name__ == '__main__':
-    for j in [3,4,5,10]:
-        for i in [1,2,3,4,5,10]:
-            # file = 'synthetic_cases/synthetic_cube_in_cube_10.csv'
-            file = 'synthetic_cases/synthetic_cube_in_cube_' + str(j) + '_' + str(i) + '.csv'
-            source = 'synthetic_cases/synthetic_cube_in_cube_' + str(j) + '.csv'
+    # for j in [3,4,5,10]:
+    #     for i in [1,2,3,4,5,10]:
+    #         # file = 'synthetic_cases/synthetic_cube_in_cube_10.csv'
+    #         file = 'synthetic_cases/synthetic_cube_in_cube_' + str(j) + '_' + str(i) + '.csv'
+    #         source = 'synthetic_cases/synthetic_cube_in_cube_' + str(j) + '.csv'
+    #
+    #         if os.path.isfile(file):
+    #             raise ValueError(file + " already exists!")
+    #
+    #         # parameters
+    #         rows = 20000
+    #         rel_features = 10
+    #         irrel_features = i
+    #
+    #         generate()
+    file = 'synthetic_cases/uds_test_4.csv'
-
-            if os.path.isfile(file):
-                raise ValueError(file + " already exists!")
+    if os.path.isfile(file):
+        raise ValueError(file + " already exists!")
 
-            # parameters
-            rows = 20000
-            rel_features = 10
-            irrel_features = i
+    # parameters
+    rows = 4000
+    rel_features = 2
+    irrel_features = 2
 
-            generate()
+    generate()
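
Taken together, the new __main__ block regenerates uds_test_4.csv from correlated_data(4000, 2 + 2, 1, func1) plus a zero class column, and the added blobs() helper draws two unit-variance 2-D Gaussians shifted to (-1, 0) and (+1, 0). A minimal standalone sketch of what blobs() returns (only numpy assumed, mirroring the function body above):

import numpy as np

n = 4000
l = n // 2
# the shift affects the first coordinate only: rows of [1, 0]
shift = np.concatenate((np.ones((l, 1)), np.zeros((l, 1))), axis=1)
blob1 = np.random.normal(0, 1, (l, 2)) - shift   # cluster centered near (-1, 0)
blob2 = np.random.normal(0, 1, (l, 2)) + shift   # cluster centered near (+1, 0)
data = np.concatenate((blob1, blob2), axis=0)

print(data.shape)             # (4000, 2)
print(data[:l].mean(axis=0))  # approximately [-1, 0]
print(data[l:].mean(axis=0))  # approximately [+1, 0]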
2 changes: 1 addition & 1 deletion experiments_logging.py
@@ -65,5 +65,5 @@ def write_cut_file(name, disc_intervals):
 if __name__ == '__main__':
     # rows = 20000
     # data = np.concatenate((synthetic_cube_in_cube(rows, 2, 0), np.zeros((rows, 1))), axis=1)
-    data = pd.read_csv("synthetic_cases/synthetic_cube_in_cube_5.csv", delimiter=";", header=None, na_values='?')
+    data = pd.read_csv("synthetic_cases/synthetic_cube_in_sparse_cube_3_0.csv", delimiter=";", header=None, na_values='?')
     plot_data_3d(data)
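
The plotting entry point reads the same on-disk format that data_generation.py writes: semicolon-separated, headerless, with '?' as the missing-value marker. A small round-trip sketch under those assumptions (plot_data_3d's internals are not part of this diff):

import numpy as np
import pandas as pd

# write a toy 3-D dataset with a zero class column, the way generate() does
rows = 100
toy = np.concatenate((np.random.rand(rows, 3), np.zeros((rows, 1))), axis=1)
pd.DataFrame(toy).to_csv('toy.csv', sep=';', header=False, index=False, float_format='%.2f')

# read it back exactly as experiments_logging.__main__ does
data = pd.read_csv('toy.csv', delimiter=';', header=None, na_values='?')
# plot_data_3d(data)  # renders the data in 3-D (implementation not shown in this diff)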
3 changes: 3 additions & 0 deletions main.py
@@ -85,6 +85,9 @@ def compute_distances(bin_map, curr, data, dim_maxes,
         subspace = sm.het_beam_search(data, curr, k, beam_width, delta, cor_measure)
     else:
         raise ValueError("there is no such method!")
+
+    # if len(subspace) == 0:
+
     # todo the rest of the methods
     data = data.copy().loc[:, subspace]
     dim_maxes = dim_maxes[subspace]
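
The new commented stub "# if len(subspace) == 0:" flags an unhandled case: sm.het_beam_search can in principle return an empty subspace, and projecting onto an empty column list would silently drop every feature. A hypothetical guard, made self-contained for illustration (the fallback to [curr] is an assumption, not part of the commit):

import pandas as pd

def project(data, subspace, curr, dim_maxes):
    # assumed fallback: keep only the current dimension if mining found nothing
    if len(subspace) == 0:
        subspace = [curr]
    # unchanged projection step from main.py
    data = data.copy().loc[:, subspace]
    dim_maxes = dim_maxes[subspace]
    return data, dim_maxes

df = pd.DataFrame({0: [1.0, 2.0], 1: [3.0, 4.0]})
print(project(df, [], 0, df.max()))  # empty mining result -> column 0 survives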
3 changes: 3 additions & 0 deletions subspace_mining.py
@@ -74,6 +74,9 @@ def __init__(self, subspace, score):
     def __lt__(self, other):
         return self.score < other.score
 
+    def __repr__(self):
+        return '(' + str(self.subspace) + ', ' + str(self.score) + ')'
+
 
 def best_first(data, curr, k, cor_measure):
     dims = set(data.columns.tolist())
18 changes: 9 additions & 9 deletions uds.py
@@ -52,13 +52,12 @@ def dim_optimal_disc(curr, prev, I, data):
             min_cost = None
             arg_min = None
             for j in range(l - 1, i):
-                temp_cost = (support[i] - support[j]) / support[i] * f[i][j + 1] + support[j] / support[i] * val[j][
-                    l - 1]
+                temp_cost = (support[i] - support[j]) / support[i] * f[i][j + 1] \
+                            + support[j] / support[i] * val[j][l - 1]
                 if not min_cost or temp_cost < min_cost:
                     min_cost = temp_cost
                     arg_min = j
 
-            # val[i][l]
             val[i].append(min_cost)
             disc = b[arg_min][l - 1].copy()
             disc.append(merged_bins[i][arg_min + 1])
@@ -125,8 +124,11 @@ def compute_uds(data):
         opt_score = None
         for l, score in enumerate(scores):
             temp_I = extend_I(I, discs[l])
-            temp_cost = score / CEs[dim] + entropy(temp_I, len(data)) / (
-                math.log(UDS_BETA, 2) + sum([math.log(e + 1, 2) for e in es]))
+            temp_cost = score / CEs[dim] + (entropy(temp_I, len(data)) / (
+                # todo old
+                math.log(UDS_BETA, 2) + sum([math.log(e + 1, 2) for e in es])))
+            # math.log(l + 1, 2) + sum([math.log(e + 1, 2) for e in es]))
+            # if math.log(l + 1, 2) + sum([math.log(e + 1, 2) for e in es]) != 0 else 0)
             if not opt_cost or temp_cost < opt_cost:
                 opt_cost = temp_cost
                 opt_score = score
@@ -148,14 +150,12 @@ def compute_uds(data):
 
 if __name__ == "__main__":
 
-    data = pd.read_csv('synthetic_cases/uds_test.csv', delimiter=';', header=None)
+    data = pd.read_csv('synthetic_cases/synthetic_3d_parity_problem.csv', delimiter=';', usecols=[0,1,2], header=None) #0.0716387590375
     # data = pd.DataFrame(correlated_data(4000, 20, 10, dg.func2))
     # data = pd.DataFrame(generate_correlated_data(1000, 10, 2, func1))
     # data = pd.DataFrame(generate_uncorrelated_data(4000, 20))
     # classLabels = data.pop(len(data.columns) - 1)
     uds = compute_uds(data)
-
     print(uds)
-    # print(es)
-
-
+    # print(es)
+
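
In compute_uds, the kept denominator normalizes the entropy penalty by math.log(UDS_BETA, 2), while the commented-out lines record an older variant that used math.log(l + 1, 2) and therefore needed a zero-division guard at l = 0. A sketch of the arithmetic being minimized, with names mirroring the diff (the exact role of es is not visible in this hunk); note in passing that the `if not opt_cost` test would also treat a legitimate cost of exactly 0 as unset:

import math

def uds_cost(score, ce_dim, disc_entropy, es, uds_beta):
    # quality term: candidate score normalized by the dimension's CE value
    quality = score / ce_dim
    # penalty term: entropy of the extended discretization, normalized by a
    # constant log2(uds_beta) plus log-terms for the values in es
    penalty = disc_entropy / (math.log(uds_beta, 2)
                              + sum(math.log(e + 1, 2) for e in es))
    return quality + penalty

print(uds_cost(score=0.42, ce_dim=1.7, disc_entropy=0.9, es=[2, 3], uds_beta=10))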