Commit
synthetic cases reproduction
Tatiana Dembelova committed Jun 7, 2017
1 parent 7cd91e1 commit 7168cf4
Showing 8 changed files with 348 additions and 133 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -2,6 +2,8 @@
 .idea/*
 *.iml
 
+logs/*
+
 # Mobile Tools for Java (J2ME)
 .mtj.tmp/
 
19 changes: 19 additions & 0 deletions constants.py
@@ -0,0 +1,19 @@
from enum import Enum

class Method(Enum):
    ORIGINAL = 1
    EXTENDED = 2

class Correlation_measure(Enum):
    UDS = 1
    CMI = 2
    MAC = 3

ID_THRESHOLD_QUANTILE = 0.8

NORMALIZATION_RADIUS = 1

FILE_DATA_OUTPUT = 'out.txt'
FILE_DATA_CUTS = 'cut.txt'

MAX_SUBSPACE_SIZE = 5
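
For context, a minimal sketch of how these constants are consumed (mirroring the dispatch and threshold rule in interaction_distance.py further down; the variable names here are illustrative only, not part of the commit):

from constants import Correlation_measure, ID_THRESHOLD_QUANTILE

cor_measure = Correlation_measure.UDS                  # the only measure implemented so far
n_ids = 10                                             # illustrative count of interaction distances
threshold_index = int(n_ids * ID_THRESHOLD_QUANTILE)   # -> 8, i.e. the 80% quantile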
71 changes: 71 additions & 0 deletions data_generation.py
@@ -0,0 +1,71 @@
import numpy as np
import pandas as pd
import os.path


def correlated_data(m, n, sigma, f):
    l = int(n / 2)
    Z = np.random.normal(0, 1, (m, l))
    A = np.matrix(np.random.uniform(0, 1, (l, l)))
    X1 = Z * A
    B = np.matrix(np.random.uniform(0, 0.5, (l, l)))
    W = X1 * B
    E = np.random.normal(0, sigma, (m, l))
    X2 = f(W) + E
    result = np.append(X1, X2, axis=1)
    print(result)
    return result
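
# Usage sketch (illustrative parameters, not part of this commit):
#   data = correlated_data(1000, 10, 0.1, func1)
# returns a (1000, 10) array whose last 5 columns are a noisy affine
# transform (func1) of a linear map of the first 5 columns.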


def generate_uncorrelated_data(m, n):
    return np.random.normal(0, 1, (m, n))


def func1(X):
    return 2 * X + 1


def func2(X):
    return np.log2(np.abs(X) + 1)


def synthetic_data_1(m, r, s, sigma=0.1):
    r_dims = np.random.uniform(-0.5, 0.5, (m, r))
    parity_dim = -(np.count_nonzero(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(m, 1) * np.random.uniform(0, 0.5,
                                                                                                       (m, 1))
    s_dims = np.random.normal(0, 1, (m, s))
    data = np.concatenate((r_dims, parity_dim, s_dims), axis=1)
    if sigma:
        e = np.random.normal(0, sigma, (m, r + s + 1))
        data = data + e

    return data


def synthetic_data_gauss(m, r, s, sigma=0.1):
    r_dims = np.random.normal(0, 1, (m, r))
    parity_dim = -(np.count_nonzero(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(m, 1) * np.abs(np.random.normal(0, 1,
                                                                                                             (m, 1)))
    s_dims = np.random.normal(0, 1, (m, s))
    data = np.concatenate((r_dims, parity_dim, s_dims), axis=1)
    if sigma:
        e = np.random.normal(0, sigma, (m, r + s + 1))
        data = data + e

    return data


def synthetic_data_0(m):
    l = int(m / 2)
    first = np.concatenate((np.random.uniform(-1, 0, (l, 1)), np.random.uniform(0, 1, (l, 1))), axis=1)
    sec = np.concatenate((np.random.uniform(0, 1, (m - l, 1)), np.random.uniform(-1, 0, (m - l, 1))), axis=1)
    return np.concatenate((first, sec), axis=0)


if __name__ == '__main__':
    data__ = np.concatenate((synthetic_data_1(20000, 2, 0, 0), np.zeros((20000, 1))), axis=1)
    file = 'synthetic_data_example_20000.csv'

    if os.path.isfile(file):
        raise ValueError(file + ' already exists')
    pd.DataFrame(data__).to_csv(file, sep=';', header=False, index=False, float_format='%.2f')
50 changes: 50 additions & 0 deletions experiments_logging.py
@@ -0,0 +1,50 @@
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D  # noqa: needed to register the 3d projection

def plot_data_3d(data):
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    color_cond = {'b': np.logical_and(data[0] < 0, data[1] > 0),
                  'g': np.logical_and(data[0] > 0, data[1] < 0),
                  'r': np.logical_and(data[0] < 0, data[1] < 0),
                  'c': np.logical_and(data[0] > 0, data[1] > 0),
                  }
    for c in color_cond:
        ax.scatter(data[0][color_cond[c]], data[1][color_cond[c]], data[2][color_cond[c]], c=c)

    ax.set_xlabel('X0')
    ax.set_ylabel('X1')
    ax.set_zlabel('X2')

    plt.show()
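
# Usage sketch (illustrative, not part of this commit): plot_data_3d indexes
# columns as data[0], data[1], data[2], so a DataFrame works directly, e.g.
#   plot_data_3d(pd.DataFrame(synthetic_data_1(500, 2, 0)))
# (assumes pandas and data_generation.synthetic_data_1 are imported)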


def write_out_file(name, disc_intervals, disc_points, class_labels):
    with open(name, 'w') as out:
        out.write('@relation DB\n\n')
        counter = [1]
        for i in range(len(disc_intervals)):
            out.write(
                '@attribute dim' + str(i) + ' {' + ','.join([str(j + counter[-1]) for j in disc_intervals[i]]) + '}\n')
            counter.append(counter[-1] + len(disc_intervals[i]))
        out.write('@attribute class {' + ','.join(['"' + str(i) + '"' for i in class_labels.unique()]) + '}\n\n')
        out.write('@data\n')

        for i in range(len(disc_points[0])):
            for j in range(len(disc_points)):
                out.write(str(disc_points[j][i] + counter[j]))
                out.write(',')
            out.write('"' + str(class_labels[i]) + '"\n')


def write_cut_file(name, disc_intervals):
    with open(name, 'w') as out:
        for i in range(len(disc_intervals)):
            out.write('dimension ' + str(i) + ' (' + str(len(disc_intervals[i])) + ' bins)\n')
            for bin in disc_intervals[i]:
                out.write(str(disc_intervals[i][bin][1]) + '\n')
            out.write('-------------------------------------\n')
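
# Usage sketch (hypothetical intervals, not part of this commit): both writers
# expect disc_intervals[i] to map bin ids to (left, right) interval tuples, e.g.
#   intervals = [{0: (-1.0, 0.0), 1: (0.0, 1.0)}]
#   write_cut_file('cut.txt', intervals)
# writes one block for dimension 0 listing each bin's right edge.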


69 changes: 25 additions & 44 deletions interaction_distance.py
@@ -2,55 +2,29 @@
 import numpy as np
 
 import uds
+from constants import Correlation_measure, ID_THRESHOLD_QUANTILE
 
 
 def compute_IDs(bin_map, curr, data, dist_bins, dim_maxes):
-    intra_bin_measures = []
-    inter_bin_measures = []
-
     data_wo_curr = data.copy()
     data_wo_curr.pop(curr)  # todo slow?
-    for bin_id, binn in enumerate(dist_bins):
-        bin_data = data_wo_curr.loc[bin_map == binn]
-        points_count = bin_data.shape[0]
-        prev_bin_data = None
-        inter_prod_matrix = None
-        prev_points_count = None
-        if bin_id > 0:
-            prev_bin_data = data_wo_curr.loc[bin_map == dist_bins[bin_id - 1]]
-            prev_points_count = prev_bin_data.shape[0]
-            inter_prod_matrix = np.ones([points_count, prev_points_count])
+    return _compute_IDs(bin_map, data_wo_curr, dim_maxes, dist_bins)
 
-        intra_prod_matrix = np.ones([points_count, points_count])
-        # product elements for each dimension
-        for dim in bin_data:
-            intra_elem = compute_ID_elem(bin_data[dim], bin_data[dim], dim_maxes[dim])
-            intra_prod_matrix = np.multiply(intra_prod_matrix, intra_elem)
-
-            if bin_id > 0:
-                inter_elem = compute_ID_elem(bin_data[dim], prev_bin_data[dim], dim_maxes[dim])
-                inter_prod_matrix = np.multiply(inter_prod_matrix, inter_elem)
+def compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, cor_measure, k):
+    if cor_measure == Correlation_measure.UDS:
+        subspace = uds.find_correlated_subspace(data, curr, k)
+    else:
+        raise ValueError('No implementation!')
 
-        intra_bin_measures.append(np.sum(intra_prod_matrix) / points_count ** 2)
+    data = data.copy().loc[:, subspace]
 
-        if bin_id > 0:
-            inter_bin_measures.append(2 * np.sum(inter_prod_matrix) / (points_count * prev_points_count))
-    IDs = []
-    for c, inter_measure in enumerate(inter_bin_measures):
-        IDs.append(intra_bin_measures[c] - inter_measure + intra_bin_measures[c + 1])
-    IDs = np.array(IDs)
-    return IDs
+    return _compute_IDs(bin_map, data, dim_maxes, dist_bins)
 
 
-def compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, k):
-    intra_bin_measures = []
+def _compute_IDs(bin_map, data, dim_maxes, dist_bins):
+    inner_bin_measures = []
     inter_bin_measures = []
 
     data = data.copy()
 
-    data = data.loc[:, uds.find_correlated_subspace(data, curr, k)]
-    # data.pop(curr)  # todo slow?
-
     for bin_id, binn in enumerate(dist_bins):
         bin_data = data.loc[bin_map == binn]
         points_count = bin_data.shape[0]
@@ -62,23 +36,23 @@ def compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, k):
             prev_points_count = prev_bin_data.shape[0]
            inter_prod_matrix = np.ones([points_count, prev_points_count])
 
-        intra_prod_matrix = np.ones([points_count, points_count])
+        inner_prod_matrix = np.ones([points_count, points_count])
         # product elements for each dimension
         for dim in bin_data:
-            intra_elem = compute_ID_elem(bin_data[dim], bin_data[dim], dim_maxes[dim])
-            intra_prod_matrix = np.multiply(intra_prod_matrix, intra_elem)
+            inner_elem = compute_ID_elem(bin_data[dim], bin_data[dim], dim_maxes[dim])
+            inner_prod_matrix = np.multiply(inner_prod_matrix, inner_elem)
 
             if bin_id > 0:
                 inter_elem = compute_ID_elem(bin_data[dim], prev_bin_data[dim], dim_maxes[dim])
                 inter_prod_matrix = np.multiply(inter_prod_matrix, inter_elem)
 
-        intra_bin_measures.append(np.sum(intra_prod_matrix) / points_count ** 2)
+        inner_bin_measures.append(np.sum(inner_prod_matrix) / points_count ** 2)
 
         if bin_id > 0:
            inter_bin_measures.append(2 * np.sum(inter_prod_matrix) / (points_count * prev_points_count))
     IDs = []
     for c, inter_measure in enumerate(inter_bin_measures):
-        IDs.append(intra_bin_measures[c] - inter_measure + intra_bin_measures[c + 1])
+        IDs.append(inner_bin_measures[c] - inter_measure + inner_bin_measures[c + 1])
     IDs = np.array(IDs)
     return IDs
 
@@ -98,5 +72,12 @@ def compute_ID_threshold(IDs):
     IDs = IDs.copy()
     IDs.sort()
     # similar to original ipd (but possibly wrong) todo
-    return IDs[math.ceil(int(len(IDs) / 3)) - 1]
-    # return IDs[int(len(IDs) * ID_THRESHOLD_QUANTILE)]
+    # return IDs[math.ceil(int(len(IDs) / 3)) - 1]
+    return IDs[int(len(IDs) * ID_THRESHOLD_QUANTILE)]
+
+
+def compute_max_ID_threshold(IDs):
+    IDs = IDs.copy()
+    IDs.sort()
+
+    return max(IDs)
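
For intuition, a worked sketch of the new threshold rule (made-up values, not from the commit): with ID_THRESHOLD_QUANTILE = 0.8, compute_ID_threshold now returns the ID at the 80%-quantile index of the sorted array instead of the lower-third element used before.

import numpy as np
from constants import ID_THRESHOLD_QUANTILE

IDs = np.array([0.9, 0.1, 0.4, 0.3, 0.7])
IDs.sort()                                              # [0.1, 0.3, 0.4, 0.7, 0.9]
threshold = IDs[int(len(IDs) * ID_THRESHOLD_QUANTILE)]  # int(5 * 0.8) = 4 -> 0.9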
