
Commit

added 'todo ext' and 'todo old' comments to mark what has changed
added interaction_distance.py#compute_sliding_count() for counting ID peaks within a sliding window
updated data_generation.py#synthetic_with_nearcopies() to sample from the range [-0.5, 0.5]

the functionality is otherwise unchanged
Tatiana Dembelova committed Jun 22, 2017
1 parent 57d60b4 commit 14ad1aa
Showing 26 changed files with 180,099 additions and 469,799 deletions.
3 changes: 2 additions & 1 deletion constants.py
@@ -11,7 +11,8 @@ class CorrelationMeasure(Enum):
MAC = 3


-ID_THRESHOLD_QUANTILE = 1.0 / 3
+ID_THRESHOLD_QUANTILE = 1.0/3
+ID_SLIDING_WINDOW = 40

NORMALIZATION_RADIUS = 1

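A rough sketch of the roles these constants play (hypothetical stand-in code, not part of the commit; the real logic lives in interaction_distance.py): ID_THRESHOLD_QUANTILE sets the quantile used to derive the ID threshold, and ID_SLIDING_WINDOW is the window length used by the new compute_sliding_count.

import numpy as np

ID_THRESHOLD_QUANTILE = 1.0 / 3
ID_SLIDING_WINDOW = 40

IDs = np.random.rand(200)  # stand-in for per-cut-point interaction distances
# threshold at the 1/3 quantile of the observed IDs (assumed reading of compute_ID_threshold)
ID_threshold = np.percentile(IDs, 100 * ID_THRESHOLD_QUANTILE)
# peaks above the threshold within one window of 40 consecutive cut points
window_count = int(np.sum(IDs[:ID_SLIDING_WINDOW] > ID_threshold))
print(ID_threshold, window_count)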
49 changes: 35 additions & 14 deletions data_generation.py
@@ -2,7 +2,7 @@
import pandas as pd
import os.path


+# synthetic case from uds
def correlated_data(m, n, sigma, f):
l = int(n / 2)
Z = np.random.normal(0, 1, (m, l))
@@ -29,11 +29,24 @@ def func2(X):
return np.log2(np.abs(X) + 1)


-def synthetic_data_1(m, r, s, sigma=0.1):
-r_dims = np.random.uniform(-0.5, 0.5, (m, r))
+def synthetic_data_uni(m, r, s, sigma=0.1):
+r_dims = np.random.uniform(-0.5, 0.5, (m, r)) if r > 0 else np.empty((m, r))
parity_dim = -(np.count_nonzero(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(m, 1) * np.random.uniform(0, 0.5,
-(m, 1))
-s_dims = np.random.normal(0, 1, (m, s))
+(m, 1)) if r > 0 else np.empty((m, r))
+s_dims = np.random.uniform(-0.5, 0.5, (m, s))
data = np.concatenate((r_dims, parity_dim, s_dims), axis=1)
if sigma:
e = np.random.normal(0, sigma, (m, r + s + 1))
+data = data + e
+
+return data
+
+
+def synthetic_data_uni_negative(m, r, s, sigma=0.1):
+r_dims = np.random.uniform(-0.5, 0.5, (m, r)) if r > 0 else np.empty((m, r))
+parity_dim = (np.count_nonzero(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(m, 1) * np.random.uniform(0, 0.5,
+(m, 1)) if r > 0 else np.empty((m, r))
+s_dims = np.random.uniform(-0.5, 0.5, (m, s))
+data = np.concatenate((r_dims, parity_dim, s_dims), axis=1)
+if sigma:
+e = np.random.normal(0, sigma, (m, r + s + 1))
@@ -43,9 +56,10 @@ def synthetic_data_1(m, r, s, sigma=0.1):


def synthetic_data_gauss(m, r, s, sigma=0.1):
-r_dims = np.random.normal(0, 1, (m, r))
+
+r_dims = np.random.normal(0, 1, (m, r)) if r > 0 else np.empty((m, r))
parity_dim = -(np.count_nonzero(r_dims > 0, axis=1) % 2 * 2 - 1).reshape(m, 1) * np.abs(np.random.normal(0, 1,
-(m, 1)))
+(m, 1))) if r > 0 else np.empty((m, r))
s_dims = np.random.normal(0, 1, (m, s))
data = np.concatenate((r_dims, parity_dim, s_dims), axis=1)
if sigma:
@@ -55,16 +69,23 @@ def synthetic_data_gauss(m, r, s, sigma=0.1):
return data


-def synthetic_data_0(m):
-l = int(m / 2)
-first = np.concatenate((np.random.uniform(-1, 0, (l, 1)), np.random.uniform(0, 1, (l, 1))), axis=1)
-sec = np.concatenate((np.random.uniform(0, 1, (m - l, 1)), np.random.uniform(-1, 0, (m - l, 1))), axis=1)
-return np.concatenate((first, sec), axis=0)
+def synthetic_with_nearcopies(m, k, l, sigma=0.1):
+k_dims = np.repeat(np.random.uniform(-0.5, 0, (m, 1)), k, axis=1) if k > 0 else np.empty((m, k))
+l_dims = np.repeat(np.random.uniform(0, 0.5, (m, 1)), l, axis=1) if l > 0 else np.empty((m, l))
+
+data = np.concatenate((k_dims, l_dims), axis=1)
+if sigma:
+e = np.random.normal(0, sigma, (m, k + l))
+data = data + e
+
+return data


if __name__ == '__main__':
-data__ = np.concatenate((synthetic_data_1(20000, 9, 0, 0), np.zeros((20000, 1))), axis=1)
-file = 'synthetic_10d_parity_problem.csv'
+rows = 20000
+data__ = np.concatenate((synthetic_with_nearcopies(rows, 2, 0, 0), np.zeros((rows, 1))), axis=1)
+# file = 'synthetic_cases/synthetic_3d_gauss2.csv'
+file = 'synthetic_cases/synthetic_exact_copies2_2.csv'

if os.path.isfile(file):
raise ValueError
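An illustrative call of the generators above (not part of the diff; argument values are arbitrary). synthetic_with_nearcopies(m, k, l, sigma) yields k noisy copies of a uniform draw in [-0.5, 0] next to l noisy copies of a draw in [0, 0.5]; synthetic_data_uni(m, r, s, sigma) yields r relevant dimensions, one parity dimension and s independent uniform dimensions.

import data_generation as dg

near_copies = dg.synthetic_with_nearcopies(1000, 2, 1, sigma=0.05)
print(near_copies.shape)  # (1000, 3): two noisy copies of one draw in [-0.5, 0] plus one draw in [0, 0.5]

parity = dg.synthetic_data_uni(1000, 2, 1, sigma=0)
print(parity.shape)       # (1000, 4): 2 relevant dims, 1 parity dim, 1 independent uniform dim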
12 changes: 7 additions & 5 deletions experiments_logging.py
@@ -6,14 +6,16 @@
def plot_data_3d(data):
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
+# data = data[np.logical_and(data[0] < 0, data[1] > 0)]
+
-color_cond = {'b': np.logical_and(data[0] < 0, data[1] > 0),
+color_cond = {'b': np.logical_and(data[0] < 0, np.logical_and(data[1] > 0, data[2] < 0)),
'k': np.logical_and(data[0] < 0, np.logical_and(data[1] > 0, data[2] > 0)),
'g': np.logical_and(data[0] > 0, data[1] < 0),
'r': np.logical_and(data[0] < 0, data[1] < 0),
'c': np.logical_and(data[0] > 0, data[1] > 0),
}
for c in color_cond:
-ax.scatter(data[0][color_cond[c]], data[1][color_cond[c]], data[2][color_cond[c]], c=c)
+ax.scatter(data[0][color_cond[c]], data[1][color_cond[c]], data[2][color_cond[c]], c=c, s=1)

ax.set_xlabel('X0')
ax.set_ylabel('X1')
@@ -49,6 +51,6 @@ def write_cut_file(name, disc_intervals):
out.write('-------------------------------------\n')


-# if __name__ == '__main__':
-# data = pd.read_csv("synthetic_3d_parity_problem.csv", delimiter=";", header=None, na_values='?')
-# plot_data_3d(data)
+if __name__ == '__main__':
+data = pd.read_csv("synthetic_cases/synthetic_3d_gauss.csv", delimiter=";", header=None, na_values='?')
+plot_data_3d(data)
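For orientation (not part of the diff): plot_data_3d indexes data[0], data[1], data[2], i.e. it expects a pandas DataFrame whose integer column labels 0, 1, 2 hold the three coordinates, which is what pd.read_csv(..., header=None) produces. A minimal synthetic call, assuming experiments_logging.py and its matplotlib dependencies import cleanly:

import numpy as np
import pandas as pd
from experiments_logging import plot_data_3d

df = pd.DataFrame(np.random.uniform(-1, 1, (500, 3)))  # columns are labeled 0, 1, 2
plot_data_3d(df)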
15 changes: 14 additions & 1 deletion interaction_distance.py
@@ -2,7 +2,7 @@
import numpy as np

import uds
-from constants import CorrelationMeasure, ID_THRESHOLD_QUANTILE
+from constants import CorrelationMeasure, ID_THRESHOLD_QUANTILE, ID_SLIDING_WINDOW


def compute_IDs(bin_map, curr, data, dist_bins, dim_maxes):
@@ -81,3 +81,16 @@ def compute_max_ID_threshold(IDs):
IDs.sort()

return max(IDs)


+def compute_sliding_count(IDs, ID_threshold):
+count = []
+avg = sum(IDs) / len(IDs)
+for i in range(ID_SLIDING_WINDOW, len(IDs)):
+start = i - ID_SLIDING_WINDOW if i > ID_SLIDING_WINDOW else 0
+count.append(sum(1 for id in IDs[start: i] if id > ID_threshold))
+# i = 0
+# while i < len(IDs):
+# count.append(sum(1 for id in IDs[i: i + ID_SLIDING_WINDOW] if id > ID_threshold))
+# i += ID_SLIDING_WINDOW
+return count
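A usage sketch for the new helper (illustrative values, not part of the commit; assumes the repository modules and their dependencies import cleanly). For every trailing window of ID_SLIDING_WINDOW consecutive cut points it counts how many interaction distances exceed the threshold, so the result has len(IDs) - ID_SLIDING_WINDOW entries.

import numpy as np
import interaction_distance as idist

IDs = list(np.random.rand(200))                 # stand-in for per-cut-point IDs
ID_threshold = idist.compute_ID_threshold(IDs)  # threshold used the same way as in main.py
peak_counts = idist.compute_sliding_count(IDs, ID_threshold)
print(len(peak_counts))                         # 160 = len(IDs) - ID_SLIDING_WINDOW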
37 changes: 20 additions & 17 deletions main.py
@@ -38,14 +38,15 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure
dim_count = data.shape[1]

# number of initial dist_bins
-# todo remove later
+# todo old remove later
# init_bins_count = 20 # ceil in original ipd...
init_bins_count = int(math.ceil(math.sqrt(data.shape[0]))) # ceil in original ipd...
write('row count:', data.shape[0])
write('init_bins_count:', init_bins_count)
write('ID_THRESHOLD_QUANTILE:', cst.ID_THRESHOLD_QUANTILE)

-# normalization step todo(optional)
-# todo by default the normalization is optional as it does not influence on the results
+# normalization step todo old (optional)
+# todo old by default the normalization is optional as it does not influence on the results

# norm_data = data.apply(lambda x: 2 * cst.NORMALIZATION_RADIUS * (x - x.min()) / (
# x.max() - x.min()) - cst.NORMALIZATION_RADIUS if x.max() != x.min() else pd.Series(-np.ones(x.shape)))
@@ -59,12 +60,12 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure
orig_binning = Binning(norm_data)
rank_data = orig_binning.get_rank_data()

-# plt.figure(1)
+plt.figure(1)

height = int(math.sqrt(dim_count ))
width = int(math.ceil((dim_count ) / height))

-# fig, axes = plt.subplots(nrows=height, ncols=width, squeeze=False)
+fig, axes = plt.subplots(nrows=height, ncols=width, squeeze=False)

# iterate over all the dimensions
for curr in range(dim_count):
@@ -80,6 +81,9 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure
IDs = id.compute_IDs(bin_map, curr, norm_data, dist_bins, dim_maxes) if method == cst.Method.ORIGINAL else \
id.compute_IDs_extended(bin_map, curr, norm_data, dist_bins, dim_maxes, cor_measure, cst.MAX_SUBSPACE_SIZE)
ID_threshold = id.compute_ID_threshold(IDs)
+# todo ext compute sliding average and count ID peaks above the avg (in a sliding window)
+# ID_peaks = id.compute_sliding_count(IDs, ID_threshold)
+
# pd.DataFrame(IDs).to_csv(prefix + "_IDs_" + str(curr) + ".csv")
# -----------------------------OPTIMAL MERGE STRATEGY----------------------------------

@@ -95,11 +99,11 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure
(curr_macro_intervals, curr_macro_points) = get_discretized_points(curr, data, discretizations, dist_bins,
min_id, rank_data)

-# ax1 = axes[int(curr / width), int(curr % width)]
+ax1 = axes[int(curr / width), int(curr % width)]
# # ax1.hist(IDs, bins=100, color='c')
-# ax1.plot([i for i in range(len(IDs))], IDs)
-# ax1.axhline(ID_threshold, color='b', linestyle='dashed', linewidth=1)
-# ax1.set_title('dimension ' + str(curr))
+ax1.plot([i for i in range(len(IDs))], IDs)
+ax1.axhline(ID_threshold, color='b', linestyle='dashed', linewidth=1)
+ax1.set_title('dimension ' + str(curr))

# ax2 = axes[int((2*curr + 1) / width), int((2*curr + 1) % width)]
# ax2.plot(sorted(IDs), color='k')
@@ -118,7 +122,7 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure
write("{0:.2f}".format(curr_macro_intervals[macro_id][1]) + " -", IDs[macro_bin[-1]], '[q=' +
str((sorted(IDs).index(IDs[macro_bin[-1]]) + 1) / len(IDs)) + ']')
# ax1.axhline(IDs[macro_bin[-1]], color='r', linewidth=1)
-# ax1.plot([macro_bin[-1]], [IDs[macro_bin[-1]]], marker='o', markersize=3, color="red")
+ax1.plot([macro_bin[-1]], [IDs[macro_bin[-1]]], marker='o', markersize=3, color="red")
# ax2.axvline(IDs[macro_bin[-1]], color='r', linewidth=1)

write('\nnumber of points per macrobin:')
@@ -131,9 +135,8 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure
disc_macro_intervals.append(curr_macro_intervals)
disc_points.append(curr_macro_points)

-# plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25,
-# wspace=0.35)
-# plt.savefig(prefix + '.png', format='png')
+plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25, wspace=0.35)
+plt.savefig(dir + 'IDs.png', format='png')
return disc_macro_intervals, disc_points, class_labels


@@ -161,7 +164,7 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_


if __name__ == "__main__":
-sys.argv = '-f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -r=19881'.split(' ')
+sys.argv = '-f=synthetic_cases/synthetic_3d_parity_problem.csv -d=;'.split(' ')
# if len(sys.argv) < 2:
# print('Usage: main.py -f=<data_file> -d=<delimiter> -c=<number of columns> -m=<[original|greedy]> -cor=<[uds]>')
file_arg = list(filter(lambda x: x.startswith("-f="), sys.argv))
@@ -204,6 +207,6 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_
with open(log_file, 'w') as log:
disc_intervals, disc_points, class_labels = compute_optimal_discretization(data, method, cor_measure)

-write_out_file(dir + cst.FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)
-
-write_cut_file(dir + cst.FILE_DATA_CUTS, disc_intervals)
+# write_out_file(dir + cst.FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)
+#
+# write_cut_file(dir + cst.FILE_DATA_CUTS, disc_intervals)
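The __main__ block above pre-populates sys.argv and then extracts each '-key=value' option by prefix. A minimal self-contained sketch of that parsing idiom (illustrative only; the handling of flags other than -f is not visible in this hunk, so the -d branch and the fallback delimiter are assumptions):

argv = 'main.py -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=;'.split(' ')
file_arg = list(filter(lambda x: x.startswith("-f="), argv))
data_file = file_arg[0].replace("-f=", "") if file_arg else None
delim_arg = list(filter(lambda x: x.startswith("-d="), argv))
delimiter = delim_arg[0].replace("-d=", "") if delim_arg else ";"
print(data_file, delimiter)  # synthetic_cases/synthetic_3d_parity_problem.csv ;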
6 changes: 3 additions & 3 deletions merging.py
@@ -22,7 +22,7 @@ def break_points_number(macro_bin, IDs, ID_threshold):
:param ID_threshold:
:return:
'''
-# todo condition as in original IPD: ID > ID_threshold
+# todo old condition as in original IPD: ID > ID_threshold
ID_boolean = [1 if ID > ID_threshold else 0 for ID in IDs[macro_bin[:-1]]]
return sum(ID_boolean)

@@ -35,10 +35,10 @@ def compute_bin_cost(c, l, k, macro_bin, IDs, ID_threshold):
macro_bin_size_code = quasi_uniform_code(macro_bin_size)
break_points_size = break_points_number(macro_bin, IDs, ID_threshold)

-# todo in the original ipd L_disc L_N is computed for (k-1)
+# todo old in the original ipd L_disc L_N is computed for (k-1)
# L_disc = quasi_uniform_code(k) + math.log(comb(c - 1, k - 1), 2)
L_disc = quasi_uniform_code(k - 1) + math.log(comb(c - 1, k - 1), 2)
-# todo in the original ipd L_disc L_N is computed for (k-1)
+# todo old in the original ipd L_disc L_N is computed for (k-1)
# L_disc_prev = - (quasi_uniform_code(k - 1) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)
L_disc_prev = - (quasi_uniform_code(k - 2) + math.log(comb(l - 1, k - 2), 2) if k > 1 else 0)

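The L_disc term above prices the discretization in MDL fashion: quasi_uniform_code(k - 1) is presumably a universal integer code for the number of macro bins (its definition lives elsewhere in the repository and is not reproduced here), and log2 C(c-1, k-1) encodes which placement of k-1 cut points among c-1 candidate positions was chosen. A small worked example of the combinatorial part (illustrative values; comb is assumed to come from scipy.special, older SciPy exposed it as scipy.misc.comb):

import math
from scipy.special import comb

c, k = 10, 3                          # 10 micro bins merged into 3 macro bins
cut_choices = comb(c - 1, k - 1)      # C(9, 2) = 36 ways to place the 2 cut points
bits = math.log(cut_choices, 2)
print(cut_choices, round(bits, 2))    # 36.0 5.17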
