Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
added new synthetic cases with gaussian blobs
added final discretization visualization
  • Loading branch information
Tatiana Dembelova committed Aug 29, 2017
1 parent f810d1f commit 54f6d3e
Show file tree
Hide file tree
Showing 16 changed files with 44,968 additions and 934 deletions.
28 changes: 15 additions & 13 deletions data_generation.py
Expand Up @@ -111,15 +111,17 @@ def synthetic_cjs():
np.concatenate((np.random.normal(4, 1, (100, 1)), np.random.normal(5, 1, (100, 1))),
axis=1)), axis=0)

def blobs(rows):
    """Generate a 3-d synthetic dataset of four unit-variance Gaussian blobs.

    The blobs are centred at (-6, 0, 0), (3, 3, 3), (3, 3, -3) and
    (0, 0, 0).  ``rows`` is split evenly across the four blobs (any
    remainder of ``rows % 4`` is silently dropped), so the returned
    array has shape ``(4 * (rows // 4), 3)``.
    """
    blobs_number = 4
    dims = 3
    points_per_blob = int(rows / blobs_number)
    size = (points_per_blob, dims)
    # The draw order (blob1, blob3, blob4, blob2) is kept from the original
    # so seeded runs reproduce the same dataset.  Broadcasting a centre
    # vector replaces the verbose concatenate-of-np.ones construction
    # (which even contained confusing `np.ones((l, 1)) * -0` no-ops).
    blob1 = np.random.normal(0, 1, size) + np.array([-6.0, 0.0, 0.0])
    blob3 = np.random.normal(0, 1, size) + np.array([3.0, 3.0, -3.0])
    blob4 = np.random.normal(0, 1, size)  # centred at the origin
    blob2 = np.random.normal(0, 1, size) + np.array([3.0, 3.0, 3.0])

    return np.concatenate((blob1, blob2, blob3, blob4), axis=0)


def append_irrelevant_features(file, n):
Expand All @@ -137,10 +139,10 @@ def generate():
# -------generating dataset
# data = synthetic_cube_in_cube(rows, rel_features, irrel_features, 'l')
# data__ = synthetic_cjs()
data = correlated_data(rows, rel_features + irrel_features, 1, func1)
#
# data = correlated_data(rows, rel_features + irrel_features, 1, func1)
data = blobs(rows)
# # add zeroes as default class
data = np.concatenate((data, np.zeros((rows, 1))), axis=1)
data = np.concatenate((data, np.zeros((data.shape[0], 1))), axis=1)
# -------appending irrelevant features to existing dataset
# data = append_irrelevant_features(source, irrel_features)
# storing to disk
Expand All @@ -163,14 +165,14 @@ def generate():
# irrel_features = i
#
# generate()
file = 'synthetic_cases/uds_test_4.csv'
file = 'synthetic_cases/3d_4_blobs_2_aligned_xor.csv'

if os.path.isfile(file):
raise ValueError(file + " already exists!")

# parameters
rows = 4000
rel_features = 2
irrel_features = 2
irrel_features = 0

generate()
5 changes: 3 additions & 2 deletions experiments_logging.py
Expand Up @@ -2,7 +2,7 @@
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd

import data_generation as dg
from data_generation import synthetic_cube_in_cube


Expand Down Expand Up @@ -65,5 +65,6 @@ def write_cut_file(name, disc_intervals):
if __name__ == '__main__':
    # Load a generated synthetic dataset (semicolon-separated, '?' marks
    # missing values) and render it as a 3-D scatter plot.
    df = pd.read_csv("synthetic_cases/3d_4_blobs_1_aligned_xor.csv", delimiter=";", header=None, na_values='?')
    plot_data_3d(df)
38 changes: 29 additions & 9 deletions main.py
Expand Up @@ -42,7 +42,7 @@ def write(log, *args):
log.write('\n')


def plot_distances(dir, distances):
def plot_distances(dir, distances, disc_intervals):
dim_count = len(distances)
plt.figure(1)

Expand All @@ -58,6 +58,9 @@ def plot_distances(dir, distances):
# ax1.hist(distances, bins=100, color='c')
ax1.plot(dist[0], dist[1])
ax1.axhline(ID_threshold, color='b', linestyle='dashed', linewidth=1)
curr_macro_intervals = disc_intervals[curr]
for macro_id in range(1, len(curr_macro_intervals)):
ax1.axvline(curr_macro_intervals[macro_id][0], color='r')
ax1.set_title('dimension ' + str(curr))

plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25, wspace=0.35)
Expand Down Expand Up @@ -230,7 +233,7 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_
if len(sys.argv) == 1:
print(
'Usage: main.py -f=<data_file> -d=<delimiter> -c=<number of columns> -m=<[original|greedy_topk]> -cor=<[uds]> '
'-dist=<[id, cjs]>')
'-dist=<[id, cjs]> -t=<float>')
command = '-f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID'
print('Running default: ', command)
command_list = command.split(' ')
Expand All @@ -246,19 +249,34 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_
method_arg = list(filter(lambda x: x.startswith("-m="), command_list))
corr_measure_arg = list(filter(lambda x: x.startswith("-cor="), command_list))
distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list))
threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list))

data_file = file_arg[0].replace('-f=', '')
delimiter = delim_arg[0].replace('-d=', '') if delim_arg else ';'
if delim_arg:
delimiter = delim_arg[0].replace('-d=', '')
else:
print('using default delimiter ;')
delimiter = ';'
columns = int(columns_arg[0].replace('-c=', '')) if columns_arg else None
rows = int(rows_arg[0].replace('-r=', '')) if rows_arg else None
method = cst.Method[method_arg[0].replace('-m=', '').upper()] if method_arg else cst.Method.ORIGINAL
if method_arg:
method = cst.Method[method_arg[0].replace('-m=', '').upper()]
else:
print('using default method ORIGINAL')
method = cst.Method.ORIGINAL
cor_measure = cst.CorrelationMeasure[corr_measure_arg[0].replace('-cor=', '').upper()] if corr_measure_arg \
else None
if method is not cst.Method.ORIGINAL and cor_measure is None:
raise ValueError('A correlation measure should be given!')
distance_measure = cst.DistanceMeasure[
distance_measure_arg[0].replace('-dist=', '').upper()] if distance_measure_arg \
else cst.DistanceMeasure.ID
if distance_measure_arg:
distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()]
else:
print('using default distance measure ID')
distance_measure = cst.DistanceMeasure.ID
if threshold_arg:
cst.ID_THRESHOLD_QUANTILE = float(threshold_arg[0].replace('-t=', ''))
else:
print('using default ID_THRESHOLD_QUANTILE = ', str(cst.ID_THRESHOLD_QUANTILE))

# reading data from the file with delimiter and NaN values as "?"
data = pd.read_csv(data_file, delimiter=delimiter, header=None, na_values='?')
Expand All @@ -276,9 +294,11 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_
+ "_" + distance_measure.name \
+ ("_" + cor_measure.name if cor_measure else "") \
+ "_" + method.name \
+ "_" + str(cst.ID_THRESHOLD_QUANTILE) \
+ "_" + data_file_name \
+ ("_" + str(columns) if columns else "") \
+ ("_" + str(rows) if rows else "") + "/"
+ ("_" + str(rows) if rows else "") \
+ "/"
os.makedirs(dir)

print('output files are:', dir + '*')
Expand All @@ -290,7 +310,7 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_
cor_measure,
distance_measure, log)

plot_distances(dir, distances)
plot_distances(dir, distances, disc_intervals)

write_out_file(dir + cst.FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)

Expand Down

0 comments on commit 54f6d3e

Please sign in to comment.