
Commit

added new baseline method, PREDEFINED_SUBSPACESETS_NAIVE
Tatiana Dembelova committed Oct 19, 2017
1 parent 1aa01ab commit f9ee841
Showing 11 changed files with 238 additions and 53 deletions.
63 changes: 63 additions & 0 deletions ID_correlation_measure.py
@@ -0,0 +1,63 @@
import numpy as np
import experiments_logging as log
import pandas as pd
import interaction_distance as id
import data_generator as dg

def evidence_ID():
    # no interaction
    b = np.matrix(np.random.uniform(1, 2, (4000, 1)))
    back = np.matrix(np.random.uniform(0, 2, (4000, 1)))
    res = np.append(b, back, axis=0)
    b1 = np.matrix(np.random.uniform(0, 2, (8000, 1)))

    # either horizontal or vertical tube
    # all = np.append(b1, res, axis=1)
    all = np.append(res, b1, axis=1)
    df = pd.DataFrame(all)
    df = df.sort_values(by=0).reset_index(drop=True)
    print(id.compute_ID(df.loc[:100, 1].to_frame(), df.loc[7900:8000, 1].to_frame(), [0, 2]))
    # log.plot_data_2d(df)
    # log.plot_data_2d(pd.concat([df.loc[:100], df.loc[7900:8000]], axis=0))

    # cube interaction
    b = np.matrix(np.random.uniform(1, 2, (4000, 1)))
    back = np.matrix(np.random.uniform(0, 2, (4000, 1)))
    res = np.append(b, back, axis=0)
    b1 = np.matrix(np.random.uniform(1, 2, (4000, 1)))
    back1 = np.matrix(np.random.uniform(0, 2, (4000, 1)))
    res1 = np.append(b1, back1, axis=0)

    all = np.append(res, res1, axis=1)
    df = pd.DataFrame(all)
    df = df.sort_values(by=0).reset_index(drop=True)
    print(id.compute_ID(df.loc[:100, 1].to_frame(), df.loc[7900:8000, 1].to_frame(), [0, 2]))
    # log.plot_data_2d(df)
    # log.plot_data_2d(pd.concat([df.loc[:100], df.loc[7900:8000]], axis=0))

    cg = dg.produce_cube_generator(7, 0, 2, "i", 1, ".csv")  # cubes=1 assumed; produce_cube_generator now expects a cubes argument
    data, file_name = cg.build()
    print(cg.subspaces)
    print(cg.perf_disc)

    data = pd.DataFrame(data)
    dim_count = data.shape[1]
    for curr in data.columns[:-1]:  # every dimension except the class column
        dims = data.columns.tolist()
        dims.remove(curr)
        dims.remove(dim_count - 1)
        curr_data = data.sort_values(by=curr).reset_index(drop=True).loc[:, dims]
        rows = curr_data.shape[0]
        print('curr dimension', curr)
        for dim in dims:
            counter = 0
            ids = []
            while True:
                if counter + 280 > rows:
                    break
                ids.append(id.compute_ID(curr_data.loc[counter:counter + 140, dim].to_frame(),
                                         curr_data.loc[counter + 140: counter + 280, dim].to_frame(), [2] * dim_count))
                counter += 1
            # TODO: needs data normalization
            print('interaction with', dim, np.average(ids))
            # break
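
The loop above slides two adjacent 140-row windows along each sorted dimension and averages id.compute_ID over all window positions. A minimal self-contained sketch of the same mechanics, with scipy.stats.ks_2samp standing in for compute_ID (an assumption made for illustration; the repository's ID measure is different, so only the shape of the computation carries over):

import numpy as np
import pandas as pd
from scipy.stats import ks_2samp


def windowed_interaction(df, sort_dim, target_dim, half_window=140, step=100):
    # sort by one dimension, then compare adjacent windows of another dimension
    s = df.sort_values(by=sort_dim).reset_index(drop=True)
    stats = []
    for start in range(0, len(s) - 2 * half_window, step):
        a = s.loc[start:start + half_window, target_dim]
        b = s.loc[start + half_window:start + 2 * half_window, target_dim]
        stats.append(ks_2samp(a, b).statistic)
    return np.average(stats)


# 'tube' data as in evidence_ID: dim 1 is uniform everywhere, so adjacent windows
# look alike and the averaged statistic stays low; on the 'cube' data the windows
# on either side of the cube boundary differ and the average rises
rng = np.random.default_rng(0)
tube = pd.DataFrame({0: np.append(rng.uniform(1, 2, 4000), rng.uniform(0, 2, 4000)),
                     1: rng.uniform(0, 2, 8000)})
print(windowed_interaction(tube, 0, 1))
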
12 changes: 9 additions & 3 deletions constants.py
@@ -15,6 +15,7 @@ class Method(Enum):
    PREDEFINED_SUBSPACESETS = 9  # the subspace sets gradually increase the number of dimensions in one randomly chosen subspace; subspace sets are chosen with a step of 2
    PREDEFINED_SUBSPACESETS_SYNCHRONOUS_GREEDY = 13  # the subspace sets gradually increase the number of dimensions in all the subspaces; if a subspace has been used up, it extends to irrelevant dimensions
    PREDEFINED_SUBSPACESETS_SYNCHRONOUS_OPTIMAL = 14  # the subspace sets gradually increase the number of dimensions in all the subspaces; if a subspace has been used up, it stays the same
    PREDEFINED_SUBSPACESETS_NAIVE = 15  # the dimensions (relevant and irrelevant) are shuffled and split into equal-size chunks, see compute_predefined_subspace_sets_naive in main.py

    PREDEFINED_OPTIMAL_SUBSPACESET = 10
    PREDEFINED_OPTIMAL_SUBSPACESET_AND_IRRELEVANT = 11
@@ -72,9 +73,12 @@ class DistanceMeasure(Enum):

# new settings (more constrained)
IRRELEVANT_FEATURES_RANGE_LIST = [0, 1, 2, 4, 8, 16, 32, 64, 99, 3, 6, 12, 24, 48, 82] if socket.gethostname() != 'push' else [0, 1, 2, 3]
RELEVANT_FEATURES_RANGE_LIST = [2, 3, 4, 6, 8, 12, 16, 23, 30] if socket.gethostname() == 'push' else [2]
RELEVANT_FEATURES_RANGE_LIST = [2, 3, 4, 6, 8, 12, 16, 23, 30] if socket.gethostname() == 'push' else [7]
INTERACTION_NUMBER_RANGE_LIST = [1, 2, 4, 8, 10] if socket.gethostname() == 'push' else [3]
INTERACTION_TYPES_RANGE_LIST=["c", 'i', "io"] if socket.gethostname() == 'push' else ['c']
INTERACTION_TYPES_RANGE_LIST=["c", 'i', "io"] if socket.gethostname() == 'push' else ['i']
CUBES_LOWER_BOUND=1
CUBES_UPPER_BOUND=3
NAIVE_CHUNKS_NUMBER_RANGE_LIST = [2, 3, 4, 5, 10, 20, 30]


BASE = '/local/tmp/ipd_extended_experiments2/' if socket.gethostname() == 'push' \
@@ -91,4 +95,6 @@ class DistanceMeasure(Enum):
SLIM_CONVERT_CONF = SLIM_BASE + "branches/slim/trunk/convertdb.conf"

PRECISION_RECALL_FILENAME = "Precision_recall_runtime.csv"
COMPRESSION_FILENAME = "Compression.csv"
COMPRESSION_FILENAME = "Compression.csv"

WEKA_BIN = "/local/tmp/ipd_extended_experiments2/weka/weka-3-9-1/weka.jar" if socket.gethostname() == 'push' else "/Users/tatyanadembelova/Downloads/weka-3-9-1/weka.jar"
15 changes: 15 additions & 0 deletions data_generation.py
@@ -8,17 +8,32 @@

# synthetic case from uds
def correlated_data(m, n, sigma, f):
    # l1 = int(n / 2)
    # l2 = n - l1
    # Z = np.random.normal(0, 1, (m, l1))
    # A = np.matrix(np.random.uniform(1, 2, (l1, l1)))
    # X1 = Z * A
    # B = np.matrix(np.random.uniform(1, 2, (l1, l2)))
    # W = X1 * B
    # E = np.random.normal(0, sigma, (m, l2))
    # X2 = f(W) + E
    # result = np.append(X1, X2, axis=1)
    # print(result)

    l1 = int(n / 2)
    l2 = n - l1
    Z = np.random.normal(0, 1, (m, l1))
    A = np.matrix(np.random.uniform(1, 2, (l1, l1)))
    X1 = Z * A
    # A = np.matrix(np.random.uniform(1, 2, (m, l1)))
    # X1 = A
    B = np.matrix(np.random.uniform(1, 2, (l1, l2)))
    W = X1 * B
    E = np.random.normal(0, sigma, (m, l2))
    X2 = f(W) + E
    result = np.append(X1, X2, axis=1)
    print(result)

    return result


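correlated_data draws X1 = Z * A from correlated Gaussians and sets X2 = f(X1 * B) + E, so the last l2 columns are noisy functions of the first l1. A hedged usage sketch (np.sin stands in for the module's f; the real callers pass e.g. func3, whose definition is outside this diff):

import numpy as np
import data_generation as dg

# 1000 rows, 4 columns, noise sigma 0.1; np.sin is an illustrative nonlinearity
X = dg.correlated_data(1000, 4, 0.1, np.sin)
print(X.shape)  # (1000, 4): columns 0-1 are X1, columns 2-3 are X2 = f(X1 * B) + E
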
42 changes: 24 additions & 18 deletions data_generator.py
@@ -132,24 +132,25 @@ def generate_overlap_partition(rf, c):
return partition


def produce_data_generator(rf, irf, c, type, file_name):
def produce_cube_generator(rf, irf, interactions, type, cubes, file_name):
    dg = CubesGenerator(rf, irf, RADIUS, file_name)
    # same number of records for each of the cubes + background
    cube_rows = int(ROWS / (c + 1))
    # same number of records for each of the interactions * cubes + background
    cube_rows = int(ROWS / (interactions * cubes + 1))
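    # e.g. with illustrative values ROWS = 7000, interactions = 2, cubes = 2:
    # cube_rows = 7000 // (2 * 2 + 1) = 1400 rows per cube, plus one 1400-row background block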
    if type == 'c':
        partition = [range(rf) for i in range(c)]
        partition = [range(rf) for i in range(interactions)]
    elif type == 'i':
        partition = generate_partition(rf, c)
        partition = generate_partition(rf, interactions)
    elif type == 'io':
        partition = generate_overlap_partition(rf, c)
        partition = generate_overlap_partition(rf, interactions)
    else:
        raise ValueError("no such type!")

    for p in partition:
        location = dict()
        for j in p:
            location[j] = (random.uniform(0, 1) * (RADIUS * 2 - 1) - RADIUS, CUBE_WIDTH)
        dg.add_cube_parameter(CubeParameters(cube_rows, location))
        for cube in range(cubes):
            location = dict()
            for j in p:
                location[j] = (random.uniform(0, 1) * (RADIUS * 2 - 1) - RADIUS, CUBE_WIDTH)
            dg.add_cube_parameter(CubeParameters(cube_rows, location))
    dg.add_cube_parameter(CubeParameters(cube_rows))
    return dg
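
# For intuition: the three interaction types differ only in how the rf relevant
# features are grouped (shapes below are illustrative; generate_partition and
# generate_overlap_partition are defined outside this hunk):
#   'c'  with rf=7, interactions=3 -> [range(7), range(7), range(7)]
#   'i'  -> e.g. [[0, 1], [2, 3], [4, 5, 6]]        (disjoint partition)
#   'io' -> e.g. [[0, 1, 2], [2, 3], [3, 4, 5, 6]]  (chunks may share dimensions)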

@@ -167,13 +168,13 @@ def produce_all_data_generators():
    perf_subspaces = dict()
    perf_discs = dict()

    def produce_dg(name, rf, c, type):
    def produce_dg(name, rf, i, type, cubes):

        # if os.path.exists(basedir + name) and os.path.exists(
        #         perf_disc_dir + 'cut_' + name.replace('csv', 'txt')):
        #     continue
        if os.path.exists(basedir + name) and os.path.exists(
                perf_disc_dir + 'cut_' + name.replace('csv', 'txt')):
            return

        dg = produce_data_generator(rf, cst.IRRELEVANT_FEATURES, c, type, name + ".csv")
        dg = produce_cube_generator(rf, cst.IRRELEVANT_FEATURES, i, type, cubes, name + ".csv")
        perf_discs[name] = dg.get_discs()
        perf_subspaces[name] = dg.get_subspaces()
        data_generators.append(dg)
@@ -213,8 +214,13 @@ def store(data):


if __name__ == '__main__':
    cg = produce_cube_generator(7, 2, 3, 'c', 1, 'bla')  # cubes=1 assumed to satisfy the new six-argument signature
    print(cg.subspaces)
    cg = produce_cube_generator(7, 2, 3, 'i', 1, 'bla')
    print(cg.subspaces)
    # print(generate_overlap_partition(7, 3))
    generators = produce_all_data_generators()
    for g in generators:

        store(g.build())
    # generators = produce_all_data_generators()
    # for g in generators:
    #
    #     store(g.build())
2 changes: 1 addition & 1 deletion discretization_quality_measure.py
@@ -166,7 +166,7 @@ def prepare_compression1(experiment_name):
return False
return True

def run_compression1(name, rf=None, c=None, type=None):
def run_compression1(name, rf=None, i=None, type=None, c=None):
    # 1. check slim db
    # convert dat-file to db-file if it does not exist
    if not os.path.exists(cst.SLIM_DATA_DIR + name + "/" + name + ".db"):
Expand Down
6 changes: 4 additions & 2 deletions experiments_logging.py
@@ -1,6 +1,7 @@
import matplotlib.pyplot as plt
import pandas as pd
import discretization_quality_measure as dq
import data_generation as dg
from mpl_toolkits.mplot3d import Axes3D
import data_generation as dg_old
import util
@@ -86,7 +87,7 @@ def save_plot_data_3d(f, data):


def plot_data_2d(data):
    plt.scatter(data[1], data[2], s=1, c='k')
    plt.scatter(data[0], data[1], s=1, c='k')
    plt.xlabel("dim 0")
    plt.ylabel("dim 1")
    plt.show()
@@ -158,7 +159,8 @@ def get_cuts(disc_intervals):
# data = pd.read_csv("synthetic_cases/blobs/3d_3_blobs_aligned.csv", delimiter=";", header=None, na_values='?')
# data = pd.read_csv("new_cubes/cubes_10_100_03_i.csv", delimiter=";", header=None, na_values='?')
# data = pd.read_csv("new_cubes/cubes_02_03_c.csv", delimiter=";", na_values='?', header=None)
data = pd.read_csv('synthetic_cases/uds_new.csv', delimiter=',', header=None)
# data = pd.read_csv('synthetic_cases/uds_new.csv', delimiter=',', header=None)
data = pd.DataFrame(dg.correlated_data(4000, 2, 0.1, dg.func3))
# data = pd.DataFrame(dg_old.correlated_data(4000, 3, 0.5, dg_old.func3))
# data = pd.DataFrame(dg.cubes(4000))
plot_data_2d(data)
24 changes: 20 additions & 4 deletions main.py
@@ -447,6 +447,20 @@ def compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subs
    return subspace_sets


def compute_predefined_subspace_sets_naive(rel_features):
    dims = list(range(rel_features + cst.IRRELEVANT_FEATURES))
    random.shuffle(dims)
    subspace_sets = []
    for chunks in cst.NAIVE_CHUNKS_NUMBER_RANGE_LIST:
        ss = list(util.chunks(dims, chunks))
        # merge the last chunk into the previous subspace if it consists of only 1 dimension
        if len(ss[-1]) == 1:
            ss[-2].extend(ss[-1])
            del ss[-1]
        subspace_sets.append(ss)
    return subspace_sets
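
# Illustration (assuming util.chunks(lst, n) yields successive n-sized chunks):
# 10 shuffled dims with chunk size 3 give chunk sizes [3, 3, 3, 1]; the trailing
# 1-dim chunk is merged into its predecessor, leaving subspaces of sizes [3, 3, 4].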


def compute_subspace_sets(data_file_name, method):
    rel_features = util.parse_relevant_features(data_file_name)
    ideal_subspace_set = get_ideal_subspace_set(data_file_name)
@@ -476,9 +490,11 @@ def compute_subspace_sets(data_file_name, method):
        return compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subspace_set, True)
    elif method is cst.Method.PREDEFINED_SUBSPACESETS_SYNCHRONOUS_OPTIMAL:
        return compute_predefined_subspace_sets_synchronous_greedy(rel_features, ideal_subspace_set, False)
    elif method is cst.Method.PREDEFINED_SUBSPACESETS_NAIVE:
        return compute_predefined_subspace_sets_naive(rel_features)

    else:
        raise ValueError("wrong method!")
        raise ValueError("the method has not been implemented yet! " + str(method))


def execute(param, loader=None):
@@ -709,9 +725,9 @@ def prepare(base_dir, data_file, method, time_mark=False, delim=";", columns=Non
    return params


def collect_dataset_params(base_dir):
def collect_experiment_params(base_dir):

    def collect(name, rf, c, type):
    def collect(name, rf, i, type, c):
        params = []
        file_path = cst.DATA_DIR + name + ".csv"

@@ -753,7 +769,7 @@ def collect(name, rf, c, type):
# cubes_03_10_c
# print(compute_predefined_subspace_sets(3, [[0,1,2]]))
# exit(1)
params = collect_dataset_params("logs_test3")
params = collect_experiment_params("logs_test3")
# print(params)
# print(compute_subspace_sets("cubes_10_03_i.csv", cst.Method.PREDEFINED_SUBSPACESETS))
# exit(1)
15 changes: 8 additions & 7 deletions runExperiment.py
@@ -18,6 +18,7 @@
import main
import pandas as pd
import discretization_quality_measure as dqm
import data_generator as dg

newRun = None
nbThreads = int(multiprocessing.cpu_count() / 2)
@@ -48,11 +49,6 @@

items = multiprocessing.Queue()

# todo items.put(WHATEVER PARAMETERS OF TASK)
# data_generators = dg.produce_all_data_generators()
# for data_generator in data_generators:
# items.put(data_generator)

class UnregisteredItem(Exception):
    pass

@@ -97,7 +93,12 @@ def register_ideal_disc(self, name):

loader = Loader()

params = main.collect_dataset_params("logs_test")
# todo items.put(WHATEVER PARAMETERS OF TASK)
# params = dg.produce_all_data_generators()
# for data_generator in params:
# items.put(data_generator)

params = main.collect_experiment_params("logs_test")
if len(params) == 0:
    print("no parameters collected!")
    exit(0)
@@ -160,7 +161,7 @@ def datasetWriter():
    while True:
        try:
            result = datasets.get(block=True, timeout=10)
            # dg.store(dataset)
            # dg.store(result)
            main.store(result, loader)
        except queue.Empty:
            break
39 changes: 39 additions & 0 deletions run_classification.py
@@ -0,0 +1,39 @@
import constants as cst
import subprocess as sp
import re
import os


def run_random_forest1(base_dir_name, experiment_name):
    file_path = cst.BASE + base_dir_name + "/" + experiment_name + "/out.arff"
    if not os.path.exists(file_path):
        return None
    try:
        output = sp.check_output(["java", "-cp", cst.WEKA_BIN,
                                  "weka.classifiers.trees.RandomForest", '-P', '100', '-I',
                                  '100', '-num-slots', '1', '-K', '0', '-M', '1.0', '-V', '0.001', '-S', '1',
                                  "-t", file_path], timeout=30).decode()
        match = re.search(r'Correctly Classified Instances\s+\d+\s+(\d+\.\d+)\s+%', output)
        if match:
            return experiment_name + "," + match.group(1)
        return experiment_name + ",?"
    except sp.TimeoutExpired:
        print("timeout exceeded", experiment_name)
        return experiment_name + ",?"


def classify_experiments(base_dir_name):
    results = []
    for experiment in os.listdir(cst.BASE + base_dir_name):
        if 'cubes' not in experiment:
            continue
        classification = run_random_forest1(base_dir_name, experiment)
        if classification is None:
            # out.arff is missing for this experiment; skip it instead of writing None
            continue
        results.append(classification)
        results.append("\n")
    return results

if __name__ == '__main__':
    base_dir_name = "logs_test"
    res = classify_experiments(base_dir_name)
    with open(cst.BASE + base_dir_name + "/Classification.csv", "w") as f:
        f.writelines(res)
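
A hedged usage sketch for a single experiment (the experiment name below is made up; run_random_forest1 returns "name,accuracy", "name,?" on timeout or unparsable output, and None when out.arff is missing):

import run_classification as rc

line = rc.run_random_forest1("logs_test", "cubes_07_03_i_example")
print(line)  # e.g. "cubes_07_03_i_example,94.25"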
