adjusting to python 3.4.2
Tatiana Dembelova committed Aug 11, 2017
1 parent 49a3b2f commit f58cab5
Showing 10 changed files with 210 additions and 198 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@
 *.iml
 
 logs/*
+logs2/*
 *.png
 # Mobile Tools for Java (J2ME)
 .mtj.tmp/
4 changes: 4 additions & 0 deletions cjs.py
@@ -244,6 +244,10 @@ def compute_CJSs(bin_map, curr, data, dist_bins, dim_maxes):
     data_wo_curr.pop(curr)
     maxes_ = dim_maxes.drop(curr).to_frame().reset_index(drop=True)
 
+    return compute_CJSs1(bin_map, data_wo_curr, dist_bins, maxes_)
+
+
+def compute_CJSs1(bin_map, data_wo_curr, dist_bins, maxes_):
     cjs = []
     for bin_id, binn in enumerate(dist_bins[1:], start=1):
         bin_data = data_wo_curr.loc[bin_map == binn]
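Why the split: compute_CJSs now just drops the current dimension and delegates, so a caller that already holds the reduced DataFrame can go straight to compute_CJSs1. Below is a minimal preparation sketch; the toy values, bin labels, and bin ids are invented for illustration, and the final call is left commented because compute_CJSs1's loop body lies outside this diff.

    import pandas as pd

    # toy stand-ins (assumed shapes): 4 points, 2 dimensions
    data = pd.DataFrame({0: [0.1, 0.4, 0.8, 0.9], 1: [0.2, 0.3, 0.7, 0.5]})
    curr = 0                            # dimension currently under consideration
    bin_map = pd.Series([1, 1, 2, 2])   # assumed per-point bin labels
    dist_bins = [0, 1, 2]               # assumed bin ids; the loop skips index 0

    data_wo_curr = data.copy()
    data_wo_curr.pop(curr)              # drop the current dimension once, up front
    maxes_ = data.max().drop(curr).to_frame().reset_index(drop=True)

    # cjs = compute_CJSs1(bin_map, data_wo_curr, dist_bins, maxes_)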
9 changes: 7 additions & 2 deletions constants.py
@@ -15,7 +15,12 @@ class CorrelationMeasure(Enum):
     MAC = 3
 
 
-ID_THRESHOLD_QUANTILE = 0.8
+class DistanceMeasure(Enum):
+    ID = 1
+    CJS = 2
+
+
+ID_THRESHOLD_QUANTILE = 0.90
 ID_SLIDING_WINDOW = 40
 
 NORMALIZATION_RADIUS = 1
@@ -27,7 +32,7 @@ class CorrelationMeasure(Enum):
 # subspace mining parameters
 MAX_SUBSPACE_SIZE = 5
 HETEROGENEOUS_THRESHOLD=0.8
-BEAM_WIDTH=3
+BEAM_WIDTH=4
 
 # cjs
 CLUMP = 2
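The new DistanceMeasure enum sits alongside CorrelationMeasure and names the two distance notions used in this repo (interaction distance and CJS). A small dispatch sketch follows; the wiring is an assumption, not code from this commit — only the enum members and the two compute functions come from the diffs here.

    import constants as cst

    def distance_function(measure):
        # map a DistanceMeasure member to the matching compute function;
        # both target functions appear elsewhere in this commit's diffs
        if measure == cst.DistanceMeasure.ID:
            from interaction_distance import compute_IDs
            return compute_IDs
        if measure == cst.DistanceMeasure.CJS:
            from cjs import compute_CJSs
            return compute_CJSs
        raise ValueError("unknown distance measure: {}".format(measure))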
43 changes: 35 additions & 8 deletions data_generation.py
@@ -108,14 +108,41 @@ def synthetic_cjs():
                           axis=1)), axis=0)
 
 
+def append_irrelevant_features(file, n):
+    if n == 0:
+        raise ValueError("# of irrelevant features is 0")
+    data = pd.read_csv(file, delimiter=";", header=None, na_values='?')
+    rows = data.shape[0]
+    last_dim = data.shape[1] - 1
+    irrel_data = np.random.uniform(-0.5, 0.5, (rows, n))
+    return np.concatenate([data.loc[:, :last_dim - 1], irrel_data, data.loc[:, last_dim].to_frame()], axis=1) if (data[last_dim] == 0).all() \
+        else np.concatenate([data, irrel_data], axis=1)
+
+
+def generate():
+    # -------generating dataset
+    # data = synthetic_cube_in_cube(rows, rel_features, irrel_features, 'r')
+    # data__ = synthetic_cjs()
+    #
+    # # add zeroes as default class
+    # data = np.concatenate((data, np.zeros((rows, 1))), axis=1)
+    # -------appending irrelevant features to existing dataset
+    data = append_irrelevant_features(source, irrel_features)
+    # storing to disk
+    pd.DataFrame(data).to_csv(file, sep=';', header=False, index=False, float_format='%.2f')
+
+
 if __name__ == '__main__':
-    file = 'synthetic_cases/synthetic_cjs_nonsimilar.csv'
-    if os.path.isfile(file):
-        raise ValueError
+    for i in [1,2,3,4,5,10]:
+        file = 'synthetic_cases/synthetic_10d_parity_problem_' + str(i) + '.csv'
+        source = 'synthetic_cases/synthetic_10d_parity_problem.csv'
 
-    rows = 20000
-    # data__ = np.concatenate((synthetic_cube_in_cube(rows, 2, 0, 'r'), np.zeros((rows, 1))), axis=1)
-    # file = 'synthetic_cases/synthetic_3d_gauss2.csv'
-    data__ = synthetic_cjs()
+        if os.path.isfile(file):
+            raise ValueError(file + " already exists!")
+
+        # parameters
+        rows = 20000
+        rel_features = 2
+        irrel_features = 10
 
-    pd.DataFrame(data__).to_csv(file, sep=';', header=False, index=False, float_format='%.2f')
+        generate()
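The new append_irrelevant_features pads an existing dataset with uniform noise columns drawn from [-0.5, 0.5); if the last column is an all-zero class label, the noise is inserted before it rather than after. A toy run, with a hypothetical file name and made-up values:

    import pandas as pd

    from data_generation import append_irrelevant_features

    # two rows, two relevant features, trailing all-zero class column
    pd.DataFrame([[0.1, 0.2, 0.0],
                  [0.3, 0.4, 0.0]]).to_csv('toy.csv', sep=';', header=False, index=False)

    out = append_irrelevant_features('toy.csv', 2)
    # the last column was all zeros, so the two noise columns land before it:
    # [feature0, feature1, noise0, noise1, class]
    print(out.shape)  # (2, 5)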
38 changes: 19 additions & 19 deletions interaction_distance.py
@@ -8,25 +8,25 @@
 def compute_IDs(bin_map, curr, data, dist_bins, dim_maxes):
     data_wo_curr = data.copy()
     data_wo_curr.pop(curr)  # todo slow?
-    return _compute_IDs(bin_map, data_wo_curr, dim_maxes, dist_bins)
-
-
-def compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, cor_measure,
-                         method,
-                         k=cst.MAX_SUBSPACE_SIZE,
-                         delta=cst.HETEROGENEOUS_THRESHOLD,
-                         beam_width=cst.BEAM_WIDTH):
-    if method == cst.Method.GREEDY_TOPK:
-        subspace = sm.greedy_topk(data, curr, k, cor_measure)
-    elif method == cst.Method.HET_GREEDY_TOPK:
-        subspace = sm.het_greedy_topk(data, curr, k, delta, cor_measure)
-    # todo the rest of the methods
-    data = data.copy().loc[:, subspace]
-
-    return _compute_IDs(bin_map, data, dim_maxes, dist_bins)
-
-
-def _compute_IDs(bin_map, data, dim_maxes, dist_bins):
+    return compute_IDs1(bin_map, data_wo_curr, dim_maxes, dist_bins)
+#
+# deprecated
+# def compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, cor_measure,
+#                          method,
+#                          k=cst.MAX_SUBSPACE_SIZE,
+#                          delta=cst.HETEROGENEOUS_THRESHOLD,
+#                          beam_width=cst.BEAM_WIDTH):
+#     if method == cst.Method.GREEDY_TOPK:
+#         subspace = sm.greedy_topk(data, curr, k, cor_measure)
+#     elif method == cst.Method.HET_GREEDY_TOPK:
+#         subspace = sm.het_greedy_topk(data, curr, k, delta, cor_measure)
+#     # todo the rest of the methods
+#     data = data.copy().loc[:, subspace]
+#
+#     return compute_IDs1(bin_map, data, dim_maxes, dist_bins)
 
 
+def compute_IDs1(bin_map, data, dim_maxes, dist_bins):
     inner_bin_measures = []
     inter_bin_measures = []
     for bin_id, binn in enumerate(dist_bins):
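compute_IDs_extended is retired here, but its subspace trick survives: project the data onto a chosen column subset, then hand the projection to compute_IDs1 (the renamed _compute_IDs). A sketch under assumed inputs; the data and column subset are hypothetical, and the call is left commented because compute_IDs1's loop body lies outside this diff.

    import pandas as pd

    data = pd.DataFrame({0: [0.1, 0.9], 1: [0.2, 0.8],
                         2: [0.3, 0.7], 3: [0.4, 0.6]})  # toy 2x4 data
    subspace = [1, 3]                          # hypothetical column subset
    data_proj = data.copy().loc[:, subspace]   # same projection the deprecated code used
    # ids = compute_IDs1(bin_map, data_proj, dim_maxes, dist_bins)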
[Diffs for the remaining five changed files are not shown.]
