adjusting to python 3.4.2 - refactored binning module
Tatiana Dembelova committed Aug 12, 2017
1 parent f58cab5 commit a4ab83c
Showing 9 changed files with 75 additions and 72 deletions.
10 changes: 6 additions & 4 deletions cjs.py
@@ -110,7 +110,7 @@ def dim_optimal_disc(prev, curr, I1, I2, binA, binB, maxes):
# equal frequency binning
global_min = min(binA[prev].min(), binB[prev].min())
binning = Binning(binA, prev, DEFAULT_BINS_COUNT, global_min)
points2binA_map = binning.equal_frequency_binning4()
points2binA_map = binning.equal_frequency_binning_duplicate_drop()
points2binB_map = binning.interpolate(binB)

# cjs discretizations and values
@@ -239,16 +239,18 @@ def _compute_CJS(binA, binB, maxes):
print(str(compute_CJS(binA, binB, pd.DataFrame(np.max(data[attrs]).transpose().reset_index(drop=True)))))


def compute_CJSs(bin_map, curr, data, dist_bins, dim_maxes):
def compute_CJSs(bin_map, curr, data, dim_maxes):
data_wo_curr = data.copy()
data_wo_curr.pop(curr)
maxes_ = dim_maxes.drop(curr).to_frame().reset_index(drop=True)

return compute_CJSs1(bin_map, data_wo_curr, dist_bins, maxes_)
return compute_CJSs1(bin_map, data_wo_curr, maxes_)


def compute_CJSs1(bin_map, data_wo_curr, dist_bins, maxes_):
def compute_CJSs1(bin_map, data_wo_curr, maxes_):
cjs = []
# distinct bins
dist_bins = bin_map.cat.categories
for bin_id, binn in enumerate(dist_bins[1:], start=1):
bin_data = data_wo_curr.loc[bin_map == binn]
prev_bin_data = data_wo_curr.loc[bin_map == dist_bins[bin_id - 1]]
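Note: compute_CJSs1 above (and compute_IDs1 in interaction_distance.py below) now recovers the distinct bins from the bin map's own categories instead of receiving a dist_bins argument. The following is a minimal sketch of that pattern with a made-up single-column Series; it is an illustration, not code from the commit.

    import numpy as np
    import pandas as pd

    values = pd.Series(np.random.rand(20))             # hypothetical column
    bin_map = pd.qcut(values.rank(method='first'), 4)  # equal-frequency bin map over ranks

    dist_bins = bin_map.cat.categories                 # ordered, distinct bin labels
    for bin_id, binn in enumerate(dist_bins[1:], start=1):
        bin_data = values.loc[bin_map == binn]                        # points in the current bin
        prev_bin_data = values.loc[bin_map == dist_bins[bin_id - 1]]  # points in the previous bin
        # ...a pairwise distance (ID or CJS) between the two groups would be computed here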
59 changes: 28 additions & 31 deletions correlation_measures/binning.py
@@ -1,31 +1,39 @@
import pandas as pd
import numpy as np
import pandas.core.algorithms as algos
import re


class Binning:
def __init__(self, data, dim=None, desired_bins_count=None, global_min=None):
self.desired_bins_count = desired_bins_count if desired_bins_count is None or data.shape[0] > desired_bins_count\
def __init__(self, data, dim, desired_bins_count, global_min=None):
self.desired_bins_count = desired_bins_count if desired_bins_count is None or data.shape[0] > desired_bins_count \
else data.shape[0]
self.dim = dim
self.data = data
self.rank_data = data.rank(method='first')
self.global_min = global_min


# todo old (small reminder) in the original ipd it is NOT equal binning
# Series of binned points (with dropDuplicates produces not equally frequent bins)
def equal_frequency_binning(self, dim, bins_count):
return pd.qcut(self.rank_data.sort_values(by=dim)[dim], bins_count)

def equal_frequency_binning2(self):
qcut = pd.qcut(self.rank_data.sort_values(by=self.dim)[self.dim], self.desired_bins_count)
self.qcut = qcut.cat.rename_categories([i for i in range(self.desired_bins_count)]).reindex(qcut.index)
def equal_frequency_binning_by_rank(self):
self.rank_data = self.data.rank(method='first')
self.bins_count = self.desired_bins_count
self.qcut = pd.qcut(self.rank_data[self.dim], self.bins_count)
return self.qcut

def equal_frequency_binning_by_rank_int_categories(self):
self.equal_frequency_binning_by_rank()
self.qcut = self.qcut.cat.rename_categories([i for i in range(self.desired_bins_count)]).reindex(
self.qcut.index)
return self.qcut

def equal_frequency_binning4(self):
qcut = pd.qcut(self.data[self.dim], self.desired_bins_count, duplicates='drop')
qcut = qcut.cat.remove_unused_categories()
bounds = [c.right for c in qcut.cat.categories]
def equal_frequency_binning_duplicate_drop(self):
# todo python361
# qcut = pd.qcut(self.data[self.dim], self.desired_bins_count, duplicates='drop')
# todo python 342
qcut = self._compute_qcut()

# qcut = qcut.cat.remove_unused_categories()
bounds = [float(re.search(', (-*\d+\.*\d*)', c).group(1)) for c in qcut.cat.categories]
# including global_min with a margin of 1
bounds.insert(0, self.global_min - 1)
self.bounds = pd.Series(bounds)
@@ -34,22 +34,12 @@ def equal_frequency_binning4(self):
self.qcut = qcut.cat.rename_categories([i for i in range(self.bins_count)]).reindex(qcut.index)
return self.qcut

def equal_frequency_binning3(self, dim, desired_bins_count):
qcut = pd.qcut(self.data[dim], desired_bins_count, duplicates='drop')
self.qcut = qcut.cat.rename_categories([i for i in range(desired_bins_count)]).reindex(qcut.index)
return self.qcut

def get_bounds(self, global_min):
groupby = self.qcut.reset_index().groupby(0)

self.bounds = pd.Series(pd.unique(pd.concat([pd.Series(global_min - 1),
self.data.loc[groupby.last()['index'], self.dim]], axis=0))) \
.reset_index(drop=True)
return self.bounds


def get_rank_data(self):
return self.rank_data
def _compute_qcut(self):
quantiles = np.linspace(0, 1, self.desired_bins_count + 1)
bins = algos.quantile(self.data[self.dim], quantiles)
bins = pd.unique(bins)
qcut = pd.cut(self.data[self.dim], bins, include_lowest=True)
return qcut

def interpolate(self, other_bin):
if self.bounds is None:
@@ -62,4 +60,3 @@ def interpolate(self, other_bin):

data_ = pd.cut(other_col, self.bounds)
return data_.cat.rename_categories([i for i in range(self.bins_count)]).reindex(data_.index)
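Note: the refactored module replaces pd.qcut(..., duplicates='drop'), which is only available in newer pandas (the "# todo python361" branch), with the manual _compute_qcut, and it extracts bin edges from the category labels with a regex because under the older pandas used with Python 3.4.2 the categories are plain strings such as '(0.25, 0.5]' rather than Interval objects with a .right attribute. Below is a self-contained sketch of the same technique, with made-up data and np.quantile standing in for pandas.core.algorithms.quantile:

    import re
    import numpy as np
    import pandas as pd

    col = pd.Series([0.1, 0.1, 0.1, 0.4, 0.5, 0.8, 0.9, 0.9])  # hypothetical dimension
    desired_bins_count = 4

    # 1. Compute quantile edges by hand and drop duplicates -- the manual
    #    equivalent of pd.qcut(col, desired_bins_count, duplicates='drop').
    quantiles = np.linspace(0, 1, desired_bins_count + 1)
    edges = pd.unique(np.quantile(col, quantiles))
    qcut = pd.cut(col, edges, include_lowest=True)

    # 2. Recover the right edge of every bin. With Interval categories this is
    #    simply c.right; with the string labels of older pandas a regex is needed.
    labels = [str(c) for c in qcut.cat.categories]             # e.g. '(0.1, 0.45]'
    rights = [float(re.search(r', (-*\d+\.*\d*)', s).group(1)) for s in labels]

    # 3. Prepend the global minimum with a margin of 1, so a later pd.cut over
    #    these bounds also catches the smallest value.
    bounds = pd.Series([col.min() - 1] + rights)
    print(bounds)

In this example the three duplicated values at 0.1 collapse the result to three bins instead of four, which is the behaviour duplicates='drop' would give on newer pandas.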

7 changes: 4 additions & 3 deletions interaction_distance.py
@@ -5,10 +5,10 @@
import constants as cst


def compute_IDs(bin_map, curr, data, dist_bins, dim_maxes):
def compute_IDs(bin_map, curr, data, dim_maxes):
data_wo_curr = data.copy()
data_wo_curr.pop(curr) # todo slow?
return compute_IDs1(bin_map, data_wo_curr, dim_maxes, dist_bins)
return compute_IDs1(bin_map, data_wo_curr, dim_maxes)
#
# deprecated
# def compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, cor_measure,
@@ -26,7 +26,8 @@ def compute_IDs(bin_map, curr, data, dist_bins, dim_maxes):
# return compute_IDs1(bin_map, data, dim_maxes, dist_bins)


def compute_IDs1(bin_map, data, dim_maxes, dist_bins):
def compute_IDs1(bin_map, data, dim_maxes):
dist_bins = bin_map.cat.categories
inner_bin_measures = []
inter_bin_measures = []
for bin_id, binn in enumerate(dist_bins):
43 changes: 21 additions & 22 deletions main.py
@@ -61,14 +61,14 @@ def plot_distances(dir, distances):
plt.savefig(dir + 'distances.png', format='png')


def compute_distances(bin_map, curr, data, dist_bins, dim_maxes,
def compute_distances(bin_map, curr, data, dim_maxes,
cor_measure, method, distance_measure,
k=cst.MAX_SUBSPACE_SIZE,
delta=cst.HETEROGENEOUS_THRESHOLD,
beam_width=cst.BEAM_WIDTH):
if method == cst.Method.ORIGINAL:
return id.compute_IDs(bin_map, curr, data, dist_bins, dim_maxes) if distance_measure == cst.DistanceMeasure.ID \
else cjs.compute_CJSs(bin_map, curr, data, dist_bins, dim_maxes)
return id.compute_IDs(bin_map, curr, data, dim_maxes) if distance_measure == cst.DistanceMeasure.ID \
else cjs.compute_CJSs(bin_map, curr, data, dim_maxes)

if method == cst.Method.GREEDY_TOPK:
subspace = sm.greedy_topk(data, curr, k, cor_measure)
@@ -77,8 +77,8 @@ def compute_distances(bin_map, curr, data, dist_bins, dim_maxes,
# todo the rest of the methods
data = data.copy().loc[:, subspace]

return id.compute_IDs1(bin_map, data, dim_maxes, dist_bins) if distance_measure == cst.DistanceMeasure.ID \
else cjs.compute_CJSs1(bin_map, data, dist_bins, dim_maxes)
return id.compute_IDs1(bin_map, data, dim_maxes) if distance_measure == cst.DistanceMeasure.ID \
else cjs.compute_CJSs1(bin_map, data, dim_maxes)


def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure=None,
@@ -107,28 +107,27 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure
dim_maxes = norm_data.max(0)
disc_macro_intervals = []
disc_points = []
orig_binning = Binning(norm_data)
rank_data = orig_binning.get_rank_data()

distancez = []
# iterate over all the dimensions
for curr in range(dim_count):
bin_map = orig_binning.equal_frequency_binning(curr, init_bins_count)

# distinct bins
dist_bins = bin_map.unique()
binning = Binning(norm_data, curr, init_bins_count)
bin_map = binning.equal_frequency_binning_by_rank()
dist_bins = bin_map.cat.categories

# -----------------------------INTERACTION DISTANCES----------------------------------

distances = compute_distances(bin_map, curr, norm_data, dist_bins, dim_maxes, cor_measure, method,
distances = compute_distances(bin_map, curr, norm_data, dim_maxes, cor_measure, method,
distance_measure)
# todo python361
# distancez.append([[data.loc[rank_data[rank_data[curr] == math.floor(dist_bins[i].right)].index.tolist()[0], curr] for i in
# range(len(distances))], distances])
# todo python342
distancez.append(
[[data.loc[rank_data[rank_data[curr] == math.floor(float(re.search(', (\d+\.*\d*?)', dist_bins[i]).group(1)))].index.tolist()[0], curr] for i in
range(len(distances))], distances])
distancez.append([[data.loc[binning.rank_data[binning.rank_data[curr]
== math.floor(float(re.search(', (-*\d+\.*\d*)',
dist_bins[i]).group(1)))]
.index.tolist()[0], curr] for i in range(len(distances))], distances])

ID_threshold = id.compute_ID_threshold(distances)
# todo ext compute sliding average and count ID peaks above the avg (in a sliding window)
# ID_peaks = id.compute_sliding_count(distances, ID_threshold)
@@ -145,8 +144,8 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure

min_id = np.argmin(F[-1])

(curr_macro_intervals, curr_macro_points) = get_discretized_points(curr, data, discretizations, dist_bins,
min_id, rank_data)
(curr_macro_intervals, curr_macro_points) = get_discretized_points(curr, data, discretizations,
dist_bins, min_id, binning.rank_data)

write(log, '-------------------------')
write(log, 'dimension:', curr)
@@ -188,10 +187,10 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_

# todo python342
right = \
data.loc[rank_data[rank_data[curr] == math.floor(float(re.search(', (\d+\.*\d*?)', dist_bins[micro_bin_id]).group(1)))][curr].index[0]][curr]
data.loc[rank_data[rank_data[curr] == math.floor(float(re.search(', (-*\d+\.*\d*)', dist_bins[micro_bin_id]).group(1)))][curr].index[0]][curr]
if not len(macro_interval):
macro_interval.append(
data.loc[rank_data[rank_data[curr] == math.ceil(float(re.search('(\d+\.*\d*?),', dist_bins[micro_bin_id]).group(1)))][curr].index[0]][
data.loc[rank_data[rank_data[curr] == math.ceil(float(re.search('(-*\d+\.*\d*),', dist_bins[micro_bin_id]).group(1)))][curr].index[0]][
curr])
macro_interval.append(right)
else:
@@ -210,7 +209,7 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_
print(
'Usage: main.py -f=<data_file> -d=<delimiter> -c=<number of columns> -m=<[original|greedy_topk]> -cor=<[uds]> '
'-dist=<[id, cjs]>')
command = '-f=synthetic_cases/synthetic_2d_parity_problem.csv -d=; -dist=ID'
command = '-f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID'
print('Running default: ', command)
command_list = command.split(' ')
else:
@@ -263,8 +262,8 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_

with open(log_file, 'w') as log:
disc_intervals, disc_points, class_labels, distances = compute_optimal_discretization(data, method,
distance_measure,
cor_measure, log)
cor_measure,
distance_measure, log)

plot_distances(dir, distances)

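Note: because the bin map in the loop above is built over ranks (equal_frequency_binning_by_rank), a bin's right edge is a rank rather than a data value; the distancez bookkeeping and get_discretized_points translate it back by looking up the row that holds that rank in binning.rank_data. A simplified, hypothetical illustration of that lookup (it assumes Interval categories from a newer pandas; the commit itself parses the string labels with the regex shown in binning.py):

    import math
    import numpy as np
    import pandas as pd

    data = pd.DataFrame({0: np.random.rand(100)})  # made-up data, one dimension
    curr = 0
    init_bins_count = 10

    rank_data = data.rank(method='first')
    bin_map = pd.qcut(rank_data[curr], init_bins_count)
    dist_bins = bin_map.cat.categories

    for binn in dist_bins:
        right_rank = math.floor(binn.right)    # a rank, since binning was done on ranks
        row = rank_data[rank_data[curr] == right_rank].index[0]
        right_value = data.loc[row, curr]      # raw value at the bin's upper edge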
5 changes: 3 additions & 2 deletions 3d_parity_statistics.py → old/3d_parity_statistics.py
@@ -15,10 +15,11 @@
def average_id(bin1, bin2, dim_maxes):
data0 = data[bin1]
data1 = data[bin2]
# todo fix
orig_binning0 = Binning(data0)
orig_binning1 = Binning(data1)
bin0_map = orig_binning0.equal_frequency_binning(0, int(data0.shape[0] / 141))
bin1_map = orig_binning1.equal_frequency_binning(0, int(data1.shape[0] / 141))
bin0_map = orig_binning0.equal_frequency_binning_by_rank(0, int(data0.shape[0] / 141))
bin1_map = orig_binning1.equal_frequency_binning_by_rank(0, int(data1.shape[0] / 141))
# distinct bins
dist_bins0 = bin0_map.unique()
dist_bins1 = bin1_map.unique()
5 changes: 3 additions & 2 deletions 4d_parity_statistics.py → old/4d_parity_statistics.py
@@ -15,10 +15,11 @@
def average_id(bin1, bin2, dim_maxes):
data0 = data[bin1]
data1 = data[bin2]
# todo fix
orig_binning0 = Binning(data0)
orig_binning1 = Binning(data1)
bin0_map = orig_binning0.equal_frequency_binning(0, int(data0.shape[0] / 141))
bin1_map = orig_binning1.equal_frequency_binning(0, int(data1.shape[0] / 141))
bin0_map = orig_binning0.equal_frequency_binning_by_rank(0, int(data0.shape[0] / 141))
bin1_map = orig_binning1.equal_frequency_binning_by_rank(0, int(data1.shape[0] / 141))
# distinct bins
dist_bins0 = bin0_map.unique()
dist_bins1 = bin1_map.unique()
Empty file added old/__init__.py
Empty file.
5 changes: 3 additions & 2 deletions temp_exp.py → old/temp_exp.py
@@ -52,8 +52,9 @@ def id_exceeds_experiment(data, method=cst.Method.ORIGINAL, cor_measure=None):

disc_macro_intervals = []
disc_points = []
# todo fix
orig_binning = Binning(data)
rank_data = orig_binning.get_rank_data()
rank_data = orig_binning.rank_data

# plt.figure(1)

@@ -64,7 +65,7 @@ def id_exceeds_experiment(data, method=cst.Method.ORIGINAL, cor_measure=None):
second = []
# iterate over all the dimensions
for curr in range(dim_count):
bin_map = orig_binning.equal_frequency_binning(curr, init_bins_count)
bin_map = orig_binning.equal_frequency_binning_by_rank(curr, init_bins_count)

# distinct bins
dist_bins = bin_map.unique()
13 changes: 7 additions & 6 deletions uds.py
@@ -17,8 +17,9 @@ def compute_cond_CE(data, dim, I, point_ids):


# discretization of the next dimension
def dim_optimal_disc(curr, binning, I, data):
binned_points = binning.equal_frequency_binning2()
def dim_optimal_disc(curr, prev, I, data):
binning = Binning(data, prev, UDS_BETA)
binned_points = binning.equal_frequency_binning_by_rank_int_categories()

# Series with bins support
support = binned_points.value_counts().sort_index().cumsum()
@@ -114,10 +115,9 @@ def compute_uds(data):
es = []
uds = 0
prev = perm[0]
binning = Binning(data, prev, UDS_BETA)
for dim in perm[1:]:
# todo should I pass binning?
scores, discs = dim_optimal_disc(dim, binning, I, data)
scores, discs = dim_optimal_disc(dim, prev, I, data)

# regularization step
opt_cost = None
@@ -148,8 +148,9 @@ def compute_uds(data):


if __name__ == "__main__":
# data = pd.read_csv('data/testdata.csv', delimiter=',', header=None)
data = pd.DataFrame(correlated_data(4000, 20, 10, dg.func2))

data = pd.read_csv('synthetic_cases/uds_test.csv', delimiter=';', header=None)
# data = pd.DataFrame(correlated_data(4000, 20, 10, dg.func2))
# data = pd.DataFrame(generate_correlated_data(1000, 10, 2, func1))
# data = pd.DataFrame(generate_uncorrelated_data(4000, 20))
# classLabels = data.pop(len(data.columns) - 1)
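Note: dim_optimal_disc now constructs its own Binning for the previous dimension (previously a single Binning was created in compute_uds and passed in) and works on integer-labelled, rank-based bins. A small sketch with made-up data and a stand-in value for UDS_BETA:

    import numpy as np
    import pandas as pd

    data = pd.DataFrame({0: np.random.rand(50), 1: np.random.rand(50)})  # hypothetical data
    prev, beta = 0, 5                                                    # beta stands in for UDS_BETA

    rank = data[prev].rank(method='first')
    binned_points = pd.qcut(rank, beta)
    binned_points = binned_points.cat.rename_categories(list(range(beta)))

    # Series with bins support: cumulative number of points up to and including each bin
    support = binned_points.value_counts().sort_index().cumsum()
    print(support)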
