
Commit

cjs - matches the original code for a small number of records, but for larger ones there is a small difference in the results because of different implementations of equal frequency binning
Tatiana Dembelova committed Aug 6, 2017
1 parent 03a37c2 commit 49a3b2f
Showing 6 changed files with 583 additions and 510 deletions.
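
The difference described in the commit message comes from the two equal-frequency schemes touched by this commit: the old path ran pd.qcut over ranks (equal_frequency_binning2 on rank(method='first')), while the new path runs pd.qcut over the raw values with duplicates='drop' (equal_frequency_binning4). A minimal standalone sketch with toy data of my own (not from the repository) showing where the two assignments start to disagree once values repeat:

import pandas as pd

# Toy column with repeated values; ties are exactly where the two schemes differ.
col = pd.Series([1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9])
bins = 4

# Rank-based variant: qcut over rank(method='first') always produces `bins`
# equally filled bins, so tied values can be split across bin boundaries.
by_rank = pd.qcut(col.rank(method='first'), bins).cat.codes

# Value-based variant: qcut over the raw values with duplicate edges dropped
# can merge bins, so tied values never straddle a boundary.
by_value = pd.qcut(col, bins, duplicates='drop').cat.codes

print(pd.DataFrame({'value': col, 'by_rank': by_rank, 'by_value': by_value}))

With few records (and therefore few ties) the two assignments usually coincide, which is consistent with the observation in the commit message.
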
188 changes: 115 additions & 73 deletions cjs.py
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd
import math
import constants as cst

from correlation_measures.binning import Binning

@@ -17,34 +18,34 @@ def sum_P(bin, maxes):
count = bin_.shape[0]

if count == 0:
return 0
return pd.Series(0)

return (pd.concat([bin_.loc[1:], maxes_], ignore_index=True, axis=0) - bin_).transpose() \
.dot([i + 1 for i in range(count)]) / count


def sum_PlogP(bin, maxes):
def sum_PlogP(sorted_bin, maxes):
maxes_ = maxes.transpose()
bin_ = bin.reset_index(drop=True)
bin_ = sorted_bin.reset_index(drop=True)
count = bin_.shape[0]

if count == 0:
return 0
return pd.Series([0 for i in range(maxes.shape[0])])

cum = np.array([(i + 1) for i in range(count)])

return (pd.concat([bin_.loc[1:], maxes_], ignore_index=True, axis=0) - bin_).transpose() \
.dot(cum * (np.log2(cum) - math.log(count, 2))) / count


def sum_PlogPQ(bin1, bin2, maxes):
count1 = bin1.shape[0]
count2 = bin2.shape[0]
def sum_PlogPQ(binA, binB, maxes):
count1 = binA.shape[0]
count2 = binB.shape[0]

if count1 == 0:
return 0
maxes_ = maxes
data = pd.concat([bin1, bin2], axis=0)
data = pd.concat([binA, binB], axis=0)
values = []
ids = []
for col in data:
@@ -53,9 +54,9 @@ def sum_PlogPQ(bin1, bin2, maxes):
ids.append(tmp['index'])
values = pd.DataFrame(values)

in_bin1 = np.array([np.in1d(col, bin1.index) for col in ids])
cum1 = np.cumsum(in_bin1, axis=1)
cum2 = np.cumsum(np.negative(in_bin1), axis=1)
in_binA = np.array([np.in1d(col, binA.index) for col in ids])
cum1 = np.cumsum(in_binA, axis=1)
cum2 = np.cumsum(np.negative(in_binA), axis=1)

diff = pd.concat([values.loc[:, 1:].reset_index(drop=True), maxes_], ignore_index=True,
axis=1) - values.reset_index(drop=True)
@@ -65,92 +66,108 @@


def ind_sort(bin):
if bin.empty:
return bin
return pd.concat([bin[col].sort_values().reset_index(drop=True) for col in bin], axis=1, ignore_index=True)


def compute_univariate_cjs(bin1, bin2, maxes):
if bin1.shape[1] != bin2.shape[1]:
raise ValueError
sorted_bin1 = ind_sort(bin1)
sorted_bin2 = ind_sort(bin2)
CJSs = sum_PlogP(sorted_bin1, maxes) \
- sum_PlogPQ(bin1, bin2, maxes) \
+ (sum_P(sorted_bin2, maxes) - sum_P(sorted_bin1, maxes)) / (2 * math.log(2))
def compute_univariate_cjs(binA, binB, maxes):
'''
accepts bins of many dimensions, it returns a series of univariate CJS for each of the dimensions
:param binA:
:param binB:
:param maxes:
:return:
'''
if binA.shape[1] != binB.shape[1]:
raise ValueError('the compared distributions have different number of dimensions!')
# sort the bins in each of the dimensions destroying the inner integrity
# - done for the parallel computation for all the dimensions
ind_sorted_binA = ind_sort(binA)
ind_sorted_binB = ind_sort(binB)

CJSs = sum_PlogP(ind_sorted_binA, maxes) \
- sum_PlogPQ(binA, binB, maxes) \
+ (sum_P(ind_sorted_binB, maxes) - sum_P(ind_sorted_binA, maxes)) / (2 * math.log(2))
return CJSs


def compute_cond_CJS(bin1, bin2, bin1_point_ids, bin2_point_ids, I1, I2, maxes, dim):
def compute_cond_CJS(binA, binB, binA_point_ids, binB_point_ids, I1, I2, maxes, dim):
if len(I1) != len(I2):
raise ValueError

maxes_ = maxes.loc[dim].to_frame()
totalPointsCount = bin1.shape[0]
return sum([len(I1[i]) / totalPointsCount *
compute_univariate_cjs(bin1.loc[bin1_point_ids.intersection(I1[i]), dim].to_frame(),
bin2.loc[bin2_point_ids.intersection(I2[i]), dim].to_frame(), maxes_)[0]
total_binA_points_count = len(binA_point_ids)
if 0 == total_binA_points_count:
return 0
return sum([len(binA_point_ids.intersection(I1[i])) / total_binA_points_count *
compute_univariate_cjs(binA.loc[binA_point_ids.intersection(I1[i]), dim].to_frame(),
binB.loc[binB_point_ids.intersection(I2[i]), dim].to_frame(), maxes_)[0]
for i in range(len(I1))])


def dim_optimal_disc(prev, curr, I1, I2, bin1, bin2, maxes):
b1 = []
val = []

def dim_optimal_disc(prev, curr, I1, I2, binA, binB, maxes):
# equal frequency binning
binning = Binning(bin1)
point_bin1_map = binning.equal_frequency_binning2(prev, DEFAULT_BIN_COUNT)
global_min = min(bin1[prev].min(), bin2[prev].min())
bounds = binning.get_bounds(point_bin1_map, prev, global_min)
bin_count = len(bounds) - 1
point_bin2_map = binning.interpolate(bin2, prev, bin_count, bounds)
global_min = min(binA[prev].min(), binB[prev].min())
binning = Binning(binA, prev, DEFAULT_BINS_COUNT, global_min)
points2binA_map = binning.equal_frequency_binning4()
points2binB_map = binning.interpolate(binB)

# cjs discretizations and values
b1 = []
b2 = []
val = []

# compute conditional cjs
cond_bins1 = []
cond_bins2 = []

# ccjs cells
cond_binsA = []
cond_binsB = []
# todo worth considering implementation of the ufunc in C (slow)

# conditional CJS (univariate CJS bounded by initial binning cells)
ccjs = []

# upper bound of a ccjs cell
for i in range(bin_count):
for u in range(binning.bins_count):
ccjs_row = []
bin1_point_ids_row = []
bin2_point_ids_row = []
binA_point_ids_row = []
binB_point_ids_row = []
# lower bound of a ccjs cell
for j in range(i + 1):
bin1_point_ids = point_bin1_map[np.logical_and(point_bin1_map <= i, point_bin1_map >= j)].index
bin2_point_ids = point_bin2_map[np.logical_and(point_bin2_map <= i, point_bin2_map >= j)].index
bin1_point_ids_row.append(bin1_point_ids)
bin2_point_ids_row.append(bin2_point_ids)
for j in range(u + 1):
binA_point_ids = points2binA_map[np.logical_and(points2binA_map <= u, points2binA_map >= j)].index
binB_point_ids = points2binB_map[np.logical_and(points2binB_map <= u, points2binB_map >= j)].index
binA_point_ids_row.append(binA_point_ids)
binB_point_ids_row.append(binB_point_ids)

ccjs_row.append(
compute_cond_CJS(bin1, bin2, bin1_point_ids, bin2_point_ids, I1, I2, maxes, curr))
compute_cond_CJS(binA, binB, binA_point_ids, binB_point_ids, I1, I2, maxes, curr))
ccjs.append(ccjs_row)
cond_bins1.append(bin1_point_ids_row)
cond_bins2.append(bin2_point_ids_row)
cond_binsA.append(binA_point_ids_row)
cond_binsB.append(binB_point_ids_row)

b1.append([[bin1_point_ids_row[0]]])
b2.append([[bin2_point_ids_row[0]]])
b1.append([[binA_point_ids_row[0]]])
b2.append([[binB_point_ids_row[0]]])
val.append([ccjs_row[0]])

# dynamic programming
for l in range(1, bin_count):
for i in range(l, bin_count):
for l in range(1, MAX_BINS):
for u in range(l, binning.bins_count):
max_cost = None
arg_max = None
for j in range(l - 1, i):
temp_cost = (i - j) / i * ccjs[i][j + 1] + j / i * val[j][l - 1]
for j in range(l - 1, u):
temp_cost = (u - j) / (u + 1) * ccjs[u][j + 1] + (j + 1) / (u + 1) * val[j][l - 1]
if not max_cost or temp_cost > max_cost:
max_cost = temp_cost
arg_max = j

val[i].append(max_cost)
val[u].append(max_cost)
disc1 = b1[arg_max][l - 1].copy()
disc1.append(cond_bins1[i][arg_max + 1])
b1[i].append(disc1)
disc1.append(cond_binsA[u][arg_max + 1])
b1[u].append(disc1)
disc2 = b2[arg_max][l - 1].copy()
disc2.append(cond_bins2[i][arg_max + 1])
b2[i].append(disc2)
disc2.append(cond_binsB[u][arg_max + 1])
b2[u].append(disc2)

best_disc_id = np.argmax(val[-1])

@@ -160,41 +177,66 @@ def dim_optimal_disc(prev, curr, I1, I2, bin1, bin2, maxes):
def extend_I(I, disc):
disc_ = [i.intersection(j) for i in I for j in disc]

return [d for d in disc_ if not d.empty]
# return [d for d in disc_ if not d.empty]
return disc_


def compute_CJS(init_binA, init_binB, maxes):
# renaming relevant columns in the basic order
binA = init_binA.rename(columns={init_binA.columns[i]: i for i in range(len(init_binA.columns))})
binB = init_binB.rename(columns={init_binB.columns[i]: i for i in range(len(init_binB.columns))})

# fix missing values with mean value
fix_missing_values(binA)
fix_missing_values(binB)

symm_cjs = _compute_CJS(binA, binB, maxes) + _compute_CJS(binB, binA, maxes)
normalization = sum(sum_P(binA, maxes)) + sum(sum_P(binB, maxes))
return symm_cjs / normalization


def fix_missing_values(bin):
bin_means = bin.mean()
for d in bin.columns:
bin.loc[pd.isnull(bin[d]), d] = bin_means[d]


def _compute_CJS(binA, binB, maxes):
global DEFAULT_BINS_COUNT
# todo fix B = int(math.sqrt(binA.shape[0]) - by paper
B = min(int(math.sqrt(binA.shape[0] + binB.shape[0])), cst.MAXMAX)
DEFAULT_BINS_COUNT = B * cst.CLUMP
global MAX_BINS
MAX_BINS = B

def compute_CJS(bin1, bin2, maxes):
bin1_ = bin1.rename(columns={bin1.columns[i]: i for i in range(len(bin1.columns))})
bin2_ = bin2.rename(columns={bin2.columns[i]: i for i in range(len(bin2.columns))})
global DEFAULT_BIN_COUNT
DEFAULT_BIN_COUNT = int(math.sqrt(bin1_.shape[0]) * 2 + 1)
# find the permutation
univariate_cjs = compute_univariate_cjs(bin1_, bin2_, maxes)
# find the permutation of dimensions in descending order of CJS
univariate_cjs = compute_univariate_cjs(binA, binB, maxes)
perm = compute_permutation(univariate_cjs)

discretizations1 = [bin1_.index]
discretizations2 = [bin2_.index]
discretizations1 = [binA.index]
discretizations2 = [binB.index]
prev = perm[0]
cjs = univariate_cjs[prev]

for curr in perm[1:]:
curr_cjs, disc1, disc2 = dim_optimal_disc(prev, curr, discretizations1, discretizations2, bin1_, bin2_, maxes)
curr_cjs, disc1, disc2 = dim_optimal_disc(prev, curr, discretizations1, discretizations2, binA, binB, maxes)

discretizations1 = extend_I(discretizations1, disc1)
discretizations2 = extend_I(discretizations2, disc2)
cjs += curr_cjs
prev = curr
return cjs


if __name__ == '__main__':
data = pd.read_csv("synthetic_cases/realKD.txt", delimiter=";", header=None, na_values='?')

m = data.shape[0]
half = int(m / 2)
bin1 = data[:half]
bin2 = data[half:]
attrs = [12, 19, 20]
binA = data.loc[[i for i in range(200)], attrs]
binB = data.loc[[i for i in range(200, 400)], attrs]

print(compute_CJS(bin1, bin2, pd.DataFrame(np.max(data).transpose())))
print(str(compute_CJS(binA, binB, pd.DataFrame(np.max(data[attrs]).transpose().reset_index(drop=True)))))


def compute_CJSs(bin_map, curr, data, dist_bins, dim_maxes):
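
A minimal usage sketch of the reworked compute_CJS on toy data of my own (the sample sizes, the 0.5 shift, and the variable names are assumptions; it also assumes the rest of cjs.py, e.g. compute_permutation, is importable as in the __main__ block above). The main point is the expected shape of the maxes argument: a (d x 1) frame whose index matches the 0..d-1 column numbering that compute_CJS applies internally.

import numpy as np
import pandas as pd

import cjs

rng = np.random.RandomState(1)
binA = pd.DataFrame(rng.rand(200, 2))         # two toy 2-dimensional samples
binB = pd.DataFrame(rng.rand(200, 2) + 0.5)   # shifted copy of the same shape

# Per-dimension maxima over both samples, indexed 0..d-1 like the renamed columns.
both = pd.concat([binA, binB], axis=0)
dim_maxes = pd.DataFrame(both.max()).reset_index(drop=True)

# Symmetrized, normalized CJS; a larger value should indicate a larger
# difference between the two distributions.
print(cjs.compute_CJS(binA, binB, dim_maxes))
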
6 changes: 5 additions & 1 deletion constants.py
@@ -27,4 +27,8 @@ class CorrelationMeasure(Enum):
# subspace mining parameters
MAX_SUBSPACE_SIZE = 5
HETEROGENEOUS_THRESHOLD=0.8
BEAM_WIDTH=3
BEAM_WIDTH=3

# cjs
CLUMP = 2
MAXMAX = 5
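
For orientation, a small worked example (my own numbers, using the 200 + 200 rows sampled in cjs.py's __main__ above) of how these two constants bound the discretization in the new _compute_CJS:

import math

CLUMP = 2    # constants introduced by this commit
MAXMAX = 5

n_A, n_B = 200, 200                          # rows in binA and binB
B = min(int(math.sqrt(n_A + n_B)), MAXMAX)   # sqrt(400) = 20, capped at MAXMAX -> 5
MAX_BINS = B                                 # at most 5 merged bins per dimension
DEFAULT_BINS_COUNT = B * CLUMP               # 10 initial equal-frequency cells to merge from
print(MAX_BINS, DEFAULT_BINS_COUNT)          # -> 5 10
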
59 changes: 43 additions & 16 deletions correlation_measures/binning.py
@@ -2,37 +2,64 @@


class Binning:
def __init__(self, data):
def __init__(self, data, dim=None, desired_bins_count=None, global_min=None):
self.desired_bins_count = desired_bins_count if desired_bins_count is None or data.shape[0] > desired_bins_count\
else data.shape[0]
self.dim = dim
self.data = data
self.rank_data = data.rank(method='first')
self.global_min = global_min


# todo old (small reminder) in the original ipd it is NOT equal binning
# Series of binned points (with dropDuplicates produces not equally frequent bins)
def equal_frequency_binning(self, dim, bins_count):
return pd.qcut(self.rank_data.sort_values(by=dim)[dim], bins_count)

def equal_frequency_binning2(self, dim, bins_count):
qcut = pd.qcut(self.rank_data.sort_values(by=dim)[dim], bins_count)
qcut = qcut.cat.rename_categories([i for i in range(bins_count)]).reindex(qcut.index)
return qcut
def equal_frequency_binning2(self):
qcut = pd.qcut(self.rank_data.sort_values(by=self.dim)[self.dim], self.desired_bins_count)
self.qcut = qcut.cat.rename_categories([i for i in range(self.desired_bins_count)]).reindex(qcut.index)
self.bins_count = self.desired_bins_count
return self.qcut

def equal_frequency_binning4(self):
qcut = pd.qcut(self.data[self.dim], self.desired_bins_count, duplicates='drop')
qcut = qcut.cat.remove_unused_categories()
bounds = [c.right for c in qcut.cat.categories]
# including global_min with a margin of 1
bounds.insert(0, self.global_min - 1)
self.bounds = pd.Series(bounds)

def equal_frequency_binning3(self, dim, bins_count):
qcut = pd.qcut(self.data[dim], bins_count, duplicates='drop')
qcut = qcut.cat.rename_categories([i for i in range(bins_count)]).reindex(qcut.index)
return qcut
self.bins_count = len(qcut.cat.categories)
self.qcut = qcut.cat.rename_categories([i for i in range(self.bins_count)]).reindex(qcut.index)
return self.qcut

def get_bounds(self, bin_points, dim, global_min):
groupby = bin_points.reset_index().groupby(0)
def equal_frequency_binning3(self, dim, desired_bins_count):
qcut = pd.qcut(self.data[dim], desired_bins_count, duplicates='drop')
self.qcut = qcut.cat.rename_categories([i for i in range(desired_bins_count)]).reindex(qcut.index)
return self.qcut

return pd.Series(pd.unique(pd.concat([pd.Series(global_min - 1),
self.data.loc[groupby.last()['index'], dim]], axis=0))) \
def get_bounds(self, global_min):
groupby = self.qcut.reset_index().groupby(0)

self.bounds = pd.Series(pd.unique(pd.concat([pd.Series(global_min - 1),
self.data.loc[groupby.last()['index'], self.dim]], axis=0))) \
.reset_index(drop=True)
return self.bounds


def get_rank_data(self):
return self.rank_data

def interpolate(self, data, dim, bins_count, bounds):
data_ = pd.cut(data[dim], bounds)
return data_.cat.rename_categories([i for i in range(bins_count)]).reindex(data_.index)
def interpolate(self, other_bin):
if self.bounds is None:
raise ValueError('No bounds!')
other_col = other_bin[self.dim]
if max(other_col) > self.bounds.max():
self.bounds = self.bounds.append(pd.Series(max(other_col)), ignore_index=True)
self.qcut.cat.add_categories(self.bins_count, inplace=True)
self.bins_count += 1

data_ = pd.cut(other_col, self.bounds)
return data_.cat.rename_categories([i for i in range(self.bins_count)]).reindex(data_.index)
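
A minimal usage sketch of the reworked Binning API on toy random data (the column name, sample sizes, and bin count are my own; the call sequence mirrors how dim_optimal_disc in cjs.py drives the class after this commit): bin the first sample by equal frequency, then reuse the stored bounds to place the second sample into the same cells.

import numpy as np
import pandas as pd

from correlation_measures.binning import Binning

rng = np.random.RandomState(0)
binA = pd.DataFrame({0: rng.rand(50)})   # two toy one-column samples
binB = pd.DataFrame({0: rng.rand(50)})

dim = 0
global_min = min(binA[dim].min(), binB[dim].min())

binning = Binning(binA, dim, desired_bins_count=10, global_min=global_min)
points2binA = binning.equal_frequency_binning4()   # bins binA and stores the bounds
points2binB = binning.interpolate(binB)            # maps binB onto the same bounds

print(binning.bins_count)
print(points2binA.value_counts().sort_index())
print(points2binB.value_counts().sort_index())
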

