import math

import numpy as np
import pandas as pd

import constants as cst
from correlation_measures.binning import Binning


def compute_permutation(u_CJSs):
    # dimension indices in descending order of their univariate CJS
    argsort = np.argsort(u_CJSs).tolist()
    argsort.reverse()
    return argsort


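# Example (hypothetical values): compute_permutation(pd.Series([0.2, 0.9, 0.5]))
# returns [1, 2, 0], i.e. the dimension indices ordered by descending CJS.

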
def sum_P(bin, maxes):
    # integral of the empirical CDF of each dimension, from its smallest sample
    # value up to the dimension's maximum, via rank-weighted sorted gaps
    maxes_ = maxes.transpose()
    bin_ = bin.reset_index(drop=True)
    count = bin_.shape[0]
    if count == 0:
        # no points: the integral is zero in every dimension
        return pd.Series(0, index=maxes_.columns)
    return (pd.concat([bin_.loc[1:], maxes_], ignore_index=True, axis=0) - bin_).transpose() \
        .dot([i + 1 for i in range(count)]) / count


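# Worked check of sum_P (hypothetical numbers): for a single column [1, 2] with
# max 4 it yields ((2 - 1) * 1 + (4 - 2) * 2) / 2 = 2.5, the area under the
# empirical CDF between the smallest sample and the maximum; the sum telescopes
# to max - mean, so the result does not depend on the sort order.

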
def sum_PlogP(sorted_bin, maxes):
    # integral of F * log2(F) for the empirical CDF F of each dimension,
    # accumulated over the gaps between consecutive sorted sample values
    maxes = maxes.transpose()
    sorted_bin = sorted_bin.reset_index(drop=True)
    count = sorted_bin.shape[0]
    if count == 0:
        # no points: the integral is zero in every dimension
        return pd.Series(0, index=maxes.columns)
    cum = np.array([(i + 1) for i in range(count)])
    return (pd.concat([sorted_bin.loc[1:], maxes], ignore_index=True, axis=0) - sorted_bin).transpose() \
        .dot(cum * (np.log2(cum) - math.log(count, 2))) / count


def sum_PlogPQ(binA, binB, maxes):
    # integral of P * log2((P + Q) / 2) over the merged sample, where P and Q
    # are the empirical CDFs of binA and binB; sample membership is resolved
    # through the (disjoint) indexes of the two bins
    count1 = binA.shape[0]
    count2 = binB.shape[0]
    if count1 == 0:
        return 0
    maxes_ = maxes
    data = pd.concat([binA, binB], axis=0)
    values = []
    ids = []
    for col in data:
        tmp = data[col].sort_values().reset_index()
        values.append(tmp[col])
        ids.append(tmp['index'])
    values = pd.DataFrame(values)
    in_binA = np.array([np.isin(col, binA.index) for col in ids])
    # running counts of binA points and binB points along the merged sorted values
    cum1 = np.cumsum(in_binA, axis=1)
    cum2 = np.cumsum(~in_binA, axis=1)
    diff = pd.concat([values.loc[:, 1:].reset_index(drop=True), maxes_], ignore_index=True,
                     axis=1) - values.reset_index(drop=True)
    log_part = np.log2(cum1 * count2 + cum2 * count1) - math.log(2 * count1 * count2, 2) if count2 != 0 else np.log2(
        cum1) - math.log(2 * count1, 2)
    return (diff * cum1 * log_part).sum(axis=1) / count1


def ind_sort(bin):
    # sort every column independently (row integrity is intentionally lost)
    if bin.empty:
        return bin
    return pd.concat([bin[col].sort_values().reset_index(drop=True) for col in bin], axis=1, ignore_index=True)


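# Example of ind_sort (hypothetical): a frame with rows (3, 1) and (2, 4) becomes
# columns [2, 3] and [1, 4]; each column is sorted on its own, so the original
# row pairing is lost, which is exactly what the per-dimension CDF sums need.

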
def compute_univariate_cjs(binA, binB, maxes):
    '''
    Accepts multi-dimensional bins and returns a Series with the univariate CJS
    for each of the dimensions.
    :param binA: pd.DataFrame
    :param binB: pd.DataFrame
    :param maxes: pd.Series of per-dimension maxima
    :return: pd.Series of univariate CJS values, one per dimension
    '''
    if binA.shape[1] != binB.shape[1]:
        raise ValueError('the compared distributions have a different number of dimensions!')
    # sort the bins in each of the dimensions, destroying the inner integrity
    # - done for the parallel computation over all the dimensions
    ind_sorted_binA = ind_sort(binA)
    ind_sorted_binB = ind_sort(binB)
    maxes = maxes.to_frame()
    CJSs = sum_PlogP(ind_sorted_binA, maxes) \
        - sum_PlogPQ(binA, binB, maxes) \
        + (sum_P(ind_sorted_binB, maxes) - sum_P(ind_sorted_binA, maxes)) / (2 * math.log(2))
    return CJSs


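# As I read the three terms above, compute_univariate_cjs implements the
# cumulative Jensen-Shannon divergence between the empirical CDFs P and Q of
# binA and binB per dimension:
#     CJS(P, Q) = int P log2(2P / (P + Q)) dx + (1 / (2 ln 2)) * int (Q - P) dx
# A minimal usage sketch on hypothetical toy data, wrapped in a function so that
# importing this module stays side-effect free:
def _example_univariate_cjs():
    # the two samples must have disjoint indexes: sum_PlogPQ resolves sample
    # membership through the index
    binA = pd.DataFrame({0: [0.1, 0.2, 0.3], 1: [0.5, 0.6, 0.7]}, index=[0, 1, 2])
    binB = pd.DataFrame({0: [0.6, 0.8, 0.9], 1: [0.1, 0.2, 0.9]}, index=[3, 4, 5])
    return compute_univariate_cjs(binA, binB, pd.Series([1.0, 1.0]))  # one value per dimension

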
def compute_cond_CJS(binA, binB, binA_point_ids, binB_point_ids, I1, I2, maxes, dim):
    if len(I1) != len(I2):
        raise ValueError('I1 and I2 must hold the same number of cells!')
    dim_max = pd.Series(maxes[dim])
    total_binA_points_count = len(binA_point_ids)
    if 0 == total_binA_points_count:
        return 0
    return sum([len(binA_point_ids.intersection(I1[i])) / total_binA_points_count *
                compute_univariate_cjs(binA.loc[binA_point_ids.intersection(I1[i]), dim].to_frame(),
                                       binB.loc[binB_point_ids.intersection(I2[i]), dim].to_frame(), dim_max)[0]
                for i in range(len(I1))])


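# Reading of compute_cond_CJS: the univariate CJS of dimension dim, restricted
# to each cell of the discretization built so far (I1/I2), with every cell's
# contribution weighted by the fraction of binA points that fall into it.

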
def dim_optimal_disc(prev, curr, I1, I2, binA, binB, maxes):
    # equal frequency binning
    global_min = min(binA[prev].min(), binB[prev].min())
    binning = Binning(binA, prev, DEFAULT_BINS_COUNT, global_min)
    points2binA_map = binning.equal_frequency_binning_duplicate_drop()
    points2binB_map = binning.interpolate(binB)
    # cjs discretizations and values
    b1 = []
    b2 = []
    val = []
    # compute conditional cjs
    # ccjs cells
    cond_binsA = []
    cond_binsB = []
    # todo worth considering an implementation of the ufunc in C (slow)
    # conditional CJS (univariate CJS bounded by initial binning cells)
    ccjs = []
    # upper bound of a ccjs cell
    for u in range(binning.bins_count):
        ccjs_row = []
        binA_point_ids_row = []
        binB_point_ids_row = []
        # lower bound of a ccjs cell
        for j in range(u + 1):
            binA_point_ids = points2binA_map[np.logical_and(points2binA_map <= u, points2binA_map >= j)].index
            binB_point_ids = points2binB_map[np.logical_and(points2binB_map <= u, points2binB_map >= j)].index
            binA_point_ids_row.append(binA_point_ids)
            binB_point_ids_row.append(binB_point_ids)
            ccjs_row.append(
                compute_cond_CJS(binA, binB, binA_point_ids, binB_point_ids, I1, I2, maxes, curr))
        ccjs.append(ccjs_row)
        cond_binsA.append(binA_point_ids_row)
        cond_binsB.append(binB_point_ids_row)
        b1.append([[binA_point_ids_row[0]]])
        b2.append([[binB_point_ids_row[0]]])
        val.append([ccjs_row[0]])
    # dynamic programming
    for l in range(1, MAX_BINS):
        for u in range(l, binning.bins_count):
            max_cost = None
            arg_max = None
            for j in range(l - 1, u):
                temp_cost = (u - j) / (u + 1) * ccjs[u][j + 1] + (j + 1) / (u + 1) * val[j][l - 1]
                if max_cost is None or temp_cost > max_cost:
                    max_cost = temp_cost
                    arg_max = j
            val[u].append(max_cost)
            disc1 = b1[arg_max][l - 1].copy()
            disc1.append(cond_binsA[u][arg_max + 1])
            b1[u].append(disc1)
            disc2 = b2[arg_max][l - 1].copy()
            disc2.append(cond_binsB[u][arg_max + 1])
            b2[u].append(disc2)
    best_disc_id = np.argmax(val[-1])
    return val[-1][best_disc_id], b1[-1][best_disc_id], b2[-1][best_disc_id]


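# Note on the recurrence in dim_optimal_disc: val[u][l] is the best weighted
# CJS achievable by grouping the initial cells 0..u into l + 1 macro-bins; a
# cut at j combines the best l-macro-bin solution over cells 0..j,
# val[j][l - 1], with the merged cell (j + 1)..u, ccjs[u][j + 1], each term
# weighted by its share of the u + 1 cells.

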
def extend_I(I, disc):
    # refine the current discretization I with the new one: every existing cell
    # is intersected with every new cell
    disc_ = [i.intersection(j) for i in I for j in disc]
    # return [d for d in disc_ if not d.empty]
    return disc_


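# Example of extend_I (hypothetical index sets): crossing I = [{1, 2}, {3, 4}]
# with disc = [{1, 3}, {2, 4}] yields [{1}, {2}, {3}, {4}]; every cell of the
# existing discretization is split by every cell of the new one.

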
def compute_CJS(binA, binB, maxes):
    '''
    Main entry point: computes the symmetric, normalized CJS between two
    multi-dimensional samples.
    :param binA: pd.DataFrame
    :param binB: pd.DataFrame
    :param maxes: pd.Series of per-dimension maxima
    :return: normalized symmetric CJS
    '''
    if type(maxes) is not pd.Series:
        raise ValueError("maxes should be of pd.Series type!")
    if len(maxes) != len(binA.columns) or len(maxes) != len(binB.columns):
        raise ValueError("For computing CJS, bins should have the same number of dimensions as maxes!")
    # renaming the relevant columns in the basic order
    binA = binA.rename(columns={binA.columns[i]: i for i in range(len(binA.columns))})
    binB = binB.rename(columns={binB.columns[i]: i for i in range(len(binB.columns))})
    # reindexing maxes in the basic order
    maxes = maxes.reset_index(drop=True)
    # fix missing values with the mean value
    fix_missing_values(binA)
    fix_missing_values(binB)
    symm_cjs = _compute_CJS(binA, binB, maxes) + _compute_CJS(binB, binA, maxes)
    maxes_frame = maxes.to_frame()
    normalization = sum(sum_P(binA, maxes_frame)) + sum(sum_P(binB, maxes_frame))
    return symm_cjs / normalization


def fix_missing_values(bin):
    # impute missing values in place with the column means
    bin_means = bin.mean()
    for d in bin.columns:
        bin.loc[pd.isnull(bin[d]), d] = bin_means[d]


def _compute_CJS(binA, binB, maxes):
    global DEFAULT_BINS_COUNT
    # todo per the paper this should be B = int(math.sqrt(binA.shape[0]))
    B = min(int(math.sqrt(binA.shape[0] + binB.shape[0])), cst.MAXMAX)
    DEFAULT_BINS_COUNT = B * cst.CLUMP
    global MAX_BINS
    MAX_BINS = B
    # find the permutation of dimensions in descending order of CJS
    univariate_cjs = compute_univariate_cjs(binA, binB, maxes)
    perm = compute_permutation(univariate_cjs)
    discretizations1 = [binA.index]
    discretizations2 = [binB.index]
    prev = perm[0]
    cjs = univariate_cjs[prev]
    for curr in perm[1:]:
        curr_cjs, disc1, disc2 = dim_optimal_disc(prev, curr, discretizations1, discretizations2, binA, binB, maxes)
        discretizations1 = extend_I(discretizations1, disc1)
        discretizations2 = extend_I(discretizations2, disc2)
        cjs += curr_cjs
        prev = curr
    return cjs


if __name__ == '__main__':
    # smoke test on the bundled dataset
    data = pd.read_csv("synthetic_cases/realKD.txt", delimiter=";", header=None, na_values='?')
    m = data.shape[0]
    attrs = [12, 19, 20]
    binA = data.loc[[i for i in range(200)], attrs]
    binB = data.loc[[i for i in range(200, 400)], attrs]
    print(compute_CJS(binA, binB, binA.max(0)[attrs]))


def compute_CJSs(bin_map, curr, data, dim_maxes):
    # CJS between consecutive bins of dimension curr, computed over all the
    # remaining dimensions
    data_wo_curr = data.copy()
    data_wo_curr.pop(curr)
    dim_maxes = dim_maxes.drop(curr)
    return compute_CJSs1(bin_map, data_wo_curr, dim_maxes)


def compute_CJSs1(bin_map, data_wo_curr, maxes_):
    if data_wo_curr.empty:
        return np.array([0 for i in range(len(bin_map.cat.categories))])
    cjs = []
    # distinct bins
    dist_bins = bin_map.cat.categories
    # CJS between each pair of consecutive bins
    for bin_id, binn in enumerate(dist_bins[1:], start=1):
        bin_data = data_wo_curr.loc[bin_map == binn]
        prev_bin_data = data_wo_curr.loc[bin_map == dist_bins[bin_id - 1]]
        cjs.append(compute_CJS(prev_bin_data, bin_data, maxes_))
    return np.array(cjs)


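# A minimal sketch of how compute_CJSs1 might be driven (hypothetical data and
# binning; assumes the repo's constants and Binning modules are importable):
def _example_compute_cjss1():
    rng = np.random.RandomState(0)
    data = pd.DataFrame(rng.rand(60, 2))
    # categorical map assigning each of the 60 rows to one of three bins
    bin_map = pd.cut(pd.Series(range(60)), bins=3)
    # one CJS value per pair of consecutive bins
    return compute_CJSs1(bin_map, data, data.max())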