ipd_extended/cjs.py

import math

import numpy as np
import pandas as pd

import constants as cst
from correlation_measures.binning import Binning


def compute_permutation(u_CJSs):
    # dimension ids ordered by descending univariate CJS
    argsort = np.argsort(u_CJSs).tolist()
    argsort.reverse()
    return argsort


def sum_P(bin_df, maxes):
    # integral of the empirical CDF per dimension:
    # sum_i (x_(i+1) - x_(i)) * i / n with x_(n+1) = max,
    # which telescopes to max - mean (so the row order does not matter)
    maxes_ = maxes.transpose()
    bin_ = bin_df.reset_index(drop=True)
    count = bin_.shape[0]
    if count == 0:
        return pd.Series(0)
    return (pd.concat([bin_.loc[1:], maxes_], ignore_index=True, axis=0) - bin_).transpose() \
        .dot([i + 1 for i in range(count)]) / count


def sum_PlogP(sorted_bin, maxes):
    # integral of P * log2(P) per dimension, where P is the empirical CDF
    # of the independently sorted bin
    maxes = maxes.transpose()
    sorted_bin = sorted_bin.reset_index(drop=True)
    count = sorted_bin.shape[0]
    if count == 0:
        # one zero per dimension
        return pd.Series(0.0, index=maxes.columns)
    cum = np.arange(1, count + 1)
    return (pd.concat([sorted_bin.loc[1:], maxes], ignore_index=True, axis=0) - sorted_bin).transpose() \
        .dot(cum * (np.log2(cum) - math.log(count, 2))) / count


def sum_PlogPQ(binA, binB, maxes):
    # integral of P * log2((P + Q) / 2) per dimension, where P and Q are the
    # empirical CDFs of binA and binB (their indices must be disjoint)
    count1 = binA.shape[0]
    count2 = binB.shape[0]
    if count1 == 0:
        return 0
    maxes_ = maxes
    data = pd.concat([binA, binB], axis=0)
    values = []
    ids = []
    for col in data:
        tmp = data[col].sort_values().reset_index()
        values.append(tmp[col])
        ids.append(tmp['index'])
    values = pd.DataFrame(values)
    # membership of each sorted point in binA (relies on disjoint indices)
    in_binA = np.array([np.isin(col, binA.index) for col in ids])
    cum1 = np.cumsum(in_binA, axis=1)
    cum2 = np.cumsum(np.logical_not(in_binA), axis=1)
    diff = pd.concat([values.loc[:, 1:].reset_index(drop=True), maxes_], ignore_index=True,
                     axis=1) - values.reset_index(drop=True)
    if count2 != 0:
        log_part = np.log2(cum1 * count2 + cum2 * count1) - math.log(2 * count1 * count2, 2)
    else:
        log_part = np.log2(cum1) - math.log(2 * count1, 2)
    return (diff * cum1 * log_part).sum(axis=1) / count1
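
# Putting the three helpers together, compute_univariate_cjs below evaluates
# what appears to be the cumulative Jensen-Shannon divergence in closed form
# (my reading of the code, not taken from an external reference):
#
#   CJS(P || Q) = int P*log2(P) dx - int P*log2((P + Q) / 2) dx
#                 + (int Q dx - int P dx) / (2 * ln 2)
#
# with P, Q the empirical CDFs of binA and binB: sum_PlogP gives the first
# integral, sum_PlogPQ the second, and sum_P the plain CDF integrals.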


def ind_sort(bin_df):
    # sort every column independently, deliberately destroying row integrity,
    # so that all dimensions can be processed at once
    if bin_df.empty:
        return bin_df
    return pd.concat([bin_df[col].sort_values().reset_index(drop=True) for col in bin_df], axis=1, ignore_index=True)


def compute_univariate_cjs(binA, binB, maxes):
    '''
    Accepts bins of arbitrary dimensionality and returns a series with the
    univariate CJS of each dimension.
    :param binA: pd.DataFrame
    :param binB: pd.DataFrame, its index disjoint from binA's
    :param maxes: pd.Series of per-dimension maxima
    :return: pd.Series of univariate CJS values, one per dimension
    '''
    if binA.shape[1] != binB.shape[1]:
        raise ValueError('the compared distributions have a different number of dimensions!')
    # sort the bins in each of the dimensions, destroying the inner integrity
    # - done for the parallel computation over all the dimensions
    ind_sorted_binA = ind_sort(binA)
    ind_sorted_binB = ind_sort(binB)
    maxes = maxes.to_frame()
    CJSs = sum_PlogP(ind_sorted_binA, maxes) \
           - sum_PlogPQ(binA, binB, maxes) \
           + (sum_P(ind_sorted_binB, maxes) - sum_P(ind_sorted_binA, maxes)) / (2 * math.log(2))
    return CJSs
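
# A minimal usage sketch (hypothetical data, not part of the original module);
# note the disjoint indices, which sum_PlogPQ relies on when pooling both bins:
#
#   binA = pd.DataFrame({0: [0.1, 0.4, 0.7]}, index=[0, 1, 2])
#   binB = pd.DataFrame({0: [0.2, 0.5, 0.9]}, index=[3, 4, 5])
#   maxes = pd.concat([binA, binB]).max()
#   compute_univariate_cjs(binA, binB, maxes)  # one CJS value per column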


def compute_cond_CJS(binA, binB, binA_point_ids, binB_point_ids, I1, I2, maxes, dim):
    # conditional CJS of dimension `dim`: a weighted sum of univariate CJS
    # values over the cells I1 / I2 of the previously discretized dimensions
    if len(I1) != len(I2):
        raise ValueError('I1 and I2 must have the same number of cells!')
    max_ = pd.Series(maxes[dim])
    total_binA_points_count = len(binA_point_ids)
    if 0 == total_binA_points_count:
        return 0
    return sum([len(binA_point_ids.intersection(I1[i])) / total_binA_points_count *
                compute_univariate_cjs(binA.loc[binA_point_ids.intersection(I1[i]), dim].to_frame(),
                                       binB.loc[binB_point_ids.intersection(I2[i]), dim].to_frame(), max_)[0]
                for i in range(len(I1))])


def dim_optimal_disc(prev, curr, I1, I2, binA, binB, maxes):
    # equal-frequency binning of the previously processed dimension
    global_min = min(binA[prev].min(), binB[prev].min())
    binning = Binning(binA, prev, DEFAULT_BINS_COUNT, global_min)
    points2binA_map = binning.equal_frequency_binning_duplicate_drop()
    points2binB_map = binning.interpolate(binB)
    # cjs discretizations and values
    b1 = []
    b2 = []
    val = []
    # compute conditional cjs
    # ccjs cells
    cond_binsA = []
    cond_binsB = []
    # todo: worth considering an implementation of the ufunc in C (slow)
    # conditional CJS (univariate CJS bounded by the initial binning cells)
    ccjs = []
    # upper bound of a ccjs cell
    for u in range(binning.bins_count):
        ccjs_row = []
        binA_point_ids_row = []
        binB_point_ids_row = []
        # lower bound of a ccjs cell
        for j in range(u + 1):
            binA_point_ids = points2binA_map[np.logical_and(points2binA_map <= u, points2binA_map >= j)].index
            binB_point_ids = points2binB_map[np.logical_and(points2binB_map <= u, points2binB_map >= j)].index
            binA_point_ids_row.append(binA_point_ids)
            binB_point_ids_row.append(binB_point_ids)
            ccjs_row.append(
                compute_cond_CJS(binA, binB, binA_point_ids, binB_point_ids, I1, I2, maxes, curr))
        ccjs.append(ccjs_row)
        cond_binsA.append(binA_point_ids_row)
        cond_binsB.append(binB_point_ids_row)
        # base case: all micro-bins up to u merged into a single macro-bin
        b1.append([[binA_point_ids_row[0]]])
        b2.append([[binB_point_ids_row[0]]])
        val.append([ccjs_row[0]])
    # dynamic programming
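    # Recurrence, as read off the loop below: val[u][l] is the best weighted
    # conditional CJS when the first u + 1 micro-bins are merged into l + 1
    # macro-bins, and j ranges over the position of the last cut:
    #   val[u][l] = max_j [ (u - j) / (u + 1) * ccjs[u][j + 1]
    #                       + (j + 1) / (u + 1) * val[j][l - 1] ]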
    for l in range(1, MAX_BINS):
        for u in range(l, binning.bins_count):
            max_cost = None
            arg_max = None
            for j in range(l - 1, u):
                temp_cost = (u - j) / (u + 1) * ccjs[u][j + 1] + (j + 1) / (u + 1) * val[j][l - 1]
                # `is None` rather than truthiness, so a cost of 0.0 is kept
                if max_cost is None or temp_cost > max_cost:
                    max_cost = temp_cost
                    arg_max = j
            val[u].append(max_cost)
            disc1 = b1[arg_max][l - 1].copy()
            disc1.append(cond_binsA[u][arg_max + 1])
            b1[u].append(disc1)
            disc2 = b2[arg_max][l - 1].copy()
            disc2.append(cond_binsB[u][arg_max + 1])
            b2[u].append(disc2)
    best_disc_id = np.argmax(val[-1])
    return val[-1][best_disc_id], b1[-1][best_disc_id], b2[-1][best_disc_id]


def extend_I(I, disc):
    # cross-product refinement: intersect every existing cell with every new bin
    disc_ = [i.intersection(j) for i in I for j in disc]
    # return [d for d in disc_ if not d.empty]
    return disc_
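
# E.g. with I = [i1, i2] and disc = [d1, d2], the refined cells are
# [i1 & d1, i1 & d2, i2 & d1, i2 & d2] (pandas Index intersections).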


def compute_CJS(binA, binB, maxes):
    '''
    Main entry point: computes the normalized, symmetric multivariate CJS
    between the two bins.
    :param binA: pd.DataFrame
    :param binB: pd.DataFrame, its index disjoint from binA's
    :param maxes: pd.Series of per-dimension maxima
    :return: normalized symmetric CJS
    '''
    if type(maxes) is not pd.Series:
        raise ValueError("maxes should be of pd.Series type!")
    if len(maxes) != len(binA.columns) or len(maxes) != len(binB.columns):
        raise ValueError("For computing CJS, bins should have the same number of dimensions as maxes!")
    # renaming the relevant columns in the basic order 0..d-1
    binA = binA.rename(columns={binA.columns[i]: i for i in range(len(binA.columns))})
    binB = binB.rename(columns={binB.columns[i]: i for i in range(len(binB.columns))})
    # reindexing maxes in the basic order
    maxes = maxes.reset_index(drop=True)
    # replace missing values with the per-dimension mean
    fix_missing_values(binA)
    fix_missing_values(binB)
    symm_cjs = _compute_CJS(binA, binB, maxes) + _compute_CJS(binB, binA, maxes)
    maxes_frame = maxes.to_frame()
    normalization = sum(sum_P(binA, maxes_frame)) + sum(sum_P(binB, maxes_frame))
    return symm_cjs / normalization
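
# A minimal usage sketch (hypothetical data, not part of the original module):
#
#   rng = np.random.RandomState(0)
#   binA = pd.DataFrame(rng.rand(100, 2), index=range(0, 100))
#   binB = pd.DataFrame(rng.rand(100, 2) + 0.5, index=range(100, 200))
#   maxes = pd.concat([binA, binB]).max()
#   compute_CJS(binA, binB, maxes)  # symmetric CJS, normalized by the CDF mass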


def fix_missing_values(bin_df):
    # replace NaNs with the per-dimension mean, in place
    bin_means = bin_df.mean()
    for d in bin_df.columns:
        bin_df.loc[pd.isnull(bin_df[d]), d] = bin_means[d]


def _compute_CJS(binA, binB, maxes):
    global DEFAULT_BINS_COUNT
    # todo: fix B = int(math.sqrt(binA.shape[0])) - as in the paper
    B = min(int(math.sqrt(binA.shape[0] + binB.shape[0])), cst.MAXMAX)
    DEFAULT_BINS_COUNT = B * cst.CLUMP
    global MAX_BINS
    MAX_BINS = B
    # find the permutation of dimensions in descending order of univariate CJS
    univariate_cjs = compute_univariate_cjs(binA, binB, maxes)
    perm = compute_permutation(univariate_cjs)
    discretizations1 = [binA.index]
    discretizations2 = [binB.index]
    prev = perm[0]
    cjs = univariate_cjs[prev]
    # greedy chain over the dimensions: each one is discretized optimally,
    # conditioned on the discretization of the dimensions processed so far
    for curr in perm[1:]:
        curr_cjs, disc1, disc2 = dim_optimal_disc(prev, curr, discretizations1, discretizations2, binA, binB, maxes)
        discretizations1 = extend_I(discretizations1, disc1)
        discretizations2 = extend_I(discretizations2, disc2)
        cjs += curr_cjs
        prev = curr
    return cjs


if __name__ == '__main__':
    data = pd.read_csv("synthetic_cases/realKD.txt", delimiter=";", header=None, na_values='?')
    m = data.shape[0]
    attrs = [12, 19, 20]
    binA = data.loc[[i for i in range(200)], attrs]
    binB = data.loc[[i for i in range(200, 400)], attrs]
    print(compute_CJS(binA, binB, binA.max(0)[attrs]))


def compute_CJSs(bin_map, curr, data, dim_maxes):
    # CJS between consecutive bins of dimension `curr`, computed over all the
    # remaining dimensions
    data_wo_curr = data.copy()
    data_wo_curr.pop(curr)
    dim_maxes = dim_maxes.drop(curr)
    return compute_CJSs1(bin_map, data_wo_curr, dim_maxes)


def compute_CJSs1(bin_map, data_wo_curr, maxes_):
    if data_wo_curr.empty:
        # note: this shortcut yields one entry per category, while the loop
        # below yields len(categories) - 1 values
        return np.array([0 for i in range(len(bin_map.cat.categories))])
    cjs = []
    # distinct bins
    dist_bins = bin_map.cat.categories
    for bin_id, binn in enumerate(dist_bins[1:], start=1):
        bin_data = data_wo_curr.loc[bin_map == binn]
        prev_bin_data = data_wo_curr.loc[bin_map == dist_bins[bin_id - 1]]
        cjs.append(compute_CJS(prev_bin_data, bin_data, maxes_))
    return np.array(cjs)
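
# A minimal usage sketch for compute_CJSs (hypothetical data; bin_map is
# assumed to be a categorical pd.Series aligned with data's index, e.g. the
# output of pd.cut on the current dimension):
#
#   data = pd.DataFrame(np.random.rand(300, 3))
#   bin_map = pd.cut(data[0], bins=3)
#   compute_CJSs(bin_map, 0, data, data.max())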