Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ipd_extended/main.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
141 lines (104 sloc)
5.35 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import pandas as pd | |
import numpy as np | |
from correlation_measures.binning import Binning | |
from interaction_distance import computeIDs, compute_ID_threshold | |
from merging import dynamic_merging | |
# ----------------------CONSTANTS-----------------------
# Quantile used to derive the interaction-distance (ID) threshold from the IDs.
ID_THRESHOLD_QUANTILE = 1.0 / 3
# Half-width of the target range for the (currently disabled) normalization step.
NORMALIZATION_RADIUS = 1
# Output file names: discretized data (ARFF-style) and the cut points per dimension.
FILE_DATA_OUTPUT = "out.txt"
FILE_DATA_CUTS = 'cut.txt'
# ------------------------------------------------------
def find_disc_macro_id(disc_macro_intervals, point):
    """Return the id of the macro interval that contains *point*.

    disc_macro_intervals maps macro-bin id -> (lower, upper); bounds are
    inclusive on both ends. Raises ValueError when no interval covers the
    point (i.e. the macro intervals do not tile the data range).
    """
    for macro_id, (lower, upper) in disc_macro_intervals.items():
        if lower <= point <= upper:
            return macro_id
    raise ValueError("Micro bin is not covered by any of the macro intervals!", disc_macro_intervals, point)
def writeOutFile(name, disc_intervals, disc_points, class_labels):
    """Write the discretized data set to file *name* in ARFF format.

    disc_intervals: per dimension, a mapping macro-bin-id -> interval.
    disc_points:    per dimension, the macro-bin id of every data point.
    class_labels:   pandas Series with one class label per data point.

    Macro-bin ids are 0-based within each dimension; an offset (counter[d])
    is added so that ids written to the file are globally unique across
    dimensions.
    """
    with open(name, 'w') as out:
        out.write('@relation DB\n\n')
        # counter[d] is the id offset applied to dimension d's bins
        counter = [1]
        for i in range(len(disc_intervals)):
            out.write(
                '@attribute dim' + str(i) + ' {' + ','.join([str(j + counter[-1]) for j in disc_intervals[i]]) + '}\n')
            counter.append(counter[-1] + len(disc_intervals[i]))
        out.write('@attribute class {' + ','.join(['"' + str(i) + '"' for i in class_labels.unique()]) + '}\n\n')
        out.write('@data\n')
        # one data row per point: shifted bin id for every dimension, then the class
        for i in range(len(disc_points[0])):
            for j in range(len(disc_points)):
                out.write(str(disc_points[j][i] + counter[j]))
                out.write(',')
            out.write('"' + str(class_labels[i]) + '"\n')
def writeCutFile(name, disc_intervals):
    """Write, per dimension, the bin count and each macro bin's right cut point.

    disc_intervals: per dimension, a mapping macro-bin-id -> [lower, upper];
    only the upper (right) edge of each bin is written, one per line.
    """
    with open(name, 'w') as out:
        for dim in range(len(disc_intervals)):
            out.write('dimension ' + str(dim) + ' (' + str(len(disc_intervals[dim])) + ' bins)\n')
            # `bin_id` (not `bin`) to avoid shadowing the builtin
            for bin_id in disc_intervals[dim]:
                out.write(str(disc_intervals[dim][bin_id][1]) + '\n')
            out.write('-------------------------------------\n')
def compute_optimal_discretization(data):
    """Discretize every dimension of *data* (a pandas DataFrame).

    The last column of *data* is treated as class labels and removed
    (the DataFrame is mutated via `pop`). For each remaining dimension,
    points are first split into equal-frequency micro bins, interaction
    distances (IDs) between adjacent bins are computed, and micro bins are
    merged into macro bins by dynamic programming, choosing the merge count
    with minimal cost.

    Returns (disc_macro_intervals, disc_points, class_labels) where
    disc_macro_intervals[d] maps macro-bin id -> [lower, upper] and
    disc_points[d] lists the macro-bin id of every point in dimension d.
    """
    # class labels are not of much use in original ipd..
    class_labels = data.pop(data.shape[1] - 1)
    dim_count = data.shape[1]

    # dimension maximums
    dim_maxes = data.max(0)

    # number of initial dist_bins
    # initBinsCount = int(math.ceil(math.sqrt(row_count)))  # ceil in original ipd...
    # todo remove later -- hard-coded for experimentation
    init_bins_count = 20  # ceil in original ipd...
    print('initBinsCount: ', init_bins_count)

    # normalization step todo(optional)
    # data = data.apply(lambda x: 2 * NORMALIZATION_RADIUS * (x - x.min()) / (
    #     x.max() - x.min()) - NORMALIZATION_RADIUS if x.max() != x.min() else pd.Series(-np.ones(x.shape)))

    disc_macro_intervals = []
    disc_points = []

    orig_binning = Binning(data)
    rank_data = orig_binning.get_rank_data()

    # iterate over all the dimensions
    for curr in range(dim_count):
        bin_map = orig_binning.equal_frequency_binning(curr, init_bins_count)
        # distinct bins
        dist_bins = bin_map.drop_duplicates().values

        # -----------------------------INTERACTION DISTANCES----------------------------------
        # for each bin along the current dimension compute inner measure B and inter measure
        IDs = computeIDs(bin_map, curr, data, dist_bins, dim_maxes)
        ID_threshold = compute_ID_threshold(IDs)
        print('ID_threshold', ID_threshold)

        # -----------------------------OPTIMAL MERGE STRATEGY----------------------------------
        # todo replace by empty method later
        # F: cost table; value in row i, column j is the cost of merging
        # (i+1) micro bins into (j+1) macro bins
        F, discretizations = dynamic_merging(ID_threshold, IDs, init_bins_count)

        print('dimension ' + str(curr))
        # pick the macro-bin count with the lowest total cost
        min_id = np.argmin(F[-1])
        print('cost ' + str(F[-1, min_id]))

        (curr_macro_intervals, curr_macro_points) = get_discretized_points(curr, data, discretizations, dist_bins,
                                                                           min_id, rank_data)
        print(curr_macro_intervals)
        print(curr_macro_points)

        disc_macro_intervals.append(curr_macro_intervals)
        disc_points.append(curr_macro_points)
    return disc_macro_intervals, disc_points, class_labels
def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_data):
    """Map the chosen discretization back to original data values.

    curr:            index of the current dimension.
    discretizations: DP table of candidate discretizations; the chosen one is
                     discretizations[-1][min_id], a list of macro bins, each a
                     collection of micro-bin ids.
    dist_bins:       distinct micro bins; each entry exposes .left/.right rank
                     edges (presumably pandas Interval objects — TODO confirm).
    rank_data:       per-dimension ranks of the points, aligned with *data*.

    Returns (disc_macro_intervals, macro_points): the macro intervals in
    original data coordinates (dict id -> [lower, upper]) and, for every data
    point, the id of the macro interval containing it.
    """
    disc_macro_intervals = dict()
    for i, macro_bin in enumerate(discretizations[-1][min_id]):
        macro_interval = []
        for micro_bin_id in macro_bin:
            # data value of the point whose rank equals the micro bin's right edge
            right = \
                data.loc[rank_data[rank_data[curr] == math.floor(dist_bins[micro_bin_id].right)][curr].index[0]][curr]
            if not len(macro_interval):
                # first micro bin in this macro bin: initialise [left, right]
                macro_interval.append(
                    data.loc[rank_data[rank_data[curr] == math.ceil(dist_bins[micro_bin_id].left)][curr].index[0]][
                        curr])
                macro_interval.append(right)
            else:
                # subsequent micro bins only push the right edge outwards
                macro_interval[1] = right
        disc_macro_intervals[i] = macro_interval

    # assign every point to the macro interval that covers its value
    macro_points = []
    for point in data.iterrows():
        macro_points.append(find_disc_macro_id(disc_macro_intervals, point[1][curr]))
    return disc_macro_intervals, macro_points
def main():
    """Script entry point: discretize the example data set and write the
    ARFF output and the cut-point file."""
    data = pd.read_csv('example/simple.csv', delimiter=';', header=None)
    disc_intervals, disc_points, class_labels = compute_optimal_discretization(data)
    writeOutFile(FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)
    writeCutFile(FILE_DATA_CUTS, disc_intervals)


if __name__ == '__main__':
    main()