import math

import pandas as pd
import numpy as np

from correlation_measures.binning import Binning
from interaction_distance import computeIDs, compute_ID_threshold
from merging import dynamic_merging

# ----------------------CONSTANTS-----------------------

ID_THRESHOLD_QUANTILE = 1.0 / 3
NORMALIZATION_RADIUS = 1
FILE_DATA_OUTPUT = 'out.txt'
FILE_DATA_CUTS = 'cut.txt'

# ------------------------------------------------------
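
# ID_THRESHOLD_QUANTILE is presumably consumed by the imported
# compute_ID_threshold. As a purely illustrative sketch of a quantile-based
# threshold (NOT the actual implementation in interaction_distance.py):
def _sketch_id_threshold(IDs):
    # pick the interaction distance sitting at the configured quantile
    ordered = sorted(IDs)
    return ordered[int(len(ordered) * ID_THRESHOLD_QUANTILE)]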

def find_disc_macro_id(disc_macro_intervals, point):
    # return the id of the macro interval [left, right] that contains the point
    for macro_id, interval in disc_macro_intervals.items():
        if interval[0] <= point <= interval[1]:
            return macro_id
    raise ValueError("Micro bin is not covered by any of the macro intervals!", disc_macro_intervals, point)

def writeOutFile(name, disc_intervals, disc_points, class_labels):
    # write the discretized data set in ARFF format; macro bin ids are shifted
    # by a running counter so that every dimension gets a disjoint value range
    with open(name, 'w') as out:
        out.write('@relation DB\n\n')
        counter = [1]
        for i in range(len(disc_intervals)):
            out.write(
                '@attribute dim' + str(i) + ' {' + ','.join([str(j + counter[-1]) for j in disc_intervals[i]]) + '}\n')
            counter.append(counter[-1] + len(disc_intervals[i]))
        out.write('@attribute class {' + ','.join(['"' + str(i) + '"' for i in class_labels.unique()]) + '}\n\n')
        out.write('@data\n')
        for i in range(len(disc_points[0])):
            for j in range(len(disc_points)):
                out.write(str(disc_points[j][i] + counter[j]))
                out.write(',')
            out.write('"' + str(class_labels[i]) + '"\n')

def writeCutFile(name, disc_intervals):
    # write one block per dimension: a header with the macro bin count,
    # followed by the right endpoint of every macro interval
    with open(name, 'w') as out:
        for i in range(len(disc_intervals)):
            out.write('dimension ' + str(i) + ' (' + str(len(disc_intervals[i])) + ' bins)\n')
            for bin_id in disc_intervals[i]:
                out.write(str(disc_intervals[i][bin_id][1]) + '\n')
            out.write('-------------------------------------\n')
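
# Example cut.txt block for one dimension split into 2 macro bins (endpoint
# values are hypothetical):
#
#   dimension 0 (2 bins)
#   0.47
#   0.98
#   -------------------------------------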

def compute_optimal_discretization(data):
    # class labels are not of much use in original IPD; pop them so that only
    # the numeric dimensions remain, and carry them through to the output file
    class_labels = data.pop(data.shape[1] - 1)

    dim_count = data.shape[1]

    # dimension maximums
    dim_maxes = data.max(0)

    # number of initial dist_bins
    # initBinsCount = int(math.ceil(math.sqrt(row_count)))  # ceil in original ipd...
    # todo remove later
    init_bins_count = 20  # hard-coded for now; original IPD uses ceil(sqrt(row_count))
    print('initBinsCount: ', init_bins_count)

    # normalization step todo(optional)
    # data = data.apply(lambda x: 2 * NORMALIZATION_RADIUS * (x - x.min()) / (
    #     x.max() - x.min()) - NORMALIZATION_RADIUS if x.max() != x.min() else pd.Series(-np.ones(x.shape)))

    disc_macro_intervals = []
    disc_points = []

    orig_binning = Binning(data)
    rank_data = orig_binning.get_rank_data()

    # iterate over all the dimensions
    for curr in range(dim_count):
        # equal-frequency binning into the initial micro bins
        bin_map = orig_binning.equal_frequency_binning(curr, init_bins_count)

        # distinct bins
        dist_bins = bin_map.drop_duplicates().values

        # -----------------------------INTERACTION DISTANCES----------------------------------
        # for each bin along the current dimension compute the inner measure B and the inter measure
        IDs = computeIDs(bin_map, curr, data, dist_bins, dim_maxes)
        ID_threshold = compute_ID_threshold(IDs)
        print('ID_threshold', ID_threshold)

        # -----------------------------OPTIMAL MERGE STRATEGY----------------------------------
        # todo replace by empty method later
        # F is the cost table: the value in the i-th row and j-th column is the
        # cost of merging (i + 1) micro bins into (j + 1) macro bins
        F, discretizations = dynamic_merging(ID_threshold, IDs, init_bins_count)

        print('dimension ' + str(curr))
        # pick the cheapest discretization over all macro bin counts
        min_id = np.argmin(F[-1])
        print('cost ' + str(F[-1, min_id]))
        curr_macro_intervals, curr_macro_points = get_discretized_points(
            curr, data, discretizations, dist_bins, min_id, rank_data)
        print(curr_macro_intervals)
        print(curr_macro_points)
        disc_macro_intervals.append(curr_macro_intervals)
        disc_points.append(curr_macro_points)

    return disc_macro_intervals, disc_points, class_labels
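
# A minimal, self-contained sketch of the shape of the dynamic program behind
# dynamic_merging (the real implementation lives in merging.py; merge_cost is
# a hypothetical stand-in for the actual IPD cost of fusing a run of micro bins):
def _sketch_dynamic_merging(merge_cost, bins_count):
    # F[i][j]: minimal cost of merging the first (i + 1) micro bins into (j + 1) macro bins
    F = [[math.inf] * bins_count for _ in range(bins_count)]
    for i in range(bins_count):
        F[i][0] = merge_cost(0, i)  # a single macro bin covering micro bins 0..i
    for j in range(1, bins_count):
        for i in range(j, bins_count):
            # the last macro bin spans micro bins l + 1 .. i; minimize over the split l
            F[i][j] = min(F[l][j - 1] + merge_cost(l + 1, i) for l in range(j - 1, i))
    return F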

def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_data):
    # translate the optimal merge (a partition of micro bins) back into value
    # intervals in the original data space of dimension curr
    disc_macro_intervals = dict()
    for i, macro_bin in enumerate(discretizations[-1][min_id]):
        macro_interval = []
        for micro_bin_id in macro_bin:
            # right endpoint of the micro bin, looked up via the rank data
            right = \
                data.loc[rank_data[rank_data[curr] == math.floor(dist_bins[micro_bin_id].right)][curr].index[0]][curr]
            if not len(macro_interval):
                # first micro bin in this macro bin: store both endpoints
                macro_interval.append(
                    data.loc[rank_data[rank_data[curr] == math.ceil(dist_bins[micro_bin_id].left)][curr].index[0]][curr])
                macro_interval.append(right)
            else:
                # subsequent micro bins only extend the right endpoint
                macro_interval[1] = right
        disc_macro_intervals[i] = macro_interval

    # map every data point to the id of the macro interval containing it
    macro_points = []
    for point in data.iterrows():
        macro_points.append(find_disc_macro_id(disc_macro_intervals, point[1][curr]))

    return disc_macro_intervals, macro_points
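
# For example, if the best discretization merges micro bins as [[0, 1], [2]],
# the function returns something like ({0: [0.05, 0.61], 1: [0.61, 0.98]},
# [0, 1, 0, ...]): interval endpoints here are hypothetical, and macro_points
# holds one macro bin id per data row.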

# run the discretization end to end on the bundled example data set
data = pd.read_csv('example/simple.csv', delimiter=';', header=None)

disc_intervals, disc_points, class_labels = compute_optimal_discretization(data)

writeOutFile(FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)
writeCutFile(FILE_DATA_CUTS, disc_intervals)