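# Multivariate discretization driven by interaction distances (IDs), in the spirit of the
# "original ipd" method referenced in the comments below: each dimension is split into
# equal-frequency micro bins, IDs between neighbouring bins are computed, and the micro bins
# are merged into macro bins by dynamic programming (merging.dynamic_merging). The script
# logs the chosen cut points and saves a per-dimension plot of the ID curves.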
import datetime
import glob
import math
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import constants as cst
import data_generation as dg
import interaction_distance as id
import util
from correlation_measures.binning import Binning
from experiments_logging import write_out_file, write_cut_file
from merging import dynamic_merging
# ------------------------------------------------------
def find_disc_macro_id(disc_macro_intervals, point):
    # return the id of the macro interval [left, right] that covers the given point
    for macro_id, macro_interval in disc_macro_intervals.items():
        if macro_interval[0] <= point <= macro_interval[1]:
            return macro_id
    raise ValueError("Micro bin is not covered by any of the macro intervals!", disc_macro_intervals, point)
# note: relies on the module-level `log` file handle opened in __main__; only valid while
# compute_optimal_discretization runs inside that `with` block
def write(*args):
    # append a space-separated line to the log file
    log.write(' '.join([str(a) for a in args]))
    log.write('\n')
def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure=None):
    # class labels are not of much use in original ipd, so they are split off here
    class_labels = data.pop(data.shape[1] - 1)
    dim_count = data.shape[1]

    # number of initial micro bins: ceil(sqrt(n)), as in original ipd
    # init_bins_count = 20  # old fixed value, kept for reference
    init_bins_count = int(math.ceil(math.sqrt(data.shape[0])))
    write('row count:', data.shape[0])
    write('init_bins_count:', init_bins_count)
    write('ID_THRESHOLD_QUANTILE:', cst.ID_THRESHOLD_QUANTILE)

    # normalization step (optional): disabled by default, as it does not influence the results
    # norm_data = data.apply(lambda x: 2 * cst.NORMALIZATION_RADIUS * (x - x.min()) / (
    #     x.max() - x.min()) - cst.NORMALIZATION_RADIUS if x.max() != x.min() else pd.Series(-np.ones(x.shape)))
    norm_data = data

    # dimension maximums
    dim_maxes = norm_data.max(0)
    disc_macro_intervals = []
    disc_points = []
    orig_binning = Binning(norm_data)
    rank_data = orig_binning.get_rank_data()

    # grid of ID plots, one panel per dimension
    height = int(math.sqrt(dim_count))
    width = int(math.ceil(dim_count / height))
    fig, axes = plt.subplots(nrows=height, ncols=width, squeeze=False)
    # iterate over all the dimensions
    for curr in range(dim_count):
        bin_map = orig_binning.equal_frequency_binning(curr, init_bins_count)
        # distinct bins
        dist_bins = bin_map.unique()

        # ----------------------------- INTERACTION DISTANCES -----------------------------
        # for each bin along the current dimension compute the inner measure B and the inter measure (ID)
        IDs = id.compute_IDs(bin_map, curr, norm_data, dist_bins, dim_maxes) if method == cst.Method.ORIGINAL else \
            id.compute_IDs_extended(bin_map, curr, norm_data, dist_bins, dim_maxes, cor_measure,
                                    cst.MAX_SUBSPACE_SIZE)
        ID_threshold = id.compute_ID_threshold(IDs)
        # todo ext: compute a sliding average and count ID peaks above it (in a sliding window)
        # ID_peaks = id.compute_sliding_count(IDs, ID_threshold)

        # ----------------------------- OPTIMAL MERGE STRATEGY ----------------------------
        # cost table F: the value in the i-th row and j-th column is the cost of merging
        # the first (i+1) micro bins into (j+1) macro bins
        F, discretizations = dynamic_merging(ID_threshold, IDs, init_bins_count)

        # pick the discretization with the minimal total cost
        min_id = np.argmin(F[-1])
        curr_macro_intervals, curr_macro_points = get_discretized_points(curr, data, discretizations, dist_bins,
                                                                         min_id, rank_data)
        # plot the ID curve for this dimension and mark the threshold
        ax1 = axes[curr // width, curr % width]
        ax1.plot(range(len(IDs)), IDs)
        ax1.axhline(ID_threshold, color='b', linestyle='dashed', linewidth=1)
        ax1.set_title('dimension ' + str(curr))

        write('-------------------------')
        write('dimension:', curr)
        write('ID_threshold:', ID_threshold)
        write('cost:', F[-1, min_id])
        write('number of macrobins:', len(curr_macro_intervals))

        write('\nIDs between the macrobins:')
        for macro_id, macro_bin in enumerate(discretizations[-1][min_id][:-1]):
            write("{0:.2f}".format(curr_macro_intervals[macro_id][1]) + " -", IDs[macro_bin[-1]], '[q=' +
                  str((sorted(IDs).index(IDs[macro_bin[-1]]) + 1) / len(IDs)) + ']')
            # mark the cut point between two macro bins on the ID plot
            ax1.plot([macro_bin[-1]], [IDs[macro_bin[-1]]], marker='o', markersize=3, color="red")

        write('\nnumber of points per macrobin:')
        for macro_id in curr_macro_intervals:
            write("[" + "{0:.2f}".format(curr_macro_intervals[macro_id][0]) + ",",
                  "{0:.2f}".format(curr_macro_intervals[macro_id][1]) + "]",
                  sum([1 for p in curr_macro_points if p == macro_id]))
        write('\n')

        disc_macro_intervals.append(curr_macro_intervals)
        disc_points.append(curr_macro_points)

    plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25, wspace=0.35)
    plt.savefig(dir + 'IDs.png', format='png')
    return disc_macro_intervals, disc_points, class_labels
def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_data):
    # map the chosen discretization (min_id) of dimension curr back to value intervals in the original data
    disc_macro_intervals = dict()
    for i, macro_bin in enumerate(discretizations[-1][min_id]):
        macro_interval = []
        for micro_bin_id in macro_bin:
            # rightmost data value covered by this micro bin
            right = \
                data.loc[rank_data[rank_data[curr] == math.floor(dist_bins[micro_bin_id].right)][curr].index[0]][curr]
            if not len(macro_interval):
                # the first micro bin of the macro bin defines the left border
                macro_interval.append(
                    data.loc[rank_data[rank_data[curr] == math.ceil(dist_bins[micro_bin_id].left)][curr].index[0]][
                        curr])
                macro_interval.append(right)
            else:
                # subsequent micro bins only extend the right border
                macro_interval[1] = right
        disc_macro_intervals[i] = macro_interval

    # assign every data point to the macro interval that covers it
    macro_points = []
    for point in data.iterrows():
        macro_points.append(find_disc_macro_id(disc_macro_intervals, point[1][curr]))
    return disc_macro_intervals, macro_points
if __name__ == "__main__":
    # debug override example (uncomment to run on the bundled synthetic data set):
    # sys.argv = '-f=synthetic_cases/synthetic_3d_parity_problem.csv -d=;'.split(' ')
    if len(sys.argv) < 2:
        print('Usage: main.py -f=<data_file> -d=<delimiter> -c=<number of columns> -r=<number of rows> '
              '-m=<[original|greedy]> -cor=<[uds]>')
        sys.exit(1)

    file_arg = list(filter(lambda x: x.startswith("-f="), sys.argv))
    if not file_arg:
        raise ValueError('No data file provided!')
    delim_arg = list(filter(lambda x: x.startswith("-d="), sys.argv))
    columns_arg = list(filter(lambda x: x.startswith("-c="), sys.argv))
    rows_arg = list(filter(lambda x: x.startswith("-r="), sys.argv))
    method_arg = list(filter(lambda x: x.startswith("-m="), sys.argv))
    corr_measure_arg = list(filter(lambda x: x.startswith("-cor="), sys.argv))

    data_file = file_arg[0].replace('-f=', '')
    delimiter = delim_arg[0].replace('-d=', '') if delim_arg else ','
    columns = int(columns_arg[0].replace('-c=', '')) if columns_arg else None
    rows = int(rows_arg[0].replace('-r=', '')) if rows_arg else None
    method = cst.Method[method_arg[0].replace('-m=', '').upper()] if method_arg else cst.Method.ORIGINAL
    cor_measure = cst.CorrelationMeasure[corr_measure_arg[0].replace('-cor=', '').upper()] if corr_measure_arg \
        else cst.CorrelationMeasure.UDS

    # read data from the file with the given delimiter, treating "?" as NaN
    data = pd.read_csv(data_file, delimiter=delimiter, header=None, na_values='?')
    # drop a data point if it contains inconsistent data
    data = data.dropna(axis=0, how='any')
    if columns:
        data = data.loc[:, :columns]
    if rows:
        data = data[:rows]

    # define the output directory for this run
    file_name = util.get_file_name(data_file)
    dir = 'logs/' + file_name + "_" + \
          (method.name if method == cst.Method.ORIGINAL else cor_measure.name) + \
          datetime.datetime.now().strftime("_%Y%m%d_%H%M%S") + "/"
    os.makedirs(dir)
    print('output files are:', dir + '*')

    log_file = dir + "log.txt"
    with open(log_file, 'w') as log:
        disc_intervals, disc_points, class_labels = compute_optimal_discretization(data, method, cor_measure)

        # write_out_file(dir + cst.FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)
        # write_cut_file(dir + cst.FILE_DATA_CUTS, disc_intervals)
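# Example invocation (a sketch only; it assumes this file is the main.py named in the usage
# string and that the delimiter is quoted so the shell does not interpret ';'):
#   python main.py -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=';' -m=original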