# ipd_extended/main.py
import math
import sys
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import constants as cst
import interaction_distance as id
import util
from correlation_measures.binning import Binning
from experiments_logging import write_out_file, write_cut_file
from merging import dynamic_merging


# ------------------------------------------------------
def find_disc_macro_id(disc_macro_intervals, point):
    """Return the id of the macro interval [left, right] that contains the point."""
    for macro_id, (left, right) in disc_macro_intervals.items():
        if left <= point <= right:
            return macro_id
    raise ValueError("Micro bin is not covered by any of the macro intervals!", disc_macro_intervals, point)
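
# A hedged illustration of find_disc_macro_id (the interval values are made up):
# with disc_macro_intervals = {0: [0.0, 0.4], 1: [0.4, 1.0]},
# find_disc_macro_id(disc_macro_intervals, 0.25) returns 0,
# find_disc_macro_id(disc_macro_intervals, 0.7) returns 1, and a point outside
# every interval raises a ValueError.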

# todo: valid only inside compute_optimal_discretization (relies on the global
# 'log' file handle opened in the __main__ block below)
def write(*args):
    log.write(' '.join(str(a) for a in args))
    log.write('\n')

def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure=None):
    # the class labels (last column) are not used by the original IPD algorithm
    class_labels = data.pop(data.shape[1] - 1)
    dim_count = data.shape[1]

    # number of initial dist_bins
    # todo old, remove later
    # init_bins_count = 20
    init_bins_count = int(math.ceil(math.sqrt(data.shape[0])))  # ceil(sqrt(n)), as in the original IPD
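    # Worked example (hypothetical row count): 10,000 rows give
    # ceil(sqrt(10000)) = 100 initial equal-frequency bins per dimension.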
    write('row count:', data.shape[0])
    write('init_bins_count:', init_bins_count)
    write('ID_THRESHOLD_QUANTILE:', cst.ID_THRESHOLD_QUANTILE)

    # normalization step (optional); disabled by default, as it does not influence the results
    # norm_data = data.apply(lambda x: 2 * cst.NORMALIZATION_RADIUS * (x - x.min()) / (
    #     x.max() - x.min()) - cst.NORMALIZATION_RADIUS if x.max() != x.min() else pd.Series(-np.ones(x.shape)))
    norm_data = data

    # dimension maxima
    dim_maxes = norm_data.max(0)
    disc_macro_intervals = []
    disc_points = []

    orig_binning = Binning(norm_data)
    rank_data = orig_binning.get_rank_data()
    # lay the per-dimension ID plots out on a roughly square grid
    height = int(math.sqrt(dim_count))
    width = int(math.ceil(dim_count / height))
    fig, axes = plt.subplots(nrows=height, ncols=width, squeeze=False)
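    # Example: dim_count = 3 gives height = int(sqrt(3)) = 1 and width = ceil(3 / 1) = 3,
    # i.e. a single row of three panels; dim_count = 10 gives a 3 x 4 grid.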

    # iterate over all the dimensions
    for curr in range(dim_count):
        bin_map = orig_binning.equal_frequency_binning(curr, init_bins_count)
        # distinct bins
        dist_bins = bin_map.unique()

        # -----------------------------INTERACTION DISTANCES----------------------------------
        # for each bin along the current dimension, compute the inner measure B and
        # the inter-bin measure (the interaction distance, ID)
        # IDs = id.compute_IDs_extended(bin_map, curr, data, dist_bins, dim_maxes, MAX_SUBSPACE_SIZE)
        IDs = id.compute_IDs(bin_map, curr, norm_data, dist_bins, dim_maxes) if method == cst.Method.ORIGINAL else \
            id.compute_IDs_extended(bin_map, curr, norm_data, dist_bins, dim_maxes, cor_measure, cst.MAX_SUBSPACE_SIZE)
        ID_threshold = id.compute_ID_threshold(IDs)
        # todo ext: compute a sliding average and count ID peaks above it (in a sliding window)
        # ID_peaks = id.compute_sliding_count(IDs, ID_threshold)
        # pd.DataFrame(IDs).to_csv(prefix + "_IDs_" + str(curr) + ".csv")

        # -----------------------------OPTIMAL MERGE STRATEGY----------------------------------
        # cost table F: the value in row i and column j is the cost of merging the
        # first (i + 1) micro bins into (j + 1) macro bins
        F, discretizations = dynamic_merging(ID_threshold, IDs, init_bins_count)
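        # Reading off the optimum (as implied by the cost-table comment above):
        # F[-1] is the row where all init_bins_count micro bins are merged, so
        # np.argmin(F[-1]) picks the cheapest macro bin count, and
        # discretizations[-1][min_id] lists, for each macro bin, the micro bins
        # that were merged into it.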
        # pd.DataFrame(F).to_csv(prefix + "_F_" + str(curr) + ".csv")
        # pd.DataFrame([[[b[-1] for b in k[:-1]] for k in c] for c in discretizations]).to_csv(prefix + "_bp_" + str(curr) + ".csv")
        min_id = np.argmin(F[-1])
        curr_macro_intervals, curr_macro_points = get_discretized_points(curr, data, discretizations, dist_bins,
                                                                         min_id, rank_data)

        ax1 = axes[curr // width, curr % width]
        # ax1.hist(IDs, bins=100, color='c')
        ax1.plot(range(len(IDs)), IDs)
        ax1.axhline(ID_threshold, color='b', linestyle='dashed', linewidth=1)
        ax1.set_title('dimension ' + str(curr))
        # ax2 = axes[(2 * curr + 1) // width, (2 * curr + 1) % width]
        # ax2.plot(sorted(IDs), color='k')
        # ax2.set_title('dimension ' + str(curr))

        write('-------------------------')
        write('dimension:', curr)
        write('ID_threshold:', ID_threshold)
        write('cost:', F[-1, min_id])
        write('number of macrobins:', len(curr_macro_intervals))
        # write('IDs', IDs)

        write('\nIDs between the macrobins:')
        for macro_id, macro_bin in enumerate(discretizations[-1][min_id][:-1]):
            write("{0:.2f}".format(curr_macro_intervals[macro_id][1]) + " -", IDs[macro_bin[-1]], '[q=' +
                  str((sorted(IDs).index(IDs[macro_bin[-1]]) + 1) / len(IDs)) + ']')
            # ax1.axhline(IDs[macro_bin[-1]], color='r', linewidth=1)
            ax1.plot([macro_bin[-1]], [IDs[macro_bin[-1]]], marker='o', markersize=3, color="red")
            # ax2.axvline(IDs[macro_bin[-1]], color='r', linewidth=1)

        write('\nnumber of points per macrobin:')
        for macro_id in curr_macro_intervals:
            write("[" + "{0:.2f}".format(curr_macro_intervals[macro_id][0]) + ",",
                  "{0:.2f}".format(curr_macro_intervals[macro_id][1]) + "]",
                  sum(1 for p in curr_macro_points if p == macro_id))
        write('\n')
        disc_macro_intervals.append(curr_macro_intervals)
        disc_points.append(curr_macro_points)

    plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25, wspace=0.35)
    plt.savefig(out_dir + 'IDs.png', format='png')
    return disc_macro_intervals, disc_points, class_labels
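
# A hedged sketch of compute_optimal_discretization's return value for a
# hypothetical 2-d dataset (all numbers made up):
#   disc_macro_intervals = [{0: [0.0, 0.4], 1: [0.4, 1.0]},    # dimension 0
#                           {0: [-1.0, 0.1], 1: [0.1, 2.3]}]   # dimension 1
#   disc_points = [[0, 0, 1, ...], [1, 0, 0, ...]]  # macro bin id of each point, per dimension
#   class_labels = the original last column as a pandas Series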

def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_data):
    """Translate the optimal merge for dimension `curr` from rank space back to
    macro intervals in the original value space and assign each point to its
    macro bin."""
    disc_macro_intervals = dict()
    for i, macro_bin in enumerate(discretizations[-1][min_id]):
        macro_interval = []
        for micro_bin_id in macro_bin:
            # raw value of the point whose rank matches the micro bin's right edge
            right = data.loc[rank_data[rank_data[curr] == math.floor(dist_bins[micro_bin_id].right)][curr].index[0]][curr]
            if not len(macro_interval):
                # first micro bin: its left edge opens the macro interval
                left = data.loc[rank_data[rank_data[curr] == math.ceil(dist_bins[micro_bin_id].left)][curr].index[0]][curr]
                macro_interval.append(left)
                macro_interval.append(right)
            else:
                macro_interval[1] = right
        disc_macro_intervals[i] = macro_interval

    macro_points = []
    for point in data.iterrows():
        macro_points.append(find_disc_macro_id(disc_macro_intervals, point[1][curr]))
    return disc_macro_intervals, macro_points
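
# Illustration (hypothetical values): if dist_bins[k] is the rank interval
# (0.0, 25.0] and rank 25 belongs to the row whose raw value in dimension
# `curr` is 0.42, the macro interval's right edge becomes 0.42, so all edges
# are expressed in the original value space rather than in ranks.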

if __name__ == "__main__":
    if len(sys.argv) < 2:
        # no arguments given: fall back to the bundled synthetic example
        print('Usage: main.py -f=<data_file> -d=<delimiter> -c=<number of columns> -m=<[original|greedy]> -cor=<[uds]>')
        sys.argv = '-f=synthetic_cases/synthetic_3d_parity_problem.csv -d=;'.split(' ')

    file_arg = list(filter(lambda x: x.startswith("-f="), sys.argv))
    if not file_arg:
        raise ValueError('No data file provided!')
    delim_arg = list(filter(lambda x: x.startswith("-d="), sys.argv))
    columns_arg = list(filter(lambda x: x.startswith("-c="), sys.argv))
    rows_arg = list(filter(lambda x: x.startswith("-r="), sys.argv))
    method_arg = list(filter(lambda x: x.startswith("-m="), sys.argv))
    corr_measure_arg = list(filter(lambda x: x.startswith("-cor="), sys.argv))

    data_file = file_arg[0].replace('-f=', '')
    delimiter = delim_arg[0].replace('-d=', '') if delim_arg else ','
    columns = int(columns_arg[0].replace('-c=', '')) if columns_arg else None
    rows = int(rows_arg[0].replace('-r=', '')) if rows_arg else None
    method = cst.Method[method_arg[0].replace('-m=', '').upper()] if method_arg else cst.Method.ORIGINAL
    cor_measure = cst.CorrelationMeasure[corr_measure_arg[0].replace('-cor=', '').upper()] if corr_measure_arg \
        else cst.CorrelationMeasure.UDS
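
    # Example invocation (the file path is hypothetical; flags as parsed above):
    #   python main.py -f=data/example.csv -d=, -m=original -cor=uds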

    # read the data, with '?' treated as a missing value
    data = pd.read_csv(data_file, delimiter=delimiter, header=None, na_values='?')
    # drop rows that contain missing values
    data = data.dropna(axis=0, how='any')
    if columns:
        # label-based slicing is inclusive, so this keeps columns 0..columns
        data = data.loc[:, :columns]
    if rows:
        data = data[:rows]

    # prefix for the output files
    file_name = util.get_file_name(data_file)
    out_dir = 'logs/' + file_name + "_" + \
              (method.name if method == cst.Method.ORIGINAL else cor_measure.name) + \
              datetime.datetime.now().strftime("_%Y%m%d_%H%M%S") + "/"
    os.makedirs(out_dir)
    print('output files are:', out_dir + '*')

    log_file = out_dir + "log.txt"
    with open(log_file, 'w') as log:
        disc_intervals, disc_points, class_labels = compute_optimal_discretization(data, method, cor_measure)
        # write_out_file(out_dir + cst.FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)
        # write_cut_file(out_dir + cst.FILE_DATA_CUTS, disc_intervals)