Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ipd_extended/main.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
320 lines (263 sloc)
13.5 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import sys | |
import datetime | |
import time | |
# todo fix for server push | |
import matplotlib | |
matplotlib.use('Agg') | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
import os | |
import re | |
import constants as cst | |
import data_generation as dg | |
import interaction_distance as id | |
import subspace_mining as sm | |
import util | |
from correlation_measures.binning import Binning | |
from experiments_logging import write_out_file, write_cut_file | |
from merging import dynamic_merging | |
import cjs | |
# ------------------------------------------------------ | |
def find_disc_macro_id(disc_macro_intervals, point):
    """Return the id of the macro interval that covers *point*.

    *disc_macro_intervals* maps macro-bin id -> [left, right]; the first
    interval (in dict order) satisfying left <= point <= right wins.
    Raises ValueError when no interval covers the point.
    """
    for macro_id, interval in disc_macro_intervals.items():
        if interval[0] <= point <= interval[1]:
            return macro_id
    raise ValueError("Micro bin is not covered by any of the macro intervals!", disc_macro_intervals, point)
def write(log, *args):
    """Write the space-joined string forms of *args* to *log* plus a newline;
    when *log* is None (or falsy) print to stdout instead."""
    message = ' '.join(str(arg) for arg in args)
    if log:
        log.write(message)
        log.write('\n')
    else:
        print(message)
def plot_distances(dir, distances, disc_intervals):
    """Render one subplot per dimension with its distance curve, the ID
    threshold (dashed horizontal line) and the macro-interval cut points
    (red vertical lines), then save the figure to <dir>distances.png.

    *distances* is a list of [x_values, dist_values] pairs, one per dimension;
    *disc_intervals* is the per-dimension macro-interval mapping.
    """
    dim_count = len(distances)
    plt.figure(1)
    # Arrange subplots in a near-square grid.
    height = int(math.sqrt(dim_count))
    width = int(math.ceil(dim_count / height))
    fig, axes = plt.subplots(nrows=height, ncols=width, squeeze=False)
    for dim, (x_values, dist_values) in enumerate(distances):
        threshold = id.compute_ID_threshold(dist_values)
        axis = axes[dim // width, dim % width]
        axis.set_ylim([0, 0.1])
        axis.plot(x_values, dist_values)
        axis.axhline(threshold, color='b', linestyle='dashed', linewidth=1)
        intervals = disc_intervals[dim]
        # Draw a cut at the left edge of every macro interval except the first.
        for interval_id in range(1, len(intervals)):
            axis.axvline(intervals[interval_id][0], color='r')
        axis.set_title('dimension ' + str(dim))
    plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25, wspace=0.35)
    plt.savefig(dir + 'distances.png', format='png')
def compute_distances(bin_map, curr, data, dim_maxes,
                      cor_measure, method, distance_measure,
                      k=cst.MAX_SUBSPACE_SIZE,
                      delta=cst.HETEROGENEOUS_THRESHOLD,
                      beam_width=cst.BEAM_WIDTH):
    """Compute the distance curve (ID or CJS) for dimension *curr*.

    With Method.ORIGINAL the distances are computed against the full data;
    for every other method the configured subspace-mining strategy first
    selects a subspace (at most *k* dimensions) and the distances are
    computed inside that projection only.
    """
    use_id = distance_measure == cst.DistanceMeasure.ID
    if method == cst.Method.ORIGINAL:
        if use_id:
            return id.compute_IDs(bin_map, curr, data, dim_maxes)
        return cjs.compute_CJSs(bin_map, curr, data, dim_maxes)

    # Dispatch table: one subspace miner per method.
    miners = {
        cst.Method.GREEDY_TOPK: lambda: sm.greedy_topk(data, curr, k, cor_measure),
        cst.Method.HET_GREEDY_TOPK: lambda: sm.het_greedy_topk(data, curr, k, delta, cor_measure),
        cst.Method.BEST_FIRST: lambda: sm.best_first(data, curr, k, cor_measure),
        cst.Method.BEAM_SEARCH: lambda: sm.beam_search(data, curr, k, beam_width, cor_measure),
        cst.Method.HET_BEAM_SEARCH: lambda: sm.het_beam_search(data, curr, k, beam_width, delta, cor_measure),
    }
    if method not in miners:
        raise ValueError("there is no such method!")
    subspace = miners[method]()
    # todo the rest of the methods
    projected = data.copy().loc[:, subspace]
    projected_maxes = dim_maxes[subspace]
    if use_id:
        return id.compute_IDs1(bin_map, projected, projected_maxes)
    return cjs.compute_CJSs1(bin_map, projected, projected_maxes)
def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure=None,
                                   distance_measure=cst.DistanceMeasure.ID, log=None):
    """Discretize every dimension of *data* by merging equal-frequency micro
    bins into macro bins via dynamic programming (IPD).

    Parameters:
        data: pandas DataFrame; its last column is popped off as class labels.
        method: cst.Method selecting the subspace-mining strategy (ORIGINAL
            computes distances against all other dimensions).
        cor_measure: correlation measure forwarded to the subspace miners;
            unused when method is ORIGINAL.
        distance_measure: cst.DistanceMeasure (ID or CJS).
        log: writable file-like object; when None messages go to stdout.

    Returns:
        (disc_macro_intervals, disc_points, class_labels, distancez) where
        disc_macro_intervals[d] maps macro-bin id -> [left, right] for
        dimension d, disc_points[d] holds each point's macro-bin id,
        class_labels is the popped last column, and distancez holds the
        per-dimension (x-values, distance-values) curves used for plotting.
    """
    start = time.time()
    # class labels are not of much use in original ipd..
    class_labels = data.pop(data.shape[1] - 1)
    dim_count = data.shape[1]
    # number of initial dist_bins
    # todo old remove later
    # init_bins_count = 20 # ceil in original ipd...
    init_bins_count = int(math.ceil(math.sqrt(data.shape[0])))  # ceil in original ipd...
    write(log, 'row count:', data.shape[0])
    write(log, 'init_bins_count:', init_bins_count)
    write(log, 'ID_THRESHOLD_QUANTILE:', cst.ID_THRESHOLD_QUANTILE)
    # normalization step todo old (optional)
    # todo old by default the normalization is optional as it does not influence on the results
    # norm_data = data.apply(lambda x: 2 * cst.NORMALIZATION_RADIUS * (x - x.min()) / (
    #     x.max() - x.min()) - cst.NORMALIZATION_RADIUS if x.max() != x.min() else pd.Series(-np.ones(x.shape)))
    norm_data = data
    # dimension maximums
    dim_maxes = norm_data.max(0)
    disc_macro_intervals = []  # per-dimension macro-interval dicts
    disc_points = []           # per-dimension macro-bin id of every point
    distancez = []             # per-dimension (x, distance) curves for plotting
    # iterate over all the dimensions
    for curr in range(dim_count):
        binning = Binning(norm_data, curr, init_bins_count)
        bin_map = binning.equal_frequency_binning_by_rank()
        dist_bins = bin_map.cat.categories
        # -----------------------------INTERACTION DISTANCES----------------------------------
        distances = compute_distances(bin_map, curr, norm_data, dim_maxes, cor_measure, method,
                                      distance_measure)
        # todo python361
        # distancez.append([[data.loc[rank_data[rank_data[curr] == math.floor(dist_bins[i].right)].index.tolist()[0], curr] for i in
        #                    range(len(distances))], distances])
        # todo python342
        # The bin categories are strings here (older pandas); the regex pulls
        # the right endpoint out of a "(left, right]" interval string, then the
        # rank lookup maps it back to an actual data value for the x-axis.
        distancez.append([[data.loc[binning.rank_data[binning.rank_data[curr]
                                                      == math.floor(float(re.search(', (-*\d+\.*\d*e*-*\d*)',
                                                                                    dist_bins[i]).group(1)))]
                           .index.tolist()[0], curr] for i in range(len(distances))], distances])
        ID_threshold = id.compute_ID_threshold(distances)
        # todo ext compute sliding average and count ID peaks above the avg (in a sliding window)
        # ID_peaks = id.compute_sliding_count(distances, ID_threshold)
        # pd.DataFrame(distances).to_csv(prefix + "_IDs_" + str(curr) + ".csv")
        # -----------------------------OPTIMAL MERGE STRATEGY----------------------------------
        # table with costs, the value in i-th row and j-th column means cost of (i+1) micro bins merged into (j+1)
        # macro bins
        F, discretizations = dynamic_merging(ID_threshold, distances, init_bins_count)
        # pd.DataFrame(F).to_csv(prefix + "_F_" + str(curr) + ".csv")
        # pd.DataFrame([[[b[-1] for b in k[:-1]] for k in c] for c in discretizations]).to_csv(prefix + "_bp_" + str(curr) + ".csv")
        # Cheapest number of macro bins for the full set of micro bins.
        min_id = np.argmin(F[-1])
        (curr_macro_intervals, curr_macro_points) = get_discretized_points(curr, data, discretizations,
                                                                           dist_bins, min_id, binning.rank_data)
        write(log, '-------------------------')
        write(log, 'dimension:', curr)
        write(log, 'ID_threshold:', ID_threshold)
        write(log, 'cost:', F[-1, min_id])
        write(log, 'number of macrobins:', len(curr_macro_intervals))
        # write(log, 'distances', distances)
        write(log, '\ndistances between the macrobins:')
        # Report, for each macro-bin boundary, the distance at the boundary and
        # its quantile rank among all distances of this dimension.
        for macro_id, macro_bin in enumerate(discretizations[-1][min_id][:-1]):
            write(log, "{0:.2f}".format(curr_macro_intervals[macro_id][1]) + " -", distances[macro_bin[-1]], '[q=' +
                  str((sorted(distances).index(distances[macro_bin[-1]]) + 1) / len(distances)) + ']')
        write(log, '\nnumber of points per macrobin:')
        for macro_id in curr_macro_intervals:
            write(log, "[" + "{0:.2f}".format(curr_macro_intervals[macro_id][0]) + ",",
                  "{0:.2f}".format(curr_macro_intervals[macro_id][1]) + "]",
                  sum([1 for p in curr_macro_points if p == macro_id]))
        write(log, '\n')
        disc_macro_intervals.append(curr_macro_intervals)
        disc_points.append(curr_macro_points)
    end = time.time()
    write(log, end - start, 'seconds')
    return disc_macro_intervals, disc_points, class_labels, distancez
def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_data):
    """Translate the chosen micro-bin discretization of dimension *curr* into
    actual data-value intervals and assign each data point its macro-bin id.

    Parameters:
        curr: index of the dimension being discretized.
        data: original DataFrame (class column already popped).
        discretizations: DP table of candidate discretizations; the optimal
            one is discretizations[-1][min_id].
        dist_bins: micro-bin categories; strings like "(left, right]" here
            (older pandas), parsed with regexes below.
        min_id: index of the cheapest discretization.
        rank_data: per-dimension rank frame used to map bin edges (which are
            ranks) back to data values.

    Returns:
        (disc_macro_intervals, macro_points): macro-bin id -> [left, right]
        value interval, and the macro-bin id for every row of *data*.
    """
    disc_macro_intervals = dict()
    for i, macro_bin in enumerate(discretizations[-1][min_id]):
        macro_interval = []
        for micro_bin_id in macro_bin:
            # todo python361
            # right = \
            #     data.loc[rank_data[rank_data[curr] == math.floor(dist_bins[micro_bin_id].right)][curr].index[0]][curr]
            # if not len(macro_interval):
            #     macro_interval.append(
            #         data.loc[rank_data[rank_data[curr] == math.ceil(dist_bins[micro_bin_id].left)][curr].index[0]][
            #             curr])
            #     macro_interval.append(right)
            # todo python342
            # Extract the right endpoint (a rank) from the "(left, right]"
            # category string and map it to the data value at that rank.
            right = \
                data.loc[rank_data[rank_data[curr] == math.floor(float(re.search(', (-*\d+\.*\d*e*-*\d*)',
                                                                                 dist_bins[micro_bin_id]).group(1)))][
                    curr].index[0]][curr]
            if not len(macro_interval):
                # First micro bin of this macro bin: also extract the left
                # endpoint so the interval becomes [left_value, right_value].
                macro_interval.append(
                    data.loc[rank_data[rank_data[curr] == math.ceil(float(re.search('(-*\d+\.*\d*e*-*\d*),',
                                                                                    dist_bins[micro_bin_id]).group(
                        1)))][curr].index[0]][
                        curr])
                macro_interval.append(right)
            else:
                # Subsequent micro bins only push the right edge outward.
                macro_interval[1] = right
        disc_macro_intervals[i] = macro_interval
    macro_points = []
    # Assign every data point the id of the macro interval that contains it.
    for point in data.iterrows():
        macro_points.append(find_disc_macro_id(disc_macro_intervals, point[1][curr]))
    return disc_macro_intervals, macro_points
if __name__ == "__main__":
    # ---------------------------------------------------------------- CLI ----
    # Arguments are "-key=value" tokens. With no arguments a default synthetic
    # dataset is used so the script can be smoke-tested directly.
    if len(sys.argv) == 1:
        print(
            'Usage: main.py -f=<data_file> -d=<delimiter> -c=<number of columns> -m=<[original|greedy_topk]> -cor=<[uds]> '
            '-dist=<[id, cjs]> -t=<float>')
        command = '-f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID'
        print('Running default: ', command)
        command_list = command.split(' ')
    else:
        command_list = sys.argv[1:]

    file_arg = list(filter(lambda x: x.startswith("-f="), command_list))
    if not file_arg:
        raise ValueError('No data file provided!')
    delim_arg = list(filter(lambda x: x.startswith("-d="), command_list))
    columns_arg = list(filter(lambda x: x.startswith("-c="), command_list))
    rows_arg = list(filter(lambda x: x.startswith("-r="), command_list))
    method_arg = list(filter(lambda x: x.startswith("-m="), command_list))
    corr_measure_arg = list(filter(lambda x: x.startswith("-cor="), command_list))
    distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list))
    threshold_arg = list(filter(lambda x: x.startswith("-t="), command_list))

    data_file = file_arg[0].replace('-f=', '')
    if delim_arg:
        delimiter = delim_arg[0].replace('-d=', '')
    else:
        print('using default delimiter ;')
        delimiter = ';'
    columns = int(columns_arg[0].replace('-c=', '')) if columns_arg else None
    rows = int(rows_arg[0].replace('-r=', '')) if rows_arg else None
    if method_arg:
        method = cst.Method[method_arg[0].replace('-m=', '').upper()]
    else:
        print('using default method ORIGINAL')
        method = cst.Method.ORIGINAL
    cor_measure = cst.CorrelationMeasure[corr_measure_arg[0].replace('-cor=', '').upper()] if corr_measure_arg \
        else None
    # Every non-ORIGINAL method runs a subspace miner, which needs a measure.
    if method is not cst.Method.ORIGINAL and cor_measure is None:
        raise ValueError('A correlation measure should be given!')
    if distance_measure_arg:
        distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()]
    else:
        print('using default distance measure ID')
        distance_measure = cst.DistanceMeasure.ID
    if threshold_arg:
        cst.ID_THRESHOLD_QUANTILE = float(threshold_arg[0].replace('-t=', ''))
    else:
        print('using default ID_THRESHOLD_QUANTILE = ', str(cst.ID_THRESHOLD_QUANTILE))

    # ---------------------------------------------------------------- data ----
    # reading data from the file with delimiter and NaN values as "?"
    data = pd.read_csv(data_file, delimiter=delimiter, header=None, na_values='?')
    # drop a data point if it contains inconsistent data
    data = data.dropna(axis=0, how='any')
    if columns:
        # NOTE(review): .loc label-slicing is inclusive, so this keeps columns
        # 0..columns (i.e. `columns` feature columns plus one more for the
        # class label) — confirm that is the intent of -c=.
        data = data.loc[:, :columns]
    if rows:
        data = data[:rows]

    # ---------------------------------------------------------- output dir ----
    # Timestamped run directory encoding the configuration.
    # (Renamed from `dir`, which shadowed the builtin.)
    data_file_name = util.get_file_name(data_file)
    out_dir = 'logs3/' + datetime.datetime.now().strftime("%Y%m%d_%H%M%S") \
              + "_" + distance_measure.name \
              + ("_" + cor_measure.name if cor_measure else "") \
              + "_" + method.name \
              + "_" + str(cst.ID_THRESHOLD_QUANTILE) \
              + "_" + data_file_name \
              + ("_" + str(columns) if columns else "") \
              + ("_" + str(rows) if rows else "") \
              + "/"
    os.makedirs(out_dir)
    print('output files are:', out_dir + '*')
    log_file = out_dir + "log.txt"
    try:
        with open(log_file, 'w') as log:
            disc_intervals, disc_points, class_labels, distances = compute_optimal_discretization(data, method,
                                                                                                  cor_measure,
                                                                                                  distance_measure, log)
            plot_distances(out_dir, distances, disc_intervals)
            write_out_file(out_dir + cst.FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)
            write_cut_file(out_dir + cst.FILE_DATA_CUTS, disc_intervals)
    except BaseException:
        # Was a bare `except:`; BaseException keeps the original semantics
        # (log the failing run's directory for any failure, then re-raise)
        # while making the catch-everything intent explicit.
        print("Error in " + out_dir + ":", sys.exc_info()[0])
        raise