import math
import sys
import datetime
import time
# use the non-interactive Agg backend so plots can be saved on a headless server
# todo fix for server push
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import re
import constants as cst
import data_generation as dg
import interaction_distance as id
import subspace_mining as sm
import util
from correlation_measures.binning import Binning
from experiments_logging import write_out_file, write_cut_file
from merging import dynamic_merging
import cjs
# ------------------------------------------------------
def find_disc_macro_id(disc_macro_intervals, point):
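    """Return the id of the macro interval [low, high] that contains the given point value."""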
    for macro_id, interval in disc_macro_intervals.items():
        if interval[0] <= point <= interval[1]:
            return macro_id
    raise ValueError("Micro bin is not covered by any of the macro intervals!", disc_macro_intervals, point)


def write(log, *args):
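    """Write the space-joined arguments as one line to the log file, or print them to stdout if no log is given."""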
    joined = ' '.join([str(a) for a in args])
    if not log:
        print(joined)
    else:
        log.write(joined)
        log.write('\n')


def plot_distances(dir, distances):
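    """Plot the distance curve of every dimension in a grid of subplots, mark its ID
    threshold as a dashed horizontal line, and save the figure as distances.png in dir."""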
    dim_count = len(distances)
    height = int(math.sqrt(dim_count))
    width = int(math.ceil(dim_count / height))
    fig, axes = plt.subplots(nrows=height, ncols=width, squeeze=False)
    for curr, dist in enumerate(distances):
        ID_threshold = id.compute_ID_threshold(dist[1])
        ax1 = axes[curr // width, curr % width]
        ax1.set_ylim([0, 0.1])
        # ax1.hist(distances, bins=100, color='c')
        ax1.plot(dist[0], dist[1])
        ax1.axhline(ID_threshold, color='b', linestyle='dashed', linewidth=1)
        ax1.set_title('dimension ' + str(curr))
    plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25, wspace=0.35)
    plt.savefig(dir + 'distances.png', format='png')


def compute_distances(bin_map, curr, data, dim_maxes,
                      cor_measure, method, distance_measure,
                      k=cst.MAX_SUBSPACE_SIZE,
                      delta=cst.HETEROGENEOUS_THRESHOLD,
                      beam_width=cst.BEAM_WIDTH):
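    """Compute the distances between consecutive micro bins of dimension curr.

    With the ORIGINAL method the distances are computed on the full space; every other
    method first mines a subspace around curr (greedy top-k, best-first or beam search,
    each with an optional heterogeneous variant) and computes the distances on that
    subspace only. The distance measure is either ID or CJS.
    """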
    if method == cst.Method.ORIGINAL:
        return id.compute_IDs(bin_map, curr, data, dim_maxes) if distance_measure == cst.DistanceMeasure.ID \
            else cjs.compute_CJSs(bin_map, curr, data, dim_maxes)

    if method == cst.Method.GREEDY_TOPK:
        subspace = sm.greedy_topk(data, curr, k, cor_measure)
    elif method == cst.Method.HET_GREEDY_TOPK:
        subspace = sm.het_greedy_topk(data, curr, k, delta, cor_measure)
    elif method == cst.Method.BEST_FIRST:
        subspace = sm.best_first(data, curr, k, cor_measure)
    elif method == cst.Method.BEAM_SEARCH:
        subspace = sm.beam_search(data, curr, k, beam_width, cor_measure)
    elif method == cst.Method.HET_BEAM_SEARCH:
        subspace = sm.het_beam_search(data, curr, k, beam_width, delta, cor_measure)
    else:
        raise ValueError("Unknown method: " + str(method))
    # todo the rest of the methods

    # restrict the data to the mined subspace before computing the distances
    data = data.copy().loc[:, subspace]
    dim_maxes = dim_maxes[subspace]
    return id.compute_IDs1(bin_map, data, dim_maxes) if distance_measure == cst.DistanceMeasure.ID \
        else cjs.compute_CJSs1(bin_map, data, dim_maxes)


def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure=None,
                                   distance_measure=cst.DistanceMeasure.ID, log=None):
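    """Discretize every dimension of the data independently: split it into roughly
    sqrt(n) equal-frequency micro bins, compute the distances between consecutive
    bins, and merge the micro bins into macro bins with the dynamic-programming
    merge strategy, keeping the cheapest discretization. Returns the macro intervals
    and per-point macro bin ids per dimension, the class labels (the last column,
    which is popped), and the distance curves.

    A typical call (mirroring the __main__ block below):
        disc_intervals, disc_points, labels, distances = compute_optimal_discretization(data)
    """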
    start = time.time()
    # class labels are not of much use in original ipd..
    class_labels = data.pop(data.shape[1] - 1)
    dim_count = data.shape[1]

    # number of initial dist_bins
    # todo old remove later
    # init_bins_count = 20  # ceil in original ipd...
    init_bins_count = int(math.ceil(math.sqrt(data.shape[0])))  # ceil in original ipd...
    write(log, 'row count:', data.shape[0])
    write(log, 'init_bins_count:', init_bins_count)
    write(log, 'ID_THRESHOLD_QUANTILE:', cst.ID_THRESHOLD_QUANTILE)

    # normalization step todo old (optional)
    # todo old by default the normalization is optional, as it does not influence the results
    # norm_data = data.apply(lambda x: 2 * cst.NORMALIZATION_RADIUS * (x - x.min()) / (
    #     x.max() - x.min()) - cst.NORMALIZATION_RADIUS if x.max() != x.min() else pd.Series(-np.ones(x.shape)))
    norm_data = data

    # dimension maximums
    dim_maxes = norm_data.max(0)
    disc_macro_intervals = []
    disc_points = []
    distancez = []

    # iterate over all the dimensions
    for curr in range(dim_count):
        binning = Binning(norm_data, curr, init_bins_count)
        bin_map = binning.equal_frequency_binning_by_rank()
        dist_bins = bin_map.cat.categories

        # -----------------------------INTERACTION DISTANCES----------------------------------
        distances = compute_distances(bin_map, curr, norm_data, dim_maxes, cor_measure, method,
                                      distance_measure)
        # todo python361
        # distancez.append([[data.loc[rank_data[rank_data[curr] == math.floor(dist_bins[i].right)].index.tolist()[0],
        #                             curr] for i in range(len(distances))], distances])
        # todo python342
        distancez.append([[data.loc[binning.rank_data[binning.rank_data[curr]
                                                      == math.floor(float(re.search(r', (-*\d+\.*\d*e*-*\d*)',
                                                                                    dist_bins[i]).group(1)))]
                           .index.tolist()[0], curr] for i in range(len(distances))], distances])
        ID_threshold = id.compute_ID_threshold(distances)
        # todo ext compute sliding average and count ID peaks above the avg (in a sliding window)
        # ID_peaks = id.compute_sliding_count(distances, ID_threshold)
        # pd.DataFrame(distances).to_csv(prefix + "_IDs_" + str(curr) + ".csv")

        # -----------------------------OPTIMAL MERGE STRATEGY----------------------------------
        # F is the cost table: the value in the i-th row and j-th column is the cost of
        # merging the first (i+1) micro bins into (j+1) macro bins
        F, discretizations = dynamic_merging(ID_threshold, distances, init_bins_count)
        # pd.DataFrame(F).to_csv(prefix + "_F_" + str(curr) + ".csv")
        # pd.DataFrame([[[b[-1] for b in k[:-1]] for k in c] for c in discretizations]).to_csv(prefix + "_bp_" + str(curr) + ".csv")

        # the cheapest discretization in the last row of F
        min_id = np.argmin(F[-1])
        curr_macro_intervals, curr_macro_points = get_discretized_points(curr, data, discretizations,
                                                                         dist_bins, min_id, binning.rank_data)

        write(log, '-------------------------')
        write(log, 'dimension:', curr)
        write(log, 'ID_threshold:', ID_threshold)
        write(log, 'cost:', F[-1, min_id])
        write(log, 'number of macrobins:', len(curr_macro_intervals))
        # write(log, 'distances', distances)
        write(log, '\ndistances between the macrobins:')
        for macro_id, macro_bin in enumerate(discretizations[-1][min_id][:-1]):
            write(log, "{0:.2f}".format(curr_macro_intervals[macro_id][1]) + " -", distances[macro_bin[-1]],
                  '[q=' + str((sorted(distances).index(distances[macro_bin[-1]]) + 1) / len(distances)) + ']')
        write(log, '\nnumber of points per macrobin:')
        for macro_id in curr_macro_intervals:
            write(log, "[" + "{0:.2f}".format(curr_macro_intervals[macro_id][0]) + ",",
                  "{0:.2f}".format(curr_macro_intervals[macro_id][1]) + "]",
                  sum([1 for p in curr_macro_points if p == macro_id]))
        write(log, '\n')
        disc_macro_intervals.append(curr_macro_intervals)
        disc_points.append(curr_macro_points)

    end = time.time()
    write(log, end - start, 'seconds')
    return disc_macro_intervals, disc_points, class_labels, distancez


def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_data):
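    """Translate the chosen discretization of dimension curr from micro bin ids back to
    value intervals of the original data (parsing the interval edges out of the textual
    bin labels), and assign each data point to its macro bin."""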
    disc_macro_intervals = dict()
    for i, macro_bin in enumerate(discretizations[-1][min_id]):
        macro_interval = []
        for micro_bin_id in macro_bin:
            # todo python361
            # right = \
            #     data.loc[rank_data[rank_data[curr] == math.floor(dist_bins[micro_bin_id].right)][curr].index[0]][curr]
            # if not len(macro_interval):
            #     macro_interval.append(
            #         data.loc[rank_data[rank_data[curr] == math.ceil(dist_bins[micro_bin_id].left)][curr].index[0]][curr])
            #     macro_interval.append(right)
            # todo python342
            # the right edge of the micro bin, mapped back to a data value
            right = data.loc[
                rank_data[rank_data[curr] == math.floor(float(re.search(r', (-*\d+\.*\d*e*-*\d*)',
                                                                        dist_bins[micro_bin_id]).group(1)))]
                [curr].index[0]][curr]
            if not len(macro_interval):
                # first micro bin of the macro bin: also store the left edge
                macro_interval.append(data.loc[
                    rank_data[rank_data[curr] == math.ceil(float(re.search(r'(-*\d+\.*\d*e*-*\d*),',
                                                                           dist_bins[micro_bin_id]).group(1)))]
                    [curr].index[0]][curr])
                macro_interval.append(right)
            else:
                macro_interval[1] = right
        disc_macro_intervals[i] = macro_interval

    macro_points = []
    for point in data.iterrows():
        macro_points.append(find_disc_macro_id(disc_macro_intervals, point[1][curr]))
    return disc_macro_intervals, macro_points


if __name__ == "__main__":
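    # Parse command-line arguments of the form -key=value; with no arguments, fall back
    # to a default run on one of the bundled synthetic data sets, e.g.:
    #   python main.py -f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID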
    if len(sys.argv) == 1:
        print('Usage: main.py -f=<data_file> -d=<delimiter> -c=<number of columns> -r=<number of rows> '
              '-m=<[original|greedy_topk|het_greedy_topk|best_first|beam_search|het_beam_search]> '
              '-cor=<[uds]> -dist=<[id|cjs]>')
        command = '-f=synthetic_cases/synthetic_3d_parity_problem.csv -d=; -dist=ID'
        print('Running default:', command)
        command_list = command.split(' ')
    else:
        command_list = sys.argv[1:]

    file_arg = list(filter(lambda x: x.startswith("-f="), command_list))
    if not file_arg:
        raise ValueError('No data file provided!')
    delim_arg = list(filter(lambda x: x.startswith("-d="), command_list))
    columns_arg = list(filter(lambda x: x.startswith("-c="), command_list))
    rows_arg = list(filter(lambda x: x.startswith("-r="), command_list))
    method_arg = list(filter(lambda x: x.startswith("-m="), command_list))
    corr_measure_arg = list(filter(lambda x: x.startswith("-cor="), command_list))
    distance_measure_arg = list(filter(lambda x: x.startswith("-dist="), command_list))

    data_file = file_arg[0].replace('-f=', '')
    delimiter = delim_arg[0].replace('-d=', '') if delim_arg else ';'
    columns = int(columns_arg[0].replace('-c=', '')) if columns_arg else None
    rows = int(rows_arg[0].replace('-r=', '')) if rows_arg else None
    method = cst.Method[method_arg[0].replace('-m=', '').upper()] if method_arg else cst.Method.ORIGINAL
    cor_measure = cst.CorrelationMeasure[corr_measure_arg[0].replace('-cor=', '').upper()] if corr_measure_arg \
        else None
    if method is not cst.Method.ORIGINAL and cor_measure is None:
        raise ValueError('A correlation measure should be given!')
    distance_measure = cst.DistanceMeasure[distance_measure_arg[0].replace('-dist=', '').upper()] \
        if distance_measure_arg else cst.DistanceMeasure.ID

    # read the data from the file, with the given delimiter and "?" marking NaN values
    data = pd.read_csv(data_file, delimiter=delimiter, header=None, na_values='?')
    # drop a data point if it contains inconsistent data
    data = data.dropna(axis=0, how='any')
    if columns:
        data = data.loc[:, :columns]
    if rows:
        data = data[:rows]

    # define the prefix for the output files
    data_file_name = util.get_file_name(data_file)
    dir = 'logs3/' + datetime.datetime.now().strftime("%Y%m%d_%H%M%S") \
          + "_" + distance_measure.name \
          + ("_" + cor_measure.name if cor_measure else "") \
          + "_" + method.name \
          + "_" + data_file_name \
          + ("_" + str(columns) if columns else "") \
          + ("_" + str(rows) if rows else "") + "/"
    os.makedirs(dir)
    print('output files are:', dir + '*')

    log_file = dir + "log.txt"
    try:
        with open(log_file, 'w') as log:
            disc_intervals, disc_points, class_labels, distances = compute_optimal_discretization(data, method,
                                                                                                  cor_measure,
                                                                                                  distance_measure,
                                                                                                  log)
            plot_distances(dir, distances)
            write_out_file(dir + cst.FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)
            write_cut_file(dir + cst.FILE_DATA_CUTS, disc_intervals)
    except:
        print("Error in " + dir + ":", sys.exc_info()[0])
        raise