Commit

fixed errors
Tatiana Dembelova committed Aug 18, 2017
1 parent a92a56e commit 40dd425
Showing 6 changed files with 1,057 additions and 209 deletions.
43 changes: 29 additions & 14 deletions cjs.py
@@ -25,16 +25,16 @@ def sum_P(bin, maxes):


def sum_PlogP(sorted_bin, maxes):
-    maxes_ = maxes.transpose()
-    bin_ = sorted_bin.reset_index(drop=True)
-    count = bin_.shape[0]
+    maxes = maxes.transpose()
+    sorted_bin = sorted_bin.reset_index(drop=True)
+    count = sorted_bin.shape[0]

    if count == 0:
        return pd.Series([0 for i in range(maxes.shape[0])])

    cum = np.array([(i + 1) for i in range(count)])

-    return (pd.concat([bin_.loc[1:], maxes_], ignore_index=True, axis=0) - bin_).transpose() \
+    return (pd.concat([sorted_bin.loc[1:], maxes], ignore_index=True, axis=0) - sorted_bin).transpose() \
        .dot(cum * (np.log2(cum) - math.log(count, 2))) / count
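
Context for this hunk (an editor's sketch, not part of the diff): for a sorted sample x_1..x_n whose empirical CDF is the step function P(x) = i/n on [x_i, x_{i+1}), the returned expression evaluates sum_i (x_{i+1} - x_i) * (i/n) * log2(i/n), i.e. the integral of P*log2(P) needed by the cumulative Jensen-Shannon divergence, with no explicit loop. A minimal single-column version of the same computation (sample values and the domain maximum hi are hypothetical):

    import numpy as np

    x = np.sort(np.array([0.2, 0.5, 0.9]))   # hypothetical sorted sample
    hi = 1.0                                  # hypothetical domain maximum (the role of maxes)
    n = len(x)
    widths = np.append(x[1:], hi) - x         # widths of the empirical-CDF steps
    cum = np.arange(1, n + 1)                 # i = 1..n
    plogp = widths.dot(cum * (np.log2(cum) - np.log2(n))) / n
    print(plogp)   # equals sum_i (x_{i+1} - x_i) * (i/n) * log2(i/n)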


@@ -76,7 +76,7 @@ def compute_univariate_cjs(binA, binB, maxes):
    accepts bins of many dimensions, it returns a series of univariate CJS for each of the dimensions
    :param binA:
    :param binB:
-    :param maxes:
+    :param maxes: pd.Series
    :return:
    '''
    if binA.shape[1] != binB.shape[1]:
@@ -85,6 +85,7 @@ def compute_univariate_cjs(binA, binB, maxes):
    # - done for the parallel computation for all the dimensions
    ind_sorted_binA = ind_sort(binA)
    ind_sorted_binB = ind_sort(binB)
+    maxes = maxes.to_frame()

    CJSs = sum_PlogP(ind_sorted_binA, maxes) \
           - sum_PlogPQ(binA, binB, maxes) \
@@ -96,13 +96,13 @@ def compute_cond_CJS(binA, binB, binA_point_ids, binB_point_ids, I1, I2, maxes,
    if len(I1) != len(I2):
        raise ValueError

-    maxes_ = maxes.loc[dim].to_frame()
+    max = pd.Series(maxes[dim])
    total_binA_points_count = len(binA_point_ids)
    if 0 == total_binA_points_count:
        return 0
    return sum([len(binA_point_ids.intersection(I1[i])) / total_binA_points_count *
                compute_univariate_cjs(binA.loc[binA_point_ids.intersection(I1[i]), dim].to_frame(),
-                                      binB.loc[binB_point_ids.intersection(I2[i]), dim].to_frame(), maxes_)[0]
+                                      binB.loc[binB_point_ids.intersection(I2[i]), dim].to_frame(), max)[0]
                for i in range(len(I1))])
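
In formula form (a reading of this hunk, assuming I1 and I2 are matched lists of point-id sets): compute_cond_CJS returns sum_i (|A ∩ I1_i| / |A|) * cjs_dim(A ∩ I1_i, B ∩ I2_i), a mixture of per-interval univariate divergences; the change only swaps the pre-built one-column DataFrame for a pd.Series, matching the new compute_univariate_cjs contract above.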


@@ -181,17 +181,31 @@ def extend_I(I, disc):
    return disc_


-def compute_CJS(init_binA, init_binB, maxes):
+def compute_CJS(binA, binB, maxes):
    '''
    main method to compute CJS
    :param binA:
    :param binB:
    :param maxes:
    :return:
    '''
+    if type(maxes) is not pd.Series:
+        raise ValueError("maxes should be of pd.Series type!")
+    if len(maxes) != len(binA.columns) or len(maxes) != len(binB.columns):
+        raise ValueError("For computing CJS bins should have the same number of dimensions as maxes!")
    # renaming relevant columns in the basic order
-    binA = init_binA.rename(columns={init_binA.columns[i]: i for i in range(len(init_binA.columns))})
-    binB = init_binB.rename(columns={init_binB.columns[i]: i for i in range(len(init_binB.columns))})
+    binA = binA.rename(columns={binA.columns[i]: i for i in range(len(binA.columns))})
+    binB = binB.rename(columns={binB.columns[i]: i for i in range(len(binB.columns))})
+    # reindexing maxes in the basic order
+    maxes = maxes.reset_index(drop=True)
+
+    # fix missing values with mean value
+    fix_missing_values(binA)
+    fix_missing_values(binB)
+
    symm_cjs = _compute_CJS(binA, binB, maxes) + _compute_CJS(binB, binA, maxes)
-    normalization = sum(sum_P(binA, maxes)) + sum(sum_P(binB, maxes))
+    maxes_frame = maxes.to_frame()
+    normalization = sum(sum_P(binA, maxes_frame)) + sum(sum_P(binB, maxes_frame))
    return symm_cjs / normalization
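
Taken together, these compute_CJS changes tighten the public contract: maxes must be a pd.Series with one entry per shared dimension, and the function itself now normalizes column labels, imputes missing values, and symmetrizes the divergence. A hedged usage sketch (data and column names are hypothetical, compute_CJS imported from cjs.py):

    import numpy as np
    import pandas as pd
    from cjs import compute_CJS

    binA = pd.DataFrame(np.random.rand(100, 2), columns=['x', 'y'])
    binB = pd.DataFrame(np.random.rand(100, 2), columns=['x', 'y'])
    maxes = pd.concat([binA, binB]).max()   # pd.Series: one maximum per dimension
    score = compute_CJS(binA, binB, maxes)  # a DataFrame or wrong-length maxes now raises ValueError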


@@ -236,15 +251,15 @@ def _compute_CJS(binA, binB, maxes):
    binA = data.loc[[i for i in range(200)], attrs]
    binB = data.loc[[i for i in range(200, 400)], attrs]

-    print(str(compute_CJS(binA, binB, pd.DataFrame(np.max(data[attrs]).transpose().reset_index(drop=True)))))
+    print(str(compute_CJS(binA, binB, binA.max(0)[attrs])))


def compute_CJSs(bin_map, curr, data, dim_maxes):
    data_wo_curr = data.copy()
    data_wo_curr.pop(curr)
-    maxes_ = dim_maxes.drop(curr).to_frame().reset_index(drop=True)
+    dim_maxes = dim_maxes.drop(curr)

-    return compute_CJSs1(bin_map, data_wo_curr, maxes_)
+    return compute_CJSs1(bin_map, data_wo_curr, dim_maxes)


def compute_CJSs1(bin_map, data_wo_curr, maxes_):
2 changes: 1 addition & 1 deletion correlation_measures/binning.py
@@ -33,7 +33,7 @@ def equal_frequency_binning_duplicate_drop(self):
        qcut = self._compute_qcut()

        # qcut = qcut.cat.remove_unused_categories()
-        bounds = [float(re.search(', (-*\d+\.*\d*)', c).group(1)) for c in qcut.cat.categories]
+        bounds = [float(re.search(', (-*\d+\.*\d*e*-*\d*)', c).group(1)) for c in qcut.cat.categories]
        # including global_min with a margin of 1
        bounds.insert(0, self.global_min - 1)
        self.bounds = pd.Series(bounds)
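
The widened pattern matters because pandas renders very small qcut bin edges in scientific notation, which the old regex silently truncated. A quick illustration (the category label is a hypothetical qcut rendering):

    import re

    category = '(2.3e-05, 4.5e-05]'
    print(re.search(', (-*\d+\.*\d*)', category).group(1))         # old pattern -> '4.5' (truncated)
    print(re.search(', (-*\d+\.*\d*e*-*\d*)', category).group(1))  # new pattern -> '4.5e-05'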
46 changes: 34 additions & 12 deletions main.py
@@ -2,8 +2,10 @@
import sys

import datetime
+import time
# todo fix for server push
import matplotlib
+
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
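
The ordering in this hunk is load-bearing: matplotlib.use('Agg') has to run before the first import of matplotlib.pyplot, otherwise the backend choice may be ignored and plotting can fail on a display-less server, which is what the '# todo fix for server push' comment appears to be about. The import time addition supports the new wall-clock logging further down.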
@@ -40,8 +42,6 @@ def write(log, *args):
    log.write('\n')


-
-
def plot_distances(dir, distances):
    dim_count = len(distances)
    plt.figure(1)
@@ -77,15 +77,25 @@ def compute_distances(bin_map, curr, data, dim_maxes,
        subspace = sm.greedy_topk(data, curr, k, cor_measure)
    elif method == cst.Method.HET_GREEDY_TOPK:
        subspace = sm.het_greedy_topk(data, curr, k, delta, cor_measure)
+    elif method == cst.Method.BEST_FIRST:
+        subspace = sm.best_first(data, curr, k, cor_measure)
+    elif method == cst.Method.BEAM_SEARCH:
+        subspace = sm.beam_search(data, curr, k, beam_width, cor_measure)
+    elif method == cst.Method.HET_BEAM_SEARCH:
+        subspace = sm.het_beam_search(data, curr, k, beam_width, delta, cor_measure)
+    else:
+        raise ValueError("there is no such method!")
+    # todo the rest of the methods
    data = data.copy().loc[:, subspace]
    dim_maxes = dim_maxes[subspace]

    return id.compute_IDs1(bin_map, data, dim_maxes) if distance_measure == cst.DistanceMeasure.ID \
        else cjs.compute_CJSs1(bin_map, data, dim_maxes)


def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure=None,
                                   distance_measure=cst.DistanceMeasure.ID, log=None):
+    start = time.time()
    # class labels are not of much use in original ipd..
    class_labels = data.pop(data.shape[1] - 1)
    dim_count = data.shape[1]
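
Two things land in this hunk: three further subspace-search strategies (best-first, beam search, and a heterogeneous beam search) are wired into the method dispatch, with an explicit ValueError for unknown methods, and compute_optimal_discretization starts a wall-clock timer whose reading is logged just before the function returns (see the matching end = time.time() hunk below).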
@@ -127,7 +137,7 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure
    #     range(len(distances))], distances])
    # todo python342
    distancez.append([[data.loc[binning.rank_data[binning.rank_data[curr]
-                                                  == math.floor(float(re.search(', (-*\d+\.*\d*)',
+                                                  == math.floor(float(re.search(', (-*\d+\.*\d*e*-*\d*)',
                                                                                  dist_bins[i]).group(1)))]
                        .index.tolist()[0], curr] for i in range(len(distances))], distances])

@@ -171,6 +181,8 @@ def compute_optimal_discretization(data, method=cst.Method.ORIGINAL, cor_measure

    disc_macro_intervals.append(curr_macro_intervals)
    disc_points.append(curr_macro_points)
+    end = time.time()
+    write(log, end - start, 'seconds')
    return disc_macro_intervals, disc_points, class_labels, distancez


@@ -190,10 +202,14 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_

    # todo python342
    right = \
-        data.loc[rank_data[rank_data[curr] == math.floor(float(re.search(', (-*\d+\.*\d*)', dist_bins[micro_bin_id]).group(1)))][curr].index[0]][curr]
+        data.loc[rank_data[rank_data[curr] == math.floor(float(re.search(', (-*\d+\.*\d*e*-*\d*)',
+                                                                         dist_bins[micro_bin_id]).group(1)))][
+            curr].index[0]][curr]
    if not len(macro_interval):
        macro_interval.append(
-            data.loc[rank_data[rank_data[curr] == math.ceil(float(re.search('(-*\d+\.*\d*),', dist_bins[micro_bin_id]).group(1)))][curr].index[0]][
+            data.loc[rank_data[rank_data[curr] == math.ceil(float(re.search('(-*\d+\.*\d*e*-*\d*),',
+                                                                            dist_bins[micro_bin_id]).group(
+                1)))][curr].index[0]][
                curr])
        macro_interval.append(right)
    else:
@@ -235,6 +251,8 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_
    method = cst.Method[method_arg[0].replace('-m=', '').upper()] if method_arg else cst.Method.ORIGINAL
    cor_measure = cst.CorrelationMeasure[corr_measure_arg[0].replace('-cor=', '').upper()] if corr_measure_arg \
        else None
+    if method is not cst.Method.ORIGINAL and cor_measure is None:
+        raise ValueError('A correlation measure should be given!')
    distance_measure = cst.DistanceMeasure[
        distance_measure_arg[0].replace('-dist=', '').upper()] if distance_measure_arg \
        else cst.DistanceMeasure.ID
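
This guard turns a confusing downstream failure into a fast, explicit one: every method other than ORIGINAL needs a correlation measure to rank subspaces, so an invocation that passes, say, -m=GREEDY_TOPK without a -cor= value (flag values here are illustrative) now fails at argument parsing rather than deep inside the search.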
@@ -263,13 +281,17 @@ def get_discretized_points(curr, data, discretizations, dist_bins, min_id, rank_
    print('output files are:', dir + '*')
    log_file = dir + "log.txt"

-    with open(log_file, 'w') as log:
-        disc_intervals, disc_points, class_labels, distances = compute_optimal_discretization(data, method,
-                                                                                              cor_measure,
-                                                                                              distance_measure, log)
+    try:
+        with open(log_file, 'w') as log:
+            disc_intervals, disc_points, class_labels, distances = compute_optimal_discretization(data, method,
+                                                                                                  cor_measure,
+                                                                                                  distance_measure, log)

-        plot_distances(dir, distances)
+            plot_distances(dir, distances)

-        write_out_file(dir + cst.FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)
+            write_out_file(dir + cst.FILE_DATA_OUTPUT, disc_intervals, disc_points, class_labels)

-        write_cut_file(dir + cst.FILE_DATA_CUTS, disc_intervals)
+            write_cut_file(dir + cst.FILE_DATA_CUTS, disc_intervals)
+    except:
+        print ("Error in " + dir + ":", sys.exc_info()[0])
+        raise
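
The new try/except tags any failure with the offending output directory before re-raising; the bare raise preserves the original traceback, so errors are annotated rather than swallowed.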
