From 6bd12bac6450dfaf75905e3809989fea81ca6345 Mon Sep 17 00:00:00 2001 From: Kailash Budhathoki Date: Sat, 16 Sep 2017 20:59:56 +0200 Subject: [PATCH] size of domain of target variable passed to the stochastic complexity method --- crisp.py | 5 ++-- test_real.py | 33 ++++++++++++++++++----- test_synthetic.py | 68 ++++++++++++++++++++++++++++++++++++----------- 3 files changed, 81 insertions(+), 25 deletions(-) diff --git a/crisp.py b/crisp.py index 6edd1f7..a9f01b7 100644 --- a/crisp.py +++ b/crisp.py @@ -34,6 +34,7 @@ def regress(X, Y): # target Y, feature X max_iterations = 10000 scx = stochastic_complexity(X) + len_dom_y = len(set(Y)) # print scx, f = map_to_majority(X, Y) @@ -42,7 +43,7 @@ def regress(X, Y): pair = zip(X, Y) res = [y - f[x] for x, y in pair] - cur_res_codelen = stochastic_complexity(res) + cur_res_codelen = stochastic_complexity(res, len_dom_y) j = 0 minimized = True @@ -60,7 +61,7 @@ def regress(X, Y): res = [y - f[x] if x != x_to_map else y - cand_y for x, y in pair] - res_codelen = stochastic_complexity(res) + res_codelen = stochastic_complexity(res, len_dom_y) if res_codelen < best_res_codelen: best_res_codelen = res_codelen diff --git a/test_real.py b/test_real.py index e7b6bd0..21145cd 100644 --- a/test_real.py +++ b/test_real.py @@ -9,8 +9,8 @@ import numpy as np -# from cisc import cisc -from anms import cisc +from cisc import cisc +from crisp import crisp from dr import dr from dc import dc from utils import dc_compat @@ -163,10 +163,11 @@ def test_abalone(): for i in xrange(1, ncols): Y = data[:, i] cisc_score = cisc(sex, Y) + crisp_score = crisp(sex, Y) dr_score = dr(sex.tolist(), Y.tolist(), level) dc_score = dc(dc_compat(sex), dc_compat(Y)) - print "CISC::", abs(cisc_score[0] - cisc_score[1]), cisc_score, + print "CISC::", if cisc_score[0] < cisc_score[1]: print "%s ⇒ %s" % ("Sex", colnames[i]), elif cisc_score[0] > cisc_score[1]: @@ -193,6 +194,14 @@ def test_abalone(): print "%s ~ %s" % ("Sex", colnames[i]), print + print 
"CRISP::", + if crisp_score[0] < crisp_score[1]: + print "%s ⇒ %s" % ("Sex", colnames[i]), + elif crisp_score[0] > crisp_score[1]: + print "%s ⇐ %s" % ("Sex", colnames[i]), + else: + print "%s ~ %s" % ("Sex", colnames[i]), + print print @@ -206,10 +215,11 @@ def test_nlschools(): status = data[:, 1] cisc_score = cisc(score, status) + crisp_score = cisc(score, status) dr_score = dr(score.tolist(), status.tolist(), level) dc_score = dc(dc_compat(score), dc_compat(status)) - print "CISC::", abs(cisc_score[0] - cisc_score[1]), cisc_score, + print "CISC::", if cisc_score[0] < cisc_score[1]: print "%s ⇒ %s" % ("score", "status"), elif cisc_score[0] > cisc_score[1]: @@ -236,6 +246,15 @@ def test_nlschools(): print "%s ~ %s" % ("score", "status"), print + print "CRISP::", + if crisp_score[0] < crisp_score[1]: + print "%s ⇒ %s" % ("score", "status"), + elif crisp_score[0] > crisp_score[1]: + print "%s ⇐ %s" % ("score", "status"), + else: + print "%s ~ %s" % ("score", "status"), + print + def test_acute(): print "testing cisc on acute inflammation dataset" @@ -304,10 +323,10 @@ def test_faces(): if __name__ == "__main__": - test_faces() + # test_faces() # test_car() - # test_abalone() - # test_nlschools() + test_abalone() + test_nlschools() # test_acute() # test_nursery() diff --git a/test_synthetic.py b/test_synthetic.py index 30994cb..4445825 100644 --- a/test_synthetic.py +++ b/test_synthetic.py @@ -183,7 +183,7 @@ def _decision_rate(srcX): def test_accuracy(): nsim = 5000 size = 5000 - level = 0.05 + level = 0.01 suppfX = range(-7, 8) srcsX = ["uniform", "binomial", "negativeBinomial", "geometric", "hypergeometric", "poisson", "multinomial"] @@ -453,14 +453,15 @@ def test_significance(): def test_hypercompression(): - m = 500 - size = 2000 - alpha = 0.001 + m = 100 + size = 100 + alpha = 0.01 suppfX = range(-7, 8) - srcX = "uniform" + srcX = "geometric" - # fp = open("results/no-hypercompression.dat", "w") + fp = open("results/no-hypercompression.dat", "w") diffs = [] + 
decisions = [] # 1=correct, -1=incorrect, 0=wrong for i in xrange(m): X = generate_X(srcX, size) suppX = list(set(X)) @@ -469,24 +470,58 @@ def test_hypercompression(): Y = [f[X[i]] + N[i] for i in xrange(size)] crisp_score = crisp(X, Y) diff = abs(crisp_score[0] - crisp_score[1]) - if crisp_score[0] > crisp_score[1]: - print "wrong inf", diff + + if crisp_score[0] < crisp_score[1]: + decision = 1 + elif crisp_score[0] > crisp_score[1]: + decision = -1 + else: + continue + diffs.append(int(diff)) - diffs = sorted(diffs, reverse=True) - # fp.write("sn\tdiff\tsig\n") # header + decisions.append(decision) + sorted_diffs_indices = reverse_argsort(diffs) + diffs = [diffs[idx] for idx in sorted_diffs_indices] + decisions = [decisions[idx] for idx in sorted_diffs_indices] + + # flags for coloring + # correct, significant = 1 + # correct, insignificant = 2 + # incorrect, significant = 3 + # incorrect, insignificant = 4 + + fp.write("sn\tdiff\tsig\tdec\tcolor\n") # header for k, diff in enumerate(diffs, 1): log_p_value = -diff bh_stat = k * alpha / m log_bh_stat = math.log(bh_stat, 2) + if log_bh_stat < log_p_value: - # fp.write("%i\t%d\t%d\n" % (k, diff, 0)) #reject: not significant - print k, diff, log_bh_stat, log_p_value, 0 + significant = 0 + if decisions[k - 1] == 1: + color = 2 + else: + color = 4 else: - # fp.write("%i\t%d\t%d\n" % (k, diff, 1)) #accept: significant - print k, diff, log_bh_stat, log_p_value, 1 + significant = 1 + if decisions[k - 1] == 1: + color = 1 + elif decisions[k - 1] == -1: + color = 3 + + fp.write("%i\t%d\t%d\t%d\t%d\n" % + (k, diff, significant, decisions[k - 1], color)) + # if log_bh_stat < log_p_value: + # # reject: not significant + # fp.write("%i\t%d\t%d\t%d\n" % (k, diff, 0, decisions[k - 1])) + # print k, diff, log_bh_stat, log_p_value, 0, decisions[k - 1] + # else: + # fp.write("%i\t%d\t%d\t%d\n" % + # (k, diff, 1, decisions[k - 1])) # accept: significant + # print k, diff, 1, decisions[k - 1] # fp.write("%i\t%d\n" % (k, diff)) - # 
fp.close() + fp.close() def test_sample_size(): @@ -532,5 +567,6 @@ def test_sample_size(): if __name__ == "__main__": - test_sample_size() + # test_hypercompression() + # test_sample_size() test_accuracy()