From d54f9b9bdc93228bb9b0ba13a9c962af3b7da3c6 Mon Sep 17 00:00:00 2001 From: Kailash Budhathoki Date: Tue, 19 Sep 2017 10:55:59 +0200 Subject: [PATCH] discrete regression with entropy minimisation added --- crisp.py | 26 ++++-------------- crispe.py | 68 +++++++++++++++++++++++++++++++++++++++++++++++ entropy.py | 21 +++++++++++++++ sc.py | 16 ++++++++--- test_synthetic.py | 42 ++++++++++++++++------------- 5 files changed, 130 insertions(+), 43 deletions(-) create mode 100644 crispe.py create mode 100644 entropy.py diff --git a/crisp.py b/crisp.py index a9f01b7..257e2cf 100644 --- a/crisp.py +++ b/crisp.py @@ -118,9 +118,9 @@ def cisc_grp(X, Y): def crisp(X, Y): - # print 'regressing from x to y' + print 'regressing from x to y' sxtoy = regress(X, Y) - # print 'regressing from y to x' + print 'regressing from y to x' sytox = regress(Y, X) return (sxtoy, sytox) @@ -132,23 +132,7 @@ def test(): if __name__ == "__main__": - from test_synthetic import generate_additive_N, generate_X, map_randomly - size = 500000 - suppfX = range(-7, 8) - X = generate_X("multinomial", size) - suppX = list(set(X)) - f = map_randomly(suppX, suppfX) - N = generate_additive_N(size) - Y = [f[X[i]] + N[i] for i in xrange(size)] from test_benchmark import load_pair - X, Y = load_pair(2) - # import cProfile - # cProfile.run('cisc(X, Y)') - # cProfile.run('test()') - # from cisc import cisc - print cisc(X, Y) - # from cisc import cisc - # start = time.time() - # print cisc_grp(X, Y) - # end = time.time() - # print end - start + X, Y = load_pair(99) + print crisp(X, Y) + diff --git a/crispe.py b/crispe.py new file mode 100644 index 0000000..1d02f58 --- /dev/null +++ b/crispe.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +from collections import Counter, defaultdict +from copy import copy +import random +import sys +import time + +from crisp import map_to_majority, marginals +from entropy import entropy + + +def regress(X, Y): + # target Y, feature X + max_iterations = 10000 + scx = entropy(X) + len_dom_y = len(set(Y)) + # print scx, + f = map_to_majority(X, Y) + + supp_x = list(set(X)) + supp_y = list(set(Y)) + + pair = zip(X, Y) + res = [y - f[x] for x, y in pair] + cur_res_codelen = entropy(res) + + j = 0 + minimized = True + while j < max_iterations and minimized: + random.shuffle(supp_x) + minimized = False + + for x_to_map in supp_x: + best_res_codelen = sys.float_info.max + best_cand_y = None + + for cand_y in supp_y: + if cand_y == f[x_to_map]: + continue + + res = [y - f[x] if x != x_to_map else y - + cand_y for x, y in pair] + res_codelen = entropy(res) + + if res_codelen < best_res_codelen: + best_res_codelen = res_codelen + best_cand_y = cand_y + + if best_res_codelen < cur_res_codelen: + cur_res_codelen = best_res_codelen + f[x_to_map] = best_cand_y + minimized = True + j += 1 + # print cur_res_codelen + return scx + cur_res_codelen + + +def crispe(X, Y): + sxtoy = regress(X, Y) + sytox = regress(Y, X) + return (sxtoy, sytox) + + +if __name__ == "__main__": + from test_benchmark import load_pair + X, Y = load_pair(99) + print crispe(X, Y) diff --git a/entropy.py b/entropy.py new file mode 100644 index 0000000..40f3a0c --- /dev/null +++ b/entropy.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +"""Computes entropy of a sequence of messages +""" +from __future__ import division +from collections import Counter +from math import log + + +def entropy(sequence): + res = 0 + n = len(sequence) + counts = Counter(sequence).values() + for count in counts: + res -= (count / n) * (log(count, 2) - log(n, 2)) + return res + + +if __name__=="__main__": + print entropy([1, 2, 1, 1, 1, 1]) + print entropy([1, 1, 1, 1, 1, 1]) + print entropy([1, 1, 1, 2, 2, 2]) diff --git a/sc.py b/sc.py index 74787bf..58e6ae9 100644 --- a/sc.py +++ b/sc.py @@ -9,6 +9,8 @@ from decimal import Decimal from math import ceil, e, factorial, log, pi, sqrt +from scipy.special import gammaln + __author__ = "Kailash Budhathoki" __email__ = "kbudhath@mpi-inf.mpg.de" @@ -93,8 +95,14 @@ def stochastic_complexity_slow(X): return sc +def approx_normalizing_term(m, n): + res = ((m - 1) / 2) * log(n / (2 * pi)) + \ + (m/2) * log(pi) - gammaln(m/2) + res = res * log(e, 2) + return res + + if __name__ == "__main__": - import random - X = [random.randint(1, 200) for i in xrange(1000)] - print stochastic_complexity(X) - # print stochastic_complexity_slow(X) + L = 10 + for n in xrange(10000,11000): + print multinomial_with_recurrence(L, n), approx_normalizing_term(L, n) diff --git a/test_synthetic.py b/test_synthetic.py index 4445825..88aaddd 100644 --- a/test_synthetic.py +++ b/test_synthetic.py @@ -15,6 +15,7 @@ from cisc import cisc from crisp import crisp +from crispe import crispe from dc import dc from dr import dr from utils import progress, plot_multiline, reverse_argsort, dc_compat @@ -181,22 +182,22 @@ def _decision_rate(srcX): def test_accuracy(): - nsim = 5000 + nsim = 1000 size = 5000 level = 0.01 suppfX = range(-7, 8) srcsX = ["uniform", "binomial", "negativeBinomial", "geometric", "hypergeometric", "poisson", "multinomial"] - print "-" * 58 - print "%18s%10s%10s%10s%10s" % ("TYPE_X", "DC", "DR", "CISC", "CRISP") - print "-" * 58 + print "-" * 64 + print "%18s%10s%10s%10s%10s%10s" % ("TYPE_X", "DC", "DR", "CISC", "CRISPE", "CRISP") + print "-" * 64 sys.stdout.flush() fp = open("results/acc-dtype.dat", "w") - fp.write("%s\t%s\t%s\t%s\t%s\n" % - ("dtype", "dc", "dr", "cisc", "crisp")) + fp.write("%s\t%s\t%s\t%s\t%s\t%s\n" % + ("dtype", "dc", "dr", "cisc", "crispe", "crisp")) for srcX in srcsX: - nc_dc, nc_dr, nc_cisc, nc_crisp = 0, 0, 0, 0 + nc_dc, nc_dr, nc_cisc, nc_crispe, nc_crisp = 0, 0, 0, 0, 0 for k in xrange(nsim): X = generate_X(srcX, size) suppX = list(set(X)) @@ -207,21 +208,24 @@ def test_accuracy(): dc_score = dc(dc_compat(X), dc_compat(Y)) dr_score = dr(X, Y, level) cisc_score = cisc(X, Y) + crispe_score = crispe(X, Y) crisp_score = crisp(X, Y) nc_dc += int(dc_score[0] < dc_score[1]) nc_dr += int(dr_score[0] > level and dr_score[1] < level) nc_cisc += int(cisc_score[0] < cisc_score[1]) + nc_crispe += int(crispe_score[0] < crispe_score[1]) nc_crisp += int(crisp_score[0] < crisp_score[1]) acc_dc = nc_dc * 100 / nsim acc_dr = nc_dr * 100 / nsim acc_cisc = nc_cisc * 100 / nsim + acc_crispe = nc_crispe * 100 / nsim acc_crisp = nc_crisp * 100 / nsim - print "%18s%10.2f%10.2f%10.2f%10.2f" % (srcX, acc_dc, acc_dr, acc_cisc, acc_crisp) + print "%18s%10.2f%10.2f%10.2f%10.2f%10.2f" % (srcX, acc_dc, acc_dr, acc_cisc, acc_crispe, acc_crisp) sys.stdout.flush() - fp.write("%s\t%.2f\t%.2f\t%.2f\t%.2f\n" % - (srcX, acc_dc, acc_dr, acc_cisc, acc_crisp)) + fp.write("%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n" % + (srcX, acc_dc, acc_dr, acc_cisc, acc_crispe, acc_crisp)) print "-" * 58 sys.stdout.flush() fp.close() @@ -527,16 +531,17 @@ def test_hypercompression(): def test_sample_size(): nsim = 5000 level = 0.05 - sizes = [100, 500, 1000, 2500, 5000, 10000] + sizes = [50, 100, 500, 1000, 2500, 5000] suppfX = range(-7, 8) srcX = "geometric" fp = open("results/acc-size.dat", "w") diffs = [] - fp.write("%s\t%s\t%s\t%s\t%s\n" % ("size", "dc", "dr", "cisc", "crisp")) + fp.write("%s\t%s\t%s\t%s\t%s\t%s\n" % + ("size", "dc", "dr", "cisc", "crispe", "crisp")) # progress(0, len(sizes)) for k, size in enumerate(sizes): - nc_dc, nc_dr, nc_cisc, nc_crisp = 0, 0, 0, 0 + nc_dc, nc_dr, nc_cisc, nc_crispe, nc_crisp = 0, 0, 0, 0, 0 for j in xrange(nsim): X = generate_X(srcX, size) suppX = list(set(X)) @@ -547,26 +552,27 @@ def test_sample_size(): dc_score = dc(dc_compat(X), dc_compat(Y)) dr_score = dr(X, Y, level) cisc_score = cisc(X, Y) + crispe_score = crispe(X, Y) crisp_score = crisp(X, Y) nc_dc += int(dc_score[0] < dc_score[1]) nc_dr += int(dr_score[0] > level and dr_score[1] < level) nc_cisc += int(cisc_score[0] < cisc_score[1]) + nc_crispe += int(crispe_score[0] < crispe_score[1]) nc_crisp += int(crisp_score[0] < crisp_score[1]) acc_dc = nc_dc * 100 / nsim acc_dr = nc_dr * 100 / nsim acc_cisc = nc_cisc * 100 / nsim + acc_crispe = nc_crispe * 100 / nsim acc_crisp = nc_crisp * 100 / nsim - print "%d\t%.2f\t%.2f\t%.2f\t%.2f" % (size, acc_dc, acc_dr, acc_cisc, acc_crisp) + print "%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f" % (size, acc_dc, acc_dr, acc_cisc, acc_crispe, acc_crisp) sys.stdout.flush() - fp.write("%d\t%.2f\t%.2f\t%.2f\t%.2f\n" % - (size, acc_dc, acc_dr, acc_cisc, acc_crisp)) + fp.write("%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n" % + (size, acc_dc, acc_dr, acc_cisc, acc_crispe, acc_crisp)) # progress(k + 1, len(sizes)) fp.close() if __name__ == "__main__": - # test_hypercompression() - # test_sample_size() test_accuracy()