Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
discrete regression with entropy minimisation added
  • Loading branch information
kbudhath committed Sep 19, 2017
1 parent 6bd12ba commit d54f9b9
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 43 deletions.
26 changes: 5 additions & 21 deletions crisp.py
Expand Up @@ -118,9 +118,9 @@ def cisc_grp(X, Y):


def crisp(X, Y):
# print 'regressing from x to y'
print 'regressing from x to y'
sxtoy = regress(X, Y)
# print 'regressing from y to x'
print 'regressing from y to x'
sytox = regress(Y, X)
return (sxtoy, sytox)

Expand All @@ -132,23 +132,7 @@ def test():


if __name__ == "__main__":
from test_synthetic import generate_additive_N, generate_X, map_randomly
size = 500000
suppfX = range(-7, 8)
X = generate_X("multinomial", size)
suppX = list(set(X))
f = map_randomly(suppX, suppfX)
N = generate_additive_N(size)
Y = [f[X[i]] + N[i] for i in xrange(size)]
from test_benchmark import load_pair
X, Y = load_pair(2)
# import cProfile
# cProfile.run('cisc(X, Y)')
# cProfile.run('test()')
# from cisc import cisc
print cisc(X, Y)
# from cisc import cisc
# start = time.time()
# print cisc_grp(X, Y)
# end = time.time()
# print end - start
X, Y = load_pair(99)
print crisp(X, Y)

68 changes: 68 additions & 0 deletions crispe.py
@@ -0,0 +1,68 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from collections import Counter, defaultdict
from copy import copy
import random
import sys
import time

from crisp import map_to_majority, marginals
from entropy import entropy


def regress(X, Y):
# target Y, feature X
max_iterations = 10000
scx = entropy(X)
len_dom_y = len(set(Y))
# print scx,
f = map_to_majority(X, Y)

supp_x = list(set(X))
supp_y = list(set(Y))

pair = zip(X, Y)
res = [y - f[x] for x, y in pair]
cur_res_codelen = entropy(res)

j = 0
minimized = True
while j < max_iterations and minimized:
random.shuffle(supp_x)
minimized = False

for x_to_map in supp_x:
best_res_codelen = sys.float_info.max
best_cand_y = None

for cand_y in supp_y:
if cand_y == f[x_to_map]:
continue

res = [y - f[x] if x != x_to_map else y -
cand_y for x, y in pair]
res_codelen = entropy(res)

if res_codelen < best_res_codelen:
best_res_codelen = res_codelen
best_cand_y = cand_y

if best_res_codelen < cur_res_codelen:
cur_res_codelen = best_res_codelen
f[x_to_map] = best_cand_y
minimized = True
j += 1
# print cur_res_codelen
return scx + cur_res_codelen


def crispe(X, Y):
sxtoy = regress(X, Y)
sytox = regress(Y, X)
return (sxtoy, sytox)


if __name__ == "__main__":
from test_benchmark import load_pair
X, Y = load_pair(99)
print crispe(X, Y)
21 changes: 21 additions & 0 deletions entropy.py
@@ -0,0 +1,21 @@
#!/usr/bin/env python
"""Computes entropy of a sequence of messages
"""
from __future__ import division
from collections import Counter
from math import log


def entropy(sequence):
res = 0
n = len(sequence)
counts = Counter(sequence).values()
for count in counts:
res -= (count / n) * (log(count, 2) - log(n, 2))
return res


if __name__=="__main__":
print entropy([1, 2, 1, 1, 1, 1])
print entropy([1, 1, 1, 1, 1, 1])
print entropy([1, 1, 1, 2, 2, 2])
16 changes: 12 additions & 4 deletions sc.py
Expand Up @@ -9,6 +9,8 @@
from decimal import Decimal
from math import ceil, e, factorial, log, pi, sqrt

from scipy.special import gammaln


__author__ = "Kailash Budhathoki"
__email__ = "kbudhath@mpi-inf.mpg.de"
Expand Down Expand Up @@ -93,8 +95,14 @@ def stochastic_complexity_slow(X):
return sc


def approx_normalizing_term(m, n):
res = ((m - 1) / 2) * log(n / (2 * pi)) + \
(m/2) * log(pi) - gammaln(m/2)
res = res * log(e, 2)
return res


if __name__ == "__main__":
import random
X = [random.randint(1, 200) for i in xrange(1000)]
print stochastic_complexity(X)
# print stochastic_complexity_slow(X)
L = 10
for n in xrange(10000,11000):
print multinomial_with_recurrence(L, n), approx_normalizing_term(L, n)
42 changes: 24 additions & 18 deletions test_synthetic.py
Expand Up @@ -15,6 +15,7 @@

from cisc import cisc
from crisp import crisp
from crispe import crispe
from dc import dc
from dr import dr
from utils import progress, plot_multiline, reverse_argsort, dc_compat
Expand Down Expand Up @@ -181,22 +182,22 @@ def _decision_rate(srcX):


def test_accuracy():
nsim = 5000
nsim = 1000
size = 5000
level = 0.01
suppfX = range(-7, 8)
srcsX = ["uniform", "binomial", "negativeBinomial",
"geometric", "hypergeometric", "poisson", "multinomial"]
print "-" * 58
print "%18s%10s%10s%10s%10s" % ("TYPE_X", "DC", "DR", "CISC", "CRISP")
print "-" * 58
print "-" * 64
print "%18s%10s%10s%10s%10s%10s" % ("TYPE_X", "DC", "DR", "CISC", "CRISPE", "CRISP")
print "-" * 64
sys.stdout.flush()

fp = open("results/acc-dtype.dat", "w")
fp.write("%s\t%s\t%s\t%s\t%s\n" %
("dtype", "dc", "dr", "cisc", "crisp"))
fp.write("%s\t%s\t%s\t%s\t%s\t%s\n" %
("dtype", "dc", "dr", "cisc", "crispe", "crisp"))
for srcX in srcsX:
nc_dc, nc_dr, nc_cisc, nc_crisp = 0, 0, 0, 0
nc_dc, nc_dr, nc_cisc, nc_crispe, nc_crisp = 0, 0, 0, 0, 0
for k in xrange(nsim):
X = generate_X(srcX, size)
suppX = list(set(X))
Expand All @@ -207,21 +208,24 @@ def test_accuracy():
dc_score = dc(dc_compat(X), dc_compat(Y))
dr_score = dr(X, Y, level)
cisc_score = cisc(X, Y)
crispe_score = crispe(X, Y)
crisp_score = crisp(X, Y)

nc_dc += int(dc_score[0] < dc_score[1])
nc_dr += int(dr_score[0] > level and dr_score[1] < level)
nc_cisc += int(cisc_score[0] < cisc_score[1])
nc_crispe += int(crispe_score[0] < crispe_score[1])
nc_crisp += int(crisp_score[0] < crisp_score[1])

acc_dc = nc_dc * 100 / nsim
acc_dr = nc_dr * 100 / nsim
acc_cisc = nc_cisc * 100 / nsim
acc_crispe = nc_crispe * 100 / nsim
acc_crisp = nc_crisp * 100 / nsim
print "%18s%10.2f%10.2f%10.2f%10.2f" % (srcX, acc_dc, acc_dr, acc_cisc, acc_crisp)
print "%18s%10.2f%10.2f%10.2f%10.2f%10.2f" % (srcX, acc_dc, acc_dr, acc_cisc, acc_crispe, acc_crisp)
sys.stdout.flush()
fp.write("%s\t%.2f\t%.2f\t%.2f\t%.2f\n" %
(srcX, acc_dc, acc_dr, acc_cisc, acc_crisp))
fp.write("%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n" %
(srcX, acc_dc, acc_dr, acc_cisc, acc_crispe, acc_crisp))
print "-" * 58
sys.stdout.flush()
fp.close()
Expand Down Expand Up @@ -527,16 +531,17 @@ def test_hypercompression():
def test_sample_size():
nsim = 5000
level = 0.05
sizes = [100, 500, 1000, 2500, 5000, 10000]
sizes = [50, 100, 500, 1000, 2500, 5000]
suppfX = range(-7, 8)
srcX = "geometric"

fp = open("results/acc-size.dat", "w")
diffs = []
fp.write("%s\t%s\t%s\t%s\t%s\n" % ("size", "dc", "dr", "cisc", "crisp"))
fp.write("%s\t%s\t%s\t%s\t%s\t%s\n" %
("size", "dc", "dr", "cisc", "crispe", "crisp"))
# progress(0, len(sizes))
for k, size in enumerate(sizes):
nc_dc, nc_dr, nc_cisc, nc_crisp = 0, 0, 0, 0
nc_dc, nc_dr, nc_cisc, nc_crispe, nc_crisp = 0, 0, 0, 0, 0
for j in xrange(nsim):
X = generate_X(srcX, size)
suppX = list(set(X))
Expand All @@ -547,26 +552,27 @@ def test_sample_size():
dc_score = dc(dc_compat(X), dc_compat(Y))
dr_score = dr(X, Y, level)
cisc_score = cisc(X, Y)
crispe_score = crispe(X, Y)
crisp_score = crisp(X, Y)

nc_dc += int(dc_score[0] < dc_score[1])
nc_dr += int(dr_score[0] > level and dr_score[1] < level)
nc_cisc += int(cisc_score[0] < cisc_score[1])
nc_crispe += int(crispe_score[0] < crispe_score[1])
nc_crisp += int(crisp_score[0] < crisp_score[1])

acc_dc = nc_dc * 100 / nsim
acc_dr = nc_dr * 100 / nsim
acc_cisc = nc_cisc * 100 / nsim
acc_crispe = nc_crispe * 100 / nsim
acc_crisp = nc_crisp * 100 / nsim
print "%d\t%.2f\t%.2f\t%.2f\t%.2f" % (size, acc_dc, acc_dr, acc_cisc, acc_crisp)
print "%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f" % (size, acc_dc, acc_dr, acc_cisc, acc_crispe, acc_crisp)
sys.stdout.flush()
fp.write("%d\t%.2f\t%.2f\t%.2f\t%.2f\n" %
(size, acc_dc, acc_dr, acc_cisc, acc_crisp))
fp.write("%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n" %
(size, acc_dc, acc_dr, acc_cisc, acc_crispe, acc_crisp))
# progress(k + 1, len(sizes))
fp.close()


if __name__ == "__main__":
# test_hypercompression()
# test_sample_size()
test_accuracy()

0 comments on commit d54f9b9

Please sign in to comment.