Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
crisp added in experiments
  • Loading branch information
kbudhath committed Aug 27, 2017
1 parent 7dda70c commit 8363da4
Show file tree
Hide file tree
Showing 224 changed files with 24,264 additions and 19,640 deletions.
21 changes: 17 additions & 4 deletions cisc.py
Expand Up @@ -3,6 +3,8 @@

"""Causal inference on discrete data using stochastic complexity of multinomial.
"""
from math import log

from collections import defaultdict
from sc import stochastic_complexity

Expand All @@ -27,19 +29,30 @@ def cisc(X, Y):
mYgX = marginals(X, Y)
mXgY = marginals(Y, X)

scYgX = sum(stochastic_complexity(Z) for Z in mYgX.itervalues())
scXgY = sum(stochastic_complexity(Z) for Z in mXgY.itervalues())
domX = mYgX.keys()
domY = mXgY.keys()

# plain one
# scYgX = sum(stochastic_complexity(Z, len(domY)) for Z in mYgX.itervalues())
# scXgY = sum(stochastic_complexity(Z, len(domX)) for Z in mXgY.itervalues())

# weighted one
scYgX = sum((len(Z) * 1.0) / len(X) * stochastic_complexity(Z)
for Z in mYgX.itervalues())
scXgY = sum((len(Z) * 1.0) / len(X) * stochastic_complexity(Z)
for Z in mXgY.itervalues())

ciscXtoY = scX + scYgX
ciscYtoX = scY + scXgY
# print "X=%.2f Ygx=%.2f" % (scX, scYgX)
# print "Y=%.2f XgY=%.2f" % (scY, scXgY)

return (ciscXtoY, ciscYtoX)


if __name__ == "__main__":
import random
from test_anm import map_randomly

from test_synthetic import map_randomly
n = 1000
Xd = range(1, 4)
fXd = range(1, 4)
Expand Down
153 changes: 153 additions & 0 deletions crisp.py
@@ -0,0 +1,153 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from collections import Counter, defaultdict
from copy import copy
import random
import sys
import time

from sc import stochastic_complexity


def marginals(X, Y):
Ys = defaultdict(list)
for i, x in enumerate(X):
Ys[x].append(Y[i])
return Ys


def map_to_majority(X, Y):
f = dict()
subgroups_y = defaultdict(list)

for i, x in enumerate(X):
subgroups_y[x].append(Y[i])

for x, subgroup_y in subgroups_y.iteritems():
freq_y, _ = Counter(subgroup_y).most_common(1)[0]
f[x] = freq_y

return f


def regress(X, Y):
# target Y, feature X
max_iterations = 10000
scx = stochastic_complexity(X)
# print scx,
f = map_to_majority(X, Y)

supp_x = list(set(X))
supp_y = list(set(Y))

pair = zip(X, Y)
res = [y - f[x] for x, y in pair]
cur_res_codelen = stochastic_complexity(res)

j = 0
minimized = True
while j < max_iterations and minimized:
random.shuffle(supp_x)
minimized = False

for x_to_map in supp_x:
best_res_codelen = sys.float_info.max
best_cand_y = None

for cand_y in supp_y:
if cand_y == f[x_to_map]:
continue

res = [y - f[x] if x != x_to_map else y -
cand_y for x, y in pair]
res_codelen = stochastic_complexity(res)

if res_codelen < best_res_codelen:
best_res_codelen = res_codelen
best_cand_y = cand_y

if best_res_codelen < cur_res_codelen:
cur_res_codelen = best_res_codelen
f[x_to_map] = best_cand_y
minimized = True
j += 1
# print cur_res_codelen
return scx + cur_res_codelen


def grp(X, Y):
scx = stochastic_complexity(X)
mygx = marginals(X, Y)
ygrps = mygx.values()
sc_ygrps = [stochastic_complexity(Z) for Z in ygrps]
# print "{%.2f}" % (scx + sum(sc_ygrps)),
# print "(%.2f)" % sum(sc_ygrps),

while True:
merge = None
best_gain = 0
for i in range(len(ygrps)):
sci = sc_ygrps[i]
for j in range(i + 1, len(ygrps)):
scj = sc_ygrps[j]
scij = stochastic_complexity(ygrps[i] + ygrps[j])
gain = sci + scj - scij
if gain > best_gain:
merge = (i, j)
best_gain = gain

if not merge:
break

k, l = merge
ygrps[k] = ygrps[k] + ygrps[l]
sc_ygrps[k] = sc_ygrps[k] + sc_ygrps[l] - best_gain
del ygrps[l]
del sc_ygrps[l]
# assert sum(sc_ygrps) == sum(stochastic_complexity(ygrp)
# for ygrp in ygrps)
# print "%.2f" % sum(sc_ygrps),
return scx + sum(sc_ygrps)


def cisc_grp(X, Y):
sxtoy = grp(X, Y)
sytox = grp(Y, X)
return (sxtoy, sytox)


def crisp(X, Y):
# print 'regressing from x to y'
sxtoy = regress(X, Y)
# print 'regressing from y to x'
sytox = regress(Y, X)
return (sxtoy, sytox)


def test():
X = range(10000)
Y = range(10000)
zip(X, Y)


if __name__ == "__main__":
from test_synthetic import generate_additive_N, generate_X, map_randomly
size = 500000
suppfX = range(-7, 8)
X = generate_X("multinomial", size)
suppX = list(set(X))
f = map_randomly(suppX, suppfX)
N = generate_additive_N(size)
Y = [f[X[i]] + N[i] for i in xrange(size)]
from test_benchmark import load_pair
X, Y = load_pair(2)
# import cProfile
# cProfile.run('cisc(X, Y)')
# cProfile.run('test()')
# from cisc import cisc
print cisc(X, Y)
# from cisc import cisc
# start = time.time()
# print cisc_grp(X, Y)
# end = time.time()
# print end - start
File renamed without changes.
1 change: 1 addition & 0 deletions data/nlschools/nlschools.name
@@ -0,0 +1 @@
language test score, social-economic status of pupil's family

0 comments on commit 8363da4

Please sign in to comment.