Skip to content
Permalink
46b9efef77
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
612 lines (506 sloc) 20.8 KB
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Here, we test the accuracy of various causal inference techniques in identifying ANMs with different ranges of noise term.
"""
from __future__ import division
from functools import partial
import math
import random
import sys
import time
import numpy as np
from stat_tests import friedman_test, nemenyi_multitest, bonferroni_dunn_test
from cisc import cisc
from crisp import crisp
from crispe import crispe
from dc import dc
from dr import dr
from utils import progress, plot_multiline, reverse_argsort, dc_compat
__author__ = "Kailash Budhathoki"
__email__ = "kbudhath@mpi-inf.mpg.de"
__copyright__ = "Copyright (c) 2017"
__license__ = "MIT"
class InfRes(object):
def __init__(self):
self.ncorrect = 0
self.nwrong = 0
self.nundec = 0
def to_str(self, nsample):
assert self.ncorrect + self.nwrong + self.nundec == nsample
return str.center("%d %d %d %.2f" % (self.ncorrect, self.nwrong, self.nundec, self.ncorrect / nsample), 20)
def map_randomly(Xd, fXd):
f = dict()
for x in Xd:
y = random.choice(fXd)
f[x] = y
# ensure that f is not a constant function
if len(set(f.values())) == 1:
f = map_randomly(Xd, fXd)
assert len(set(f.values())) != 1
return f
def generate_X(src, size):
if src == "uniform":
maxX = random.randint(2, 10)
X = [random.randint(1, maxX) for i in xrange(size)]
elif src == "multinomial":
p_nums = [random.randint(1, 10) for i in xrange(random.randint(2, 11))]
p_vals = [v / sum(p_nums) for v in p_nums]
X = np.random.multinomial(size, p_vals, size=1)[0].tolist()
X = [[i + 1] * f for i, f in enumerate(X)]
X = [j for sublist in X for j in sublist]
elif src == "binomial":
n = random.randint(1, 40)
p = random.uniform(0.1, 0.9)
X = np.random.binomial(n, p, size).tolist()
elif src == "geometric":
p = random.uniform(0.1, 0.9)
X = np.random.geometric(p, size).tolist()
elif src == "hypergeometric":
ngood = random.randint(1, 40)
nbad = random.randint(1, 40)
nsample = random.randint(1, min(40, ngood + nbad - 1))
X = np.random.hypergeometric(ngood, nbad, nsample, size).tolist()
elif src == "poisson":
lam = random.randint(1, 10)
X = np.random.poisson(lam, size).tolist()
elif src == "negativeBinomial":
n = random.randint(1, 40)
p = random.uniform(0.1, 0.9)
X = np.random.negative_binomial(n, p, size).tolist()
return X
def generate_additive_N(size):
t = random.randint(1, 7)
suppN = range(-t, t + 1)
N = [random.choice(suppN) for i in xrange(size)]
return N
def test_decision_rate():
srcsX = ["uniform", "binomial", "negativeBinomial",
"geometric", "hypergeometric", "poisson", "multinomial"]
total = len(srcsX)
count = 0
progress(count, total)
for srcX in srcsX:
count += 1
_decision_rate(srcX)
progress(count, total)
def _decision_rate(srcX):
nsample = 1000
size = 1000
level = 0.05
suppfX = range(-7, 8)
diff_cisc, diff_dc, diff_dr = [], [], []
res_cisc, res_dc, res_dr = [], [], []
for k in xrange(nsample):
X = generate_X(srcX, size)
suppX = list(set(X))
f = map_randomly(suppX, suppfX)
N = generate_additive_N(size)
Y = [f[X[i]] + N[i] for i in xrange(size)]
cisc_score = cisc(X, Y)
dc_score = dc(dc_compat(X), dc_compat(Y))
dr_score = dr(X, Y, level)
undecided_cisc = cisc_score[0] == cisc_score[1]
undecided_dc = dc_score[0] == dc_score[1]
undecided_dr = (dr_score[0] < level and dr_score[1] < level) or (
dr_score[0] > level and dr_score[1] > level)
diff_cisc.append(abs(cisc_score[0] - cisc_score[1]))
diff_dc.append(abs(dc_score[0] - dc_score[1]))
diff_dr.append(abs(dr_score[0] - dr_score[1]))
if undecided_cisc:
res_cisc.append(random.choice([True, False]))
else:
res_cisc.append(cisc_score[0] < cisc_score[1])
if undecided_dc:
res_dc.append(random.choice([True, False]))
else:
res_dc.append(dc_score[0] < dc_score[1])
if undecided_dr:
res_dr.append(random.choice([True, False]))
else:
res_dr.append(dr_score[0] > level and dr_score[1] < level)
indices_cisc = reverse_argsort(diff_cisc)
indices_dc = reverse_argsort(diff_dc)
indices_dr = reverse_argsort(diff_dr)
diff_cisc = [diff_cisc[i] for i in indices_cisc]
diff_dc = [diff_dc[i] for i in indices_dc]
diff_dr = [diff_dr[i] for i in indices_dr]
res_cisc = [res_cisc[i] for i in indices_cisc]
res_dc = [res_dc[i] for i in indices_dc]
res_dr = [res_dr[i] for i in indices_dr]
dec_rate = np.arange(0.01, 1.01, 0.01)
acc_cisc, acc_dc, acc_dr = [], [], []
for r in dec_rate:
dec_cisc_r = res_cisc[:int(r * nsample)]
dec_dc_r = res_dc[:int(r * nsample)]
dec_dr_r = res_dr[:int(r * nsample)]
acc_cisc_r = sum(dec_cisc_r) / len(dec_cisc_r)
acc_dc_r = sum(dec_dc_r) / len(dec_dc_r)
acc_dr_r = sum(dec_dr_r) / len(dec_dr_r)
acc_cisc.append(acc_cisc_r)
acc_dc.append(acc_dc_r)
acc_dr.append(acc_dr_r)
# print dec_rate
# print acc_dc
# print acc_dr
# print acc_cisc
with open("results/dec_rate_synth_%s.dat" % srcX, "w") as writer:
for i, r in enumerate(dec_rate):
writer.write("%.2f %.2f %.2f %.2f\n" %
(r, acc_dc[i], acc_dr[i], acc_cisc[i]))
# plot_multiline([acc_cisc, acc_dc, acc_dr], dec_rate, [
# "CISC", "DC", "DR"], "decision rate", "accuracy", "decision rate versus accuracy", "dec_rate_%sX.png" % srcX)
def are_disjoint(sets):
disjoint = True
union = set()
for s in sets:
for x in s:
if x in union:
disjoint = False
break
union.add(x)
return disjoint
def test_accuracy():
nsim = 1000
size = 1000
level = 0.01
suppfX = range(-7, 8)
srcsX = ["uniform", "binomial", "negativeBinomial",
"geometric", "hypergeometric", "poisson", "multinomial"]
print "-" * 70
print "%18s%10s%10s%10s%10s%10s" % ("TYPE_X", "DC", "DR", "CISC", "CRISPE", "CRISP")
print "-" * 70
sys.stdout.flush()
fp = open("results/acc-dtype.dat", "w")
fp.write("%s\t%s\t%s\t%s\t%s\t%s\n" %
("dtype", "dc", "dr", "cisc", "crispe", "crisp"))
for srcX in srcsX:
nsamples = 0
nc_dc, nc_dr, nc_cisc, nc_crispe, nc_crisp = 0, 0, 0, 0, 0
while nsamples < nsim:
X = generate_X(srcX, size)
suppX = list(set(X))
f = map_randomly(suppX, suppfX)
N = generate_additive_N(size)
Y = [f[X[i]] + N[i] for i in xrange(size)]
# check if f(x) + supp N are disjoint for x in domx
suppN = set(N)
decomps = []
for x in suppX:
fx = f[x]
sum_fx_suppN = set([fx + n for n in suppN])
decomps.append(sum_fx_suppN)
non_overlapping_noise = are_disjoint(decomps)
if non_overlapping_noise:
continue
nsamples += 1
dc_score = dc(dc_compat(X), dc_compat(Y))
dr_score = dr(X, Y, level)
cisc_score = cisc(X, Y)
crispe_score = crispe(X, Y)
crisp_score = crisp(X, Y)
nc_dc += int(dc_score[0] < dc_score[1])
nc_dr += int(dr_score[0] > level and dr_score[1] < level)
nc_cisc += int(cisc_score[0] < cisc_score[1])
nc_crispe += int(crispe_score[0] < crispe_score[1])
nc_crisp += int(crisp_score[0] < crisp_score[1])
assert nsamples == nsim
acc_dc = nc_dc * 100 / nsim
acc_dr = nc_dr * 100 / nsim
acc_cisc = nc_cisc * 100 / nsim
acc_crispe = nc_crispe * 100 / nsim
acc_crisp = nc_crisp * 100 / nsim
print "%18s%10.2f%10.2f%10.2f%10.2f%10.2f" % (srcX, acc_dc, acc_dr, acc_cisc, acc_crispe, acc_crisp)
sys.stdout.flush()
fp.write("%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n" %
(srcX, acc_dc, acc_dr, acc_cisc, acc_crispe, acc_crisp))
print "-" * 70
sys.stdout.flush()
fp.close()
# todo: use crisp
def test_accuracy_at_model_space():
nsim = 1000
size = 1000
level = 0.05
suppfX = range(-7, 8)
S = range(1, 7)
srcsX = ["uniform", "binomial", "negativeBinomial",
"geometric", "hypergeometric", "poisson", "multinomial"]
print "%s | %s | %s | %s" % ("typeX".center(18), "sc_space".center(62), "dr_space".center(62), "full_space".center(62))
print "%18s | %s %s %s | %s %s %s | %s %s %s" % ("", "sc".center(20), "dc".center(20), "dr".center(20), "sc".center(20), "dc".center(20), "dr".center(20), "sc".center(20), "dc".center(20), "dr".center(20))
for srcX in srcsX:
nsample_sc, nsample_anm, nsample_all = 0, 0, 0
rsc_sc, rsc_dc, rsc_dr = InfRes(), InfRes(), InfRes()
ranm_sc, ranm_dc, ranm_dr = InfRes(), InfRes(), InfRes()
rall_sc, rall_dc, rall_dr = InfRes(), InfRes(), InfRes()
for k in xrange(nsim):
X = generate_X(srcX, size)
suppX = list(set(X))
f = map_randomly(suppX, suppfX)
N = generate_additive_N(size)
Y = [f[X[i]] + N[i] for i in xrange(size)]
cisc_score = cisc(X, Y)
dc_score = dc(dc_compat(X), dc_compat(Y))
dr_score = dr(X, Y, level)
undecided_dr = (dr_score[0] < level and dr_score[1] < level) or (
dr_score[0] > level and dr_score[1] > level)
if cisc_score[0] != cisc_score[1]:
nsample_sc += 1
rsc_sc.ncorrect += int(cisc_score[0] < cisc_score[1])
rsc_sc.nwrong += int(cisc_score[0] > cisc_score[1])
rsc_sc.nundec += int(cisc_score[0] == cisc_score[1])
rsc_dc.ncorrect += int(dc_score[0] < dc_score[1])
rsc_dc.nwrong += int(dc_score[0] > dc_score[1])
rsc_dc.nundec += int(dc_score[0] == dc_score[1])
rsc_dr.ncorrect += int(dr_score[0]
> level and dr_score[1] < level)
rsc_dr.nwrong += int(dr_score[0]
< level and dr_score[1] > level)
rsc_dr.nundec += int(undecided_dr)
if not undecided_dr:
nsample_anm += 1
ranm_sc.ncorrect += int(cisc_score[0] < cisc_score[1])
ranm_sc.nwrong += int(cisc_score[0] > cisc_score[1])
ranm_sc.nundec += int(cisc_score[0] == cisc_score[1])
ranm_dc.ncorrect += int(dc_score[0] < dc_score[1])
ranm_dc.nwrong += int(dc_score[0] > dc_score[1])
ranm_dc.nundec += int(dc_score[0] == dc_score[1])
ranm_dr.ncorrect += int(dr_score[0]
> level and dr_score[1] < level)
ranm_dr.nwrong += int(dr_score[0]
< level and dr_score[1] > level)
ranm_dr.nundec += int(undecided_dr)
nsample_all += 1
rall_sc.ncorrect += int(cisc_score[0] < cisc_score[1])
rall_sc.nwrong += int(cisc_score[0] > cisc_score[1])
rall_sc.nundec += int(cisc_score[0] == cisc_score[1])
rall_dc.ncorrect += int(dc_score[0] < dc_score[1])
rall_dc.nwrong += int(dc_score[0] > dc_score[1])
rall_dc.nundec += int(dc_score[0] == dc_score[1])
rall_dr.ncorrect += int(dr_score[0]
> level and dr_score[1] < level)
rall_dr.nwrong += int(dr_score[0] < level and dr_score[1] > level)
rall_dr.nundec += int(undecided_dr)
print "%s | %s %s %s | %s %s %s | %s %s %s" % (srcX.center(18), rsc_sc.to_str(nsample_sc), rsc_dc.to_str(nsample_sc), rsc_dr.to_str(nsample_sc), ranm_sc.to_str(nsample_anm), ranm_dc.to_str(nsample_anm), ranm_dr.to_str(nsample_anm), rall_sc.to_str(nsample_all), rall_dc.to_str(nsample_all), rall_dr.to_str(nsample_all))
def execute_algorithms(X, Y, nloop):
dcX, dcY = dc_compat(X), dc_compat(Y)
dc_t, dr_t, cisc_t, crisp_t = 0, 0, 0, 0
for i in xrange(nloop):
tstart = time.time()
dc(dcX, dcY)
tend = time.time()
dc_t += tend - tstart
tstart = time.time()
dr(X, Y, 0.05)
tend = time.time()
dr_t += tend - tstart
tstart = time.time()
cisc(X, Y)
tend = time.time()
cisc_t += tend - tstart
tstart = time.time()
crisp(X, Y)
tend = time.time()
crisp_t += tend - tstart
return dc_t / nloop, dr_t / nloop, cisc_t / nloop, crisp_t / nloop
def test_size_runtime():
nloop = 5
suppX = range(20)
suppY = range(20)
sizes = range(1000000, 11000000, 1000000)
fp = open("results/time-size.dat", "w")
fp.write("size\tdc\tdr\tcisc\tcrisp\n")
for size in sizes:
X = [random.choice(suppX) for i in xrange(size)]
Y = [random.choice(suppY) for i in xrange(size)]
dc_t, dr_t, cisc_t, crisp_t = execute_algorithms(X, Y, nloop)
print size, dc_t, dr_t, cisc_t, crisp_t
sys.stdout.flush()
fp.write("%d %f %f %f %f\n" % (size, dc_t, dr_t, cisc_t, crisp_t))
fp.close()
def test_domain_runtime():
nloop = 5
size = 10000
domains = range(100, 1100, 100)
fp = open("results/time-domain.dat", "w")
fp.write("domain\tdc\tdr\tcisc\tcrisp\n")
for domain in domains:
suppX = range(domain)
suppY = range(domain)
X = [random.choice(suppX) for i in xrange(size)]
Y = [random.choice(suppY) for i in xrange(size)]
dc_t, dr_t, cisc_t, crisp_t = execute_algorithms(X, Y, nloop)
print domain, dc_t, dr_t, cisc_t, crisp_t
sys.stdout.flush()
fp.write("%d %f %f %f %f\n" % (domain, dc_t, dr_t, cisc_t, crisp_t))
fp.close()
def test_power():
sizes = [1000, 2000, 3000, 4000, 5000]
sizes = [1000]
level = 0.05
srcX = "multinomial"
nsim_cutoff, nsim_power = 100, 100
suppfX = range(-7, 8)
for size in sizes:
diffs_dc, diffs_dr, diffs_cisc = [], [], []
for i in xrange(nsim_cutoff):
X = generate_X(srcX, size)
suppX = list(set(X))
f = map_randomly(suppX, suppfX)
N = generate_additive_N(size)
# Y = [f[X[i]] + N[i] for i in xrange(size)]
# Y = np.random.permutation(Y).tolist()
Y = generate_X(srcX, size)
dc_score = dc(dc_compat(X), dc_compat(Y))
dr_score = dr(X, Y, level)
cisc_score = cisc(X, Y)
diffs_dc.append(abs(dc_score[0] - dc_score[1]))
diffs_dr.append(abs(dr_score[0] - dr_score[1]))
diffs_cisc.append(abs(cisc_score[0] - cisc_score[1]))
cutoff_dc = np.percentile(diffs_dc, 5)
cutoff_dr = np.percentile(diffs_dr, 5)
cutoff_cisc = np.percentile(diffs_cisc, 5)
print cutoff_dc, cutoff_dr, cutoff_cisc
diffs_dc, diffs_dr, diffs_cisc = [], [], []
for i in xrange(nsim_power):
X = generate_X(srcX, size)
suppX = list(set(X))
f = map_randomly(suppX, suppfX)
N = generate_additive_N(size)
Y = [f[X[i]] + N[i] for i in xrange(size)]
dc_score = dc(dc_compat(X), dc_compat(Y))
dr_score = dr(X, Y, level)
cisc_score = cisc(X, Y)
diffs_dc.append(abs(dc_score[0] - dc_score[1]))
diffs_dr.append(abs(dr_score[0] - dr_score[1]))
diffs_cisc.append(abs(cisc_score[0] - cisc_score[1]))
power_dc = sum(diffs_dc > cutoff_dc) / nsim_power
power_dr = sum(diffs_dr > cutoff_dr) / nsim_power
power_cisc = sum(diffs_cisc > cutoff_cisc) / nsim_power
print size, power_dc, power_dr, power_cisc
def test_significance():
"""Perform permutation testing just as with swap randomisation"""
size = 1000
nperm = 50000
srcX = "uniform"
suppfX = range(-7, 8)
X = generate_X(srcX, size)
suppX = list(set(X))
f = map_randomly(suppX, suppfX)
N = generate_additive_N(size)
Y = [f[X[i]] + N[i] for i in xrange(size)]
cisc_score = cisc(X, Y)
diff_orig = abs(cisc_score[0] - cisc_score[1])
diff_perms = []
for i in xrange(nperm):
Y_perm = np.random.permutation(Y)
cisc_score = cisc(X, Y_perm)
diff_perm = abs(cisc_score[0] - cisc_score[1])
diff_perms.append(diff_perm)
p_value = sum(diff_orig < diff_perm for diff_perm in diff_perms) / nperm
print diff_orig, sum(diff_orig < diff_perm for diff_perm in diff_perms), p_value
def test_hypercompression():
m = 100
size = 100
alpha = 0.01
suppfX = range(-7, 8)
srcX = "geometric"
fp = open("results/no-hypercompression.dat", "w")
diffs = []
decisions = [] # 1=correct, -1=incorrect, 0=wrong
for i in xrange(m):
X = generate_X(srcX, size)
suppX = list(set(X))
f = map_randomly(suppX, suppfX)
N = generate_additive_N(size)
Y = [f[X[i]] + N[i] for i in xrange(size)]
crisp_score = crisp(X, Y)
diff = abs(crisp_score[0] - crisp_score[1])
if crisp_score[0] < crisp_score[1]:
decision = 1
elif crisp_score[0] > crisp_score[1]:
decision = -1
else:
continue
diffs.append(int(diff))
decisions.append(decision)
sorted_diffs_indices = reverse_argsort(diffs)
diffs = [diffs[idx] for idx in sorted_diffs_indices]
decisions = [decisions[idx] for idx in sorted_diffs_indices]
# flags for coloring
# correct, significant = 1
# correct, insignificant = 2
# incorrect, significant = 3
# incorrect, insignificant = 4
fp.write("sn\tdiff\tsig\tdec\tcolor\n") # header
for k, diff in enumerate(diffs, 1):
log_p_value = -diff
bh_stat = k * alpha / m
log_bh_stat = math.log(bh_stat, 2)
if log_bh_stat < log_p_value:
significant = 0
if decisions[k - 1] == 1:
color = 2
else:
color = 4
else:
significant = 1
if decisions[k - 1] == 1:
color = 1
elif decisions[k - 1] == -1:
color = 3
fp.write("%i\t%d\t%d\t%d\t%d\n" %
(k, diff, significant, decisions[k - 1], color))
# if log_bh_stat < log_p_value:
# # reject: not significant
# fp.write("%i\t%d\t%d\t%d\n" % (k, diff, 0, decisions[k - 1]))
# print k, diff, log_bh_stat, log_p_value, 0, decisions[k - 1]
# else:
# fp.write("%i\t%d\t%d\t%d\n" %
# (k, diff, 1, decisions[k - 1])) # accept: significant
# print k, diff, 1, decisions[k - 1]
# fp.write("%i\t%d\n" % (k, diff))
fp.close()
def test_sample_size():
nsim = 500
level = 0.05
sizes = [100, 500, 1000, 2500, 5000]
suppfX = range(-7, 8)
srcX = "geometric"
fp = open("results/acc-size.dat", "w")
diffs = []
fp.write("%s\t%s\t%s\t%s\t%s\t%s\n" %
("size", "dc", "dr", "cisc", "crispe", "crisp"))
print "%s\t%s\t%s\t%s\t%s\t%s" % ("size", "dc", "dr", "cisc", "crispe", "crisp")
sys.stdout.flush()
# progress(0, len(sizes))
for k, size in enumerate(sizes):
nc_dc, nc_dr, nc_cisc, nc_crispe, nc_crisp = 0, 0, 0, 0, 0
for j in xrange(nsim):
X = generate_X(srcX, size)
suppX = list(set(X))
f = map_randomly(suppX, suppfX)
N = generate_additive_N(size)
Y = [f[X[i]] + N[i] for i in xrange(size)]
dc_score = dc(dc_compat(X), dc_compat(Y))
dr_score = dr(X, Y, level)
cisc_score = cisc(X, Y)
crispe_score = crispe(X, Y)
crisp_score = crisp(X, Y)
nc_dc += int(dc_score[0] < dc_score[1])
nc_dr += int(dr_score[0] > level and dr_score[1] < level)
nc_cisc += int(cisc_score[0] < cisc_score[1])
nc_crispe += int(crispe_score[0] < crispe_score[1])
nc_crisp += int(crisp_score[0] < crisp_score[1])
acc_dc = nc_dc * 100 / nsim
acc_dr = nc_dr * 100 / nsim
acc_cisc = nc_cisc * 100 / nsim
acc_crispe = nc_crispe * 100 / nsim
acc_crisp = nc_crisp * 100 / nsim
print "%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f" % (size, acc_dc, acc_dr, acc_cisc, acc_crispe, acc_crisp)
sys.stdout.flush()
fp.write("%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n" %
(size, acc_dc, acc_dr, acc_cisc, acc_crispe, acc_crisp))
# progress(k + 1, len(sizes))
fp.close()
if __name__ == "__main__":
test_accuracy()