diff --git a/acid.py b/acid.py deleted file mode 100644 index 1332ce3..0000000 --- a/acid.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -from collections import Counter, defaultdict -from copy import copy -import random -import sys -import time - -from entropy import entropy - - -def marginals(X, Y): - Ys = defaultdict(list) - for i, x in enumerate(X): - Ys[x].append(Y[i]) - return Ys - - -def map_to_majority(X, Y): - f = dict() - subgroups_y = defaultdict(list) - - for i, x in enumerate(X): - subgroups_y[x].append(Y[i]) - - for x, subgroup_y in subgroups_y.iteritems(): - freq_y, _ = Counter(subgroup_y).most_common(1)[0] - f[x] = freq_y - - return f - - -def regress(X, Y): - # target Y, feature X - max_iterations = 10000 - hx = entropy(X) - len_dom_y = len(set(Y)) - f = map_to_majority(X, Y) - - supp_x = list(set(X)) - supp_y = list(set(Y)) - - pair = zip(X, Y) - res = [y - f[x] for x, y in pair] - cur_res_codelen = entropy(res) - - j = 0 - minimized = True - while j < max_iterations and minimized: - minimized = False - - for x_to_map in supp_x: - best_res_codelen = sys.float_info.max - best_cand_y = None - - for cand_y in supp_y: - if cand_y == f[x_to_map]: - continue - - res = [y - f[x] if x != x_to_map else y - - cand_y for x, y in pair] - res_codelen = entropy(res) - - if res_codelen < best_res_codelen: - best_res_codelen = res_codelen - best_cand_y = cand_y - - if best_res_codelen < cur_res_codelen: - cur_res_codelen = best_res_codelen - f[x_to_map] = best_cand_y - minimized = True - j += 1 - return hx + cur_res_codelen - - -def acid(X, Y): - hxtoy = regress(X, Y) - hytox = regress(Y, X) - return (hxtoy, hytox) - - -if __name__ == "__main__": - from test_benchmark import load_pair - X, Y = load_pair(99) - print acid(X, Y) diff --git a/anm.py b/anm.py new file mode 100644 index 0000000..70052b3 --- /dev/null +++ b/anm.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""This module implements the paper titled `Accurate Causal Inference on +Discrete Data`. We can also compute the total information content in the +sample by encoding the function and using the stochastic complexity on top of +regression model. For more detail, please refer to the manuscript at +http://people.mpi-inf.mpg.de/~kbudhath/manuscript/acid.pdf +""" +from collections import Counter +from math import log +import sys + +from formatter import stratify +from measures import DependenceMeasure, DMType + + +def choose(n, k): + """Computes the binomial coefficient `n choose k`. + """ + if 0 <= k <= n: + ntok = 1 + ktok = 1 + for t in range(1, min(k, n - k) + 1): + ntok *= n + ktok *= t + n -= 1 + return ntok // ktok + else: + return 0 + + +def univ_enc(n): + """Computes the universal code length of the given integer. + Reference: J. Rissanen. A Universal Prior for Integers and Estimation by + Minimum Description Length. Annals of Statistics 11(2) pp.416-431, 1983. + """ + ucl = log(2.86504, 2) + previous = n + while True: + previous = log(previous, 2) + if previous < 1.0: + break + ucl += previous + return ucl + + +def encode_func(f): + """Encodes the function by enumerating the set of all possible functions. 
+
+    Args:
+        f (dict): function to encode, given as a map from values in its
+            domain to values in its image
+
+    Returns:
+        (float): encoded size of the function
+    """
+    # nones = len(set(f.values()))
+    # return univ_enc(nones) + log(choose(ndom * nimg, nones), 2)
+    ndom = len(f.keys())
+    nimg = len(set(f.values()))
+    return univ_enc(ndom) + univ_enc(nimg) + log(ndom ** nimg, 2)
+
+
+def map_to_majority(X, Y):
+    """Maps each x to the y that most frequently co-occurs with it.
+
+    Args:
+        X (sequence): sequence of discrete outcomes
+        Y (sequence): sequence of discrete outcomes
+
+    Returns:
+        (dict): map from X-values to most frequently co-occurring Y-values
+    """
+    f = dict()
+    Y_grps = stratify(X, Y)
+    for x, Ys in Y_grps.items():
+        frequent_y, _ = Counter(Ys).most_common(1)[0]
+        f[x] = frequent_y
+    return f
+
+
+def regress(X, Y, dep_measure, max_niterations, enc_func=False):
+    """Performs discrete regression with Y as the dependent variable and X as
+    the independent variable.
+
+    Args:
+        X (sequence): sequence of discrete outcomes
+        Y (sequence): sequence of discrete outcomes
+        dep_measure (DependenceMeasure): subclass of DependenceMeasure
+        max_niterations (int): maximum number of iterations
+        enc_func (bool): whether to encode the function or not
+
+    Returns:
+        (float): p-value (or information content) after fitting ANM from X->Y
+    """
+    # todo: make it work with chi-squared test of independence or G^2 test
+    supp_X = list(set(X))
+    supp_Y = list(set(Y))
+    f = map_to_majority(X, Y)
+
+    pair = list(zip(X, Y))
+    res = [y - f[x] for x, y in pair]
+    cur_res_inf = dep_measure.measure(res, X)
+
+    j = 0
+    minimized = True
+    while j < max_niterations and minimized:
+        minimized = False
+
+        for x_to_map in supp_X:
+            best_res_inf = sys.float_info.max
+            best_y = None
+
+            for cand_y in supp_Y:
+                if cand_y == f[x_to_map]:
+                    continue
+
+                res = [y - f[x] if x != x_to_map else y -
+                       cand_y for x, y in pair]
+                res_inf = dep_measure.measure(res, X)
+
+                if res_inf < best_res_inf:
+                    best_res_inf = res_inf
+                    best_y = cand_y
+
+            if best_res_inf < cur_res_inf:
+                cur_res_inf = best_res_inf
+                f[x_to_map] = best_y
+                minimized = True
+        j += 1
+
+    if dep_measure.type == DMType.INFO and not enc_func:
+        return dep_measure.measure(X) + cur_res_inf
+    elif dep_measure.type == DMType.INFO and enc_func:
+        return dep_measure.measure(X) + encode_func(f) + cur_res_inf
+    else:
+        _, p_value = dep_measure.nhst([y - f[x] for x, y in pair], X)
+        return p_value
+
+
+def anm(X, Y, dep_measure, max_niterations=1000, enc_func=False):
+    """Fits the Additive Noise Model from X to Y and vice versa.
+
+    Args:
+        X (sequence): sequence of discrete outcomes
+        Y (sequence): sequence of discrete outcomes
+        dep_measure (DependenceMeasure): subclass of DependenceMeasure
+        max_niterations (int): maximum number of iterations
+        enc_func (bool): whether to encode the function or not
+
+    Returns:
+        (float, float): p-value (or information content) after fitting ANM
+        from X->Y and vice versa.
+ """ + assert issubclass(dep_measure, DependenceMeasure), "dependence measure "\ + "must be a subclass of DependenceMeasure abstract class" + xtoy = regress(X, Y, dep_measure, max_niterations, enc_func) + ytox = regress(Y, X, dep_measure, max_niterations, enc_func) + return (xtoy, ytox) + + +if __name__ == "__main__": + import numpy as np + from measures import Entropy, StochasticComplexity, ChiSquaredTest + + X = np.random.choice([1, 2, 4, -1], 1000) + Y = np.random.choice([-2, -1, 0, 1, 2], 1000) + + print(anm(X, Y, Entropy)) + print(anm(X, Y, StochasticComplexity)) + print(anm(X, Y, StochasticComplexity, enc_func=True)) + print(anm(X, Y, ChiSquaredTest)) diff --git a/cisc.py b/cisc.py index efec3ba..7d40e79 100644 --- a/cisc.py +++ b/cisc.py @@ -1,59 +1,58 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- - -"""Causal inference on discrete data using stochastic complexity of multinomial. +"""This module implements the paper titled `MDL for Causal Inference on +Discrete Data`. For more detail, please refer to the manuscript at +http://people.mpi-inf.mpg.de/~kbudhath/manuscript/cisc.pdf """ -from math import log +from formatter import stratify +from sc import sc + -from collections import defaultdict -from sc import stochastic_complexity +def cisc(X, Y, plain=False): + """Computes the total stochastic complexity from X to Y and vice versa. + Args: + X (sequence): sequence of discrete outcomes + Y (sequence): sequence of discrete outcomes + plain (bool): whether to compute the plain conditional stochastic + complexity or not. If not provided, we compute the weighted one. -def marginals(X, Y): - Ys = defaultdict(list) - for i, x in enumerate(X): - Ys[x].append(Y[i]) - return Ys + Returns: + (float, float): the total multinomial stochastic complexity of X and Y + in the direction from X to Y, and vice versa. 
+ """ + assert len(X) == len(Y) + n = len(X) -def cisc(X, Y): - scX = stochastic_complexity(X) - scY = stochastic_complexity(Y) + scX = sc(X) + scY = sc(Y) - mYgX = marginals(X, Y) - mXgY = marginals(Y, X) + YgX = stratify(X, Y) + XgY = stratify(Y, X) - domX = mYgX.keys() - domY = mXgY.keys() + domX = YgX.keys() + domY = XgY.keys() - # plain one - # scYgX = sum(stochastic_complexity(Z, len(domY)) for Z in mYgX.itervalues()) - # scXgY = sum(stochastic_complexity(Z, len(domX)) for Z in mXgY.itervalues()) + ndomX = len(domX) + ndomY = len(domY) - # weighted one - scYgX = sum((len(Z) * 1.0) / len(X) * stochastic_complexity(Z, len(domY)) - for Z in mYgX.itervalues()) - scXgY = sum((len(Z) * 1.0) / len(X) * stochastic_complexity(Z, len(domX)) - for Z in mXgY.itervalues()) + if plain: + scYgX = sum(sc(Yp, ndomY) for Yp in YgX.values()) + scXgY = sum(sc(Xp, ndomX) for Xp in XgY.values()) + else: + scYgX = sum(len(Yp) / n * sc(Yp, ndomY) for Yp in YgX.values()) + scXgY = sum(len(Xp) / n * sc(Xp, ndomX) for Xp in XgY.values()) ciscXtoY = scX + scYgX ciscYtoX = scY + scXgY - # print "X=%.2f Ygx=%.2f" % (scX, scYgX) - # print "Y=%.2f XgY=%.2f" % (scY, scXgY) return (ciscXtoY, ciscYtoX) if __name__ == "__main__": import random - from test_synthetic import map_randomly - n = 1000 - Xd = range(1, 4) - fXd = range(1, 4) - f = map_randomly(Xd, fXd) - N = range(-2, 3) - - X = [random.choice(Xd) for i in xrange(n)] - Y = [f[X[i]] + random.choice(N) for i in xrange(n)] - - print cisc(X, Y) + n = 100 + X = [random.randint(0, 10) for i in range(n)] + Y = [random.randint(0, 10) for i in range(n)] + print(cisc(X, Y)) diff --git a/dc.py b/dc.py index 00ac6d6..0cd92be 100644 --- a/dc.py +++ b/dc.py @@ -1,25 +1,43 @@ -#!/usr/bin/env python -"""Implementation of Causal Inference on Discrete Data via Estimating -Distance Correlations. -Link: http://www.mitpressjournals.org/doi/pdf/10.1162/NECO_a_00820 +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""This module implements the work on `Causal Inference on Discrete Data via +Estimating Distance Correlations`. For more detail, please refer to the +manuscript at http://www.mitpressjournals.org/doi/pdf/10.1162/NECO_a_00820 """ -from collections import defaultdict -from math import sqrt - import numpy as np -from scipy.spatial.distance import pdist, squareform from dcor import dcor def dc(X, Y): - prob_X, marg_X, prob_Y, marg_Y = distributions(X, Y) - dXtoY = dcor(prob_X, marg_Y) - dYtoX = dcor(prob_Y, marg_X) + """Computes dCorr(P(X), P(Y|X)), and dCorr(P(Y), P(X|Y)). + + Args: + X (nested sequence): nested sequence of discrete outcomes + Y (nested sequence): nested sequence of discrete outcomes + + Returns: + (float, float): (dCorr(P(X), P(Y|X)), dCorr(P(Y), P(X|Y))) + """ + assert len(X) == len(Y) + marg_X, cond_X, marg_Y, cond_Y = distributions(X, Y) + dXtoY = dcor(marg_X, cond_Y)[0] + dYtoX = dcor(marg_Y, cond_X)[0] return (dXtoY, dYtoX) def distributions(X, Y): + """Computes empirical marginal and conditional distributions of X and Y. + + Args: + X (nested sequence): nested sequence of discrete outcomes + Y (nested sequence): nested sequence of discrete outcomes + + Returns: + (sequence, sequence, sequence, sequence): (P(X), P(X|Y), P(Y), P(Y|X)). + If X has L unique values, and Y has M unique values. The dimension are + as follows: P(X)=Lx1, P(Y|X)=LxM, P(Y)=Mx1, and P(X|Y)=MxL. 
+ """ N = len(X) unq_X = set(map(tuple, X)) unq_Y = set(map(tuple, Y)) @@ -35,18 +53,17 @@ def distributions(X, Y): freq_X = np.sum(freq_XY, axis=1)[np.newaxis] freq_Y = np.sum(freq_XY, axis=0)[np.newaxis] - prob_X = (freq_X / np.sum(freq_X)).transpose() - prob_Y = (freq_Y / np.sum(freq_Y)).transpose() + marg_X = (freq_X / np.sum(freq_X)).transpose() + marg_Y = (freq_Y / np.sum(freq_Y)).transpose() freqs_X = np.tile(freq_X.transpose(), (1, len(unq_Y))) freqs_Y = np.tile(freq_Y, (len(unq_X), 1)) - marg_X = (freq_XY / freqs_X).transpose() - marg_Y = freq_XY / freqs_Y - return prob_X, marg_X, prob_Y, marg_Y + cond_X = (freq_XY / freqs_X).transpose() + cond_Y = (freq_XY / freqs_Y) + return marg_X, cond_X, marg_Y, cond_Y if __name__ == "__main__": - X = [[2, 3], [2, 3], [2, 4], [2], [2], [3], [3], [3, 4], [2, 3]] - Y = [[1], [1], [1], [1], [0], [0], [0], [0], [1]] - - print dc(X, Y) + X = [[2, 3], [2, 3], [2, 4], [2], [2], [3], [3], [3, 4], [2, 3], [2]] + Y = [[1], [1], [1], [1], [1], [0], [0], [0], [0], [0]] + print(dc(X, Y)) diff --git a/dcor.py b/dcor.py index 87320ba..0c27c6b 100644 --- a/dcor.py +++ b/dcor.py @@ -1,16 +1,21 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- - -"""Computes the distance correlation between two matrices. -https://en.wikipedia.org/wiki/Distance_correlation +"""Computes the distance correlation between two matrices. For more detail, +please refer to https://en.wikipedia.org/wiki/Distance_correlation """ - import numpy as np from scipy.spatial.distance import pdist, squareform def dcov(X, Y): """Computes the distance covariance between matrices X and Y. + + Args: + X (np.ndarray): multidimensional array of numbers + Y (np.ndaaray): multidimensional array of numbers + + Returns: + (float): the distance covariance between X and Y """ n = X.shape[0] XY = np.multiply(X, Y) @@ -20,13 +25,25 @@ def dcov(X, Y): def dvar(X): """Computes the distance variance of a matrix X. + + Args: + X (np.ndarray): multidimensional array of numbers + + Returns: + (float): the distance variance of X """ return np.sqrt(np.sum(X ** 2 / X.shape[0] ** 2)) def cent_dist(X): - """Computes the pairwise euclidean distance between rows of X and centers - each cell of the distance matrix with row mean, column mean, and grand mean. + """Computes pairwise euclidean distance between rows of X and centers each + cell of the distance matrix with row mean, column mean, and grand mean. 
+ + Args: + X (np.ndarray): multidimensional array of numbers + + Returns: + (np.ndarray): doubly centered distance matrix of X """ M = squareform(pdist(X)) # distance matrix rmean = M.mean(axis=1) @@ -48,6 +65,14 @@ def dcor(X, Y): >>> Y = np.matrix('1;2;9;4;4') >>> dcor(X, Y) 0.76267624241686649 + + Args: + X (np.ndarray): multidimensional array of numbers + Y (np.ndarray): multidimensional array of numbers + + Returns: + (float, float, float, float): (dCorr(X, Y), dCov(X, Y), dVar(X), + dVar(Y)) """ assert X.shape[0] == Y.shape[0] @@ -62,10 +87,11 @@ def dcor(X, Y): if dvar_A > 0.0 and dvar_B > 0.0: dcor = dcov_AB / np.sqrt(dvar_A * dvar_B) - return dcor + return dcor, dcov_AB, dvar_A, dvar_B if __name__ == "__main__": X = np.matrix('1;2;3;4;5') Y = np.matrix('1;2;9;4;4') - print dcor(X, Y) + print(dcor(X, Y)) + # print(dcor(np.matrix('1 7 3; 8 2 9; 1 2 7'), np.matrix('9 6; 2 3; 1 8'))) diff --git a/dr/dr.py b/dr/dr.py index 37832cd..fbfa885 100644 --- a/dr/dr.py +++ b/dr/dr.py @@ -3,20 +3,20 @@ """Python wrapper for the matlab implementation of Causal Inference on Discrete Data using Additive Noise Models. """ import os -import StringIO +import io import sys import matlab.engine -print "loading matlab engine ... ", +print("loading matlab engine ... "), sys.stdout.flush() cur_dir = os.path.dirname(__file__) mengine = matlab.engine.start_matlab() mengine.addpath(cur_dir) -out = StringIO.StringIO() -err = StringIO.StringIO() -print "\b[✓]" +out = io.StringIO() +err = io.StringIO() +print("\b[✓]") sys.stdout.flush() @@ -29,6 +29,6 @@ def dr(X, Y, level): if __name__ == "__main__": import random - X = [random.randint(1, 4) for i in xrange(1000)] - Y = [X[i] + random.randint(1, 3) for i in xrange(1000)] - print dr(X, Y, 0.05) + X = [random.randint(1, 4) for i in range(1000)] + Y = [X[i] + random.randint(1, 3) for i in range(1000)] + print(dr(X, Y, 0.05)) diff --git a/entropic.py b/entropic.py index 7670067..191eeda 100644 --- a/entropic.py +++ b/entropic.py @@ -1,33 +1,9 @@ -# Entropic Causality -# Input: A text file with two columns, no headers. First column is called X, second column is called Y. -# If no argument is given, program prompts the user to enter the name of a text file. -# User can also give the name of the input file as an argument. - -# Algorithm determines the quantization based on the number of unique values and number of samples. -# Estimates the conditional probability tables and calculates exogenous variable with minimum entropy in both X->Y direction and X<-Y direction. -# The score is an indicator of how confident the algorithm is. Larger -# scores indicate more confidence. Scores are greater than 0. - -# INSTRUCTIONS: Put this file in the same folder as a text file input.txt -# which has two columns of data - -import sys -import pandas as pd +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""The module implelements the "Entropic Causal Inference" paper. +""" import numpy as np -import matplotlib.pyplot as plt -from sklearn import svm -# from sklearn.ensemble import IsolationForest -from numpy.linalg import norm - - -def read_file(file_name): - df = pd.read_csv(file_name, header=None, delim_whitespace=True) - - # Following is a test to check if random numbers give a causal direction - #df = pd.DataFrame(np.random.randn(500,2)) - return df - class CausalPair(object): @@ -47,12 +23,13 @@ def quantize_data(pair): # We simply select n as large as possible and make sure on average each # state has around 10 samples. 
p = pair.nofsamples - # We also need the number of unique values to make sure we know when inputs are discrete variables + # We also need the number of unique values to make sure we know + # when inputs are discrete variables # A variable is deemed discrete if noofunique values is less than 1/5th of # number of samples uniqueX = pair.X.unique() uniqueY = pair.Y.unique() - n = decide_quantization(p, uniqueX, uniqueY) + n = int(decide_quantization(p, uniqueX, uniqueY)) deltaX = (pair.Xmax - pair.Xmin) / n rulerX = [pair.Xmin + i * deltaX for i in range(0, n - 1)] rulerX.append(pair.Xmax) @@ -74,8 +51,7 @@ def decide_quantization(p, uniqueX, uniqueY): if 5 * noUniqueY < p: discreteY = 1 # 256 is chosen as the upper limit on the number of states - n = np.min([256, discreteX * discreteY * np.max([noUniqueX, noUniqueY]) + discreteX * (1 - discreteY) * np.max([noUniqueX, - p / 10]) + discreteY * (1 - discreteX) * np.max([noUniqueY, p / 10]) + (1 - discreteY) * (1 - discreteX) * p / 10]) + n = np.min([256, discreteX*discreteY*np.max([noUniqueX,noUniqueY]) + discreteX*(1-discreteY)*np.max([noUniqueX, p/10]) + discreteY*(1-discreteX)*np.max([noUniqueY, p/10]) + (1-discreteY)*(1-discreteX)*p/10]) return n @@ -174,24 +150,4 @@ def entropic(df): hX = calc_entropy(pX) hY = calc_entropy(pY) - # if abs(hYX + hX - (hXY + hY)) < 0.02 * (hYX + hX + hXY + hY) * 0.25: - # print "Test Indecisive: Entropy values are too close" - # return 0 - # elif hYX + hX - (hXY + hY) > 0: - # print "Y -> X, Score: %f" % (abs(hYX + hX - (hXY + hY)) / ((hYX + hX + hXY + hY) * 0.25) - 0.02) - # return -1 - # elif hYX + hX - (hXY + hY) < 0: - # print "X -> Y, Score: %f" % (abs(hYX + hX - (hXY + hY)) / ((hYX + hX + hXY + hY) * 0.25) - 0.02) - # return 1 return hX + hYX, hY + hXY - - -if __name__ == '__main__': - #file_name = raw_input("Name of data file:") - if len(sys.argv) < 2: - file_name = raw_input("Name of data file:") - entropic(file_name) - elif len(sys.argv) == 2: - entropic(sys.argv[1]) - else: - print "Too many input arguments!" diff --git a/entropy.py b/entropy.py index 27352fa..b205083 100644 --- a/entropy.py +++ b/entropy.py @@ -1,21 +1,29 @@ -#!/usr/bin/env python -"""Computes entropy of a sequence of messages +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""This module implements the Shannon entropy of a discrete sequence. """ -from __future__ import division from collections import Counter from math import log -def entropy(sequence): +def entropy(X): + """Compute the entropy of a message. + + Args: + X (sequence): a sequence of discrete outcomes + + Returns: + (float): the entropy of X + """ res = 0 - n = len(sequence) - counts = Counter(sequence).values() + n = len(X) + counts = Counter(X).values() for count in counts: res -= (count / n) * (log(count, 2) - log(n, 2)) return res if __name__ == "__main__": - print entropy([1, 2, 1, 1, 1, 1]) - print entropy([1, 1, 1, 1, 1, 1]) - print entropy([1, 1, 1, 2, 2, 2]) + print(entropy([1, 2, 1, 1, 1, 1])) + print(entropy([1, 1, 1, 1, 1, 1])) + print(entropy([1, 1, 1, 2, 2, 2])) diff --git a/formatter.py b/formatter.py new file mode 100644 index 0000000..f79c681 --- /dev/null +++ b/formatter.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""This module provides common methods for manipulating data""" +from collections import defaultdict + + +def stratify(X, Y): + """Stratifies Y based on unique values of X. 
+
+    Args:
+        X (sequence): sequence of discrete outcomes
+        Y (sequence): sequence of discrete outcomes
+
+    Returns:
+        (dict): map from each X-value to the list of co-occurring Y-values
+    """
+    Y_grps = defaultdict(list)
+    for i, x in enumerate(X):
+        Y_grps[x].append(Y[i])
+    return Y_grps
+
+
+def to_nested(X):
+    """Converts the given sequence to a nested sequence.
+
+    Args:
+        X (sequence): sequence of discrete outcomes
+
+    Returns:
+        (nested sequence): nested sequence of X
+    """
+    return [[x] for x in X]
diff --git a/measures.py b/measures.py
new file mode 100644
index 0000000..6be7151
--- /dev/null
+++ b/measures.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""This module abstracts dependence measures."""
+import abc
+from enum import Enum
+
+import numpy as np
+from scipy.stats import chi2_contingency
+
+from entropy import entropy
+from sc import sc
+
+
+class DMType(Enum):
+    NHST = 1  # Null Hypothesis Significance Testing
+    INFO = 2  # Information-theoretic
+
+
+class DependenceMeasure(abc.ABC):
+
+    @property
+    @abc.abstractmethod
+    def type(self):
+        pass
+
+    @staticmethod
+    @abc.abstractmethod
+    def measure(seq1, seq2=None):
+        pass
+
+
+class Entropy(DependenceMeasure):
+    type = DMType.INFO
+
+    def measure(seq1, seq2=None):
+        return entropy(seq1)
+
+
+class StochasticComplexity(DependenceMeasure):
+    type = DMType.INFO
+
+    def measure(seq1, seq2=None):
+        return sc(seq1)
+
+
+class ChiSquaredTest(DependenceMeasure):
+    type = DMType.NHST
+
+    @staticmethod
+    def contingency_table(seq1, seq2):
+        dom_seq1 = list(set(seq1))
+        dom_seq2 = list(set(seq2))
+
+        ndom_seq1 = len(dom_seq1)
+        ndom_seq2 = len(dom_seq2)
+
+        indices1 = dict(zip(dom_seq1, range(ndom_seq1)))
+        indices2 = dict(zip(dom_seq2, range(ndom_seq2)))
+
+        table = np.zeros((ndom_seq1, ndom_seq2))
+
+        for k, v1 in enumerate(seq1):
+            v2 = seq2[k]
+            i, j = indices1[v1], indices2[v2]
+            table[i, j] += 1
+
+        return table
+
+    @staticmethod
+    def nhst(seq1, seq2):
+        assert len(seq1) == len(seq2), "samples are not of the same size"
+
+        table = ChiSquaredTest.contingency_table(seq1, seq2)
+        chi2, p_value, _, _ = chi2_contingency(table, correction=True)
+        return chi2, p_value
+
+    @staticmethod
+    def measure(seq1, seq2=None):
+        chi2, p_value = ChiSquaredTest.nhst(seq1, seq2)
+
+        # we want to minimise the dependence between seq1 and seq2 in ANM
+        # that is, maximise the independence between seq1 and seq2 in ANM
+        # H0: seq1 and seq2 are independent
+        # H0 is retained if the p-value is greater than the threshold
+        # thus we want to maximise the p-value, i.e. minimise its negative
+        score = -p_value
+
+        # as chi2 gets smaller, the p-value increases
+        # if the p-value is too small, we reject H0 anyway
+        # in such a case, we fall back to minimising chi2 itself
+        if p_value < 10 ** -16:
+            score = chi2
+        return score
+
+
+if __name__ == "__main__":
+    print(ChiSquaredTest.measure(np.random.choice(
+        [1, 2, 3], 10), np.random.choice([1, 2], 10)))
+    print(Entropy.measure(np.random.choice([1, 2, 3], 10)))
+    print(StochasticComplexity.measure(np.random.choice([1, 2, 3], 10)))
diff --git a/sc.py b/sc.py
index 949eb59..fbb1b36 100644
--- a/sc.py
+++ b/sc.py
@@ -1,32 +1,34 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-
-"""Computes the stochastic complexity of multinomial distribution.
-http://www.sciencedirect.com/science/article/pii/S0020019007000944 +"""This module implements the linear algorithm for computing the stochastic +complexity of a discrete sequence relative to a parametric family of +multinomial distributions. For more detail, please refer to +http://pgm08.cs.aau.dk/Papers/31_Paper.pdf """ from __future__ import division from collections import Counter -from decimal import Decimal -from math import ceil, e, factorial, log, pi, sqrt - -from scipy.special import gammaln +from math import ceil, log, sqrt -log2 = lambda n: log(n or 1, 2) -fact = lambda n: factorial(n) +def log2(n): + return log(n or 1, 2) -def composition(n): - half = int(n / 2) - for i in range(half + 1): - r1, r2 = i, n - i - yield r1, r2 +def model_cost(ndistinct_vals, n): + """Computes the logarithm of the normalising term of multinomial + stochastic complexity. + Args: + ndistinct_vals (int): number of distinct values of a multinomial r.v. + n (int): number of trials -def multinomial_with_recurrence(L, n): + Returns: + float: the model cost of the parametric family of multinomials + """ total = 1.0 b = 1.0 d = 10 + bound = int(ceil(2 + sqrt(2 * n * d * log(10)))) # using equation (38) for k in range(1, bound + 1): b = (n - k + 1) / n * b @@ -35,7 +37,7 @@ def multinomial_with_recurrence(L, n): log_old_sum = log2(1.0) log_total = log2(total) log_n = log2(n) - for j in range(3, L + 1): + for j in range(3, ndistinct_vals + 1): log_x = log_n + log_old_sum - log_total - log2(j - 2) x = 2 ** log_x # log_one_plus_x = (x + 8 * x / (2 + x) + x / (1 + x)) / 6 @@ -46,57 +48,32 @@ def multinomial_with_recurrence(L, n): log_old_sum = log_total log_total = log_new_sum # print log_total, - if L == 1: + + if ndistinct_vals == 1: log_total = log2(1.0) + return log_total -def stochastic_complexity(X, L=None): - freqs = Counter(X) - n = len(X) - L = L or len(freqs) - loglikelihood = 0.0 - for freq in freqs.itervalues(): - loglikelihood += freq * (log2(n) - log2(freq)) - log_normalising_term = multinomial_with_recurrence(L, n) - sc = loglikelihood + log_normalising_term - return sc +def sc(X, ndistinct_vals=None): + """Computes the stochastic complexity of a discrete sequence. + Args: + X (sequence): sequence of discrete outcomes + ndistinct_vals (int): number of distinct values of the multinomial + r.v. X. If not provided, we take it directly from X. 
-def stochastic_complexity_slow(X): + Returns: + float: the multinomial stochastic complexity of X + """ freqs = Counter(X) - n, K = len(X), len(freqs) - likelihood = 1.0 - for freq in freqs.itervalues(): - likelihood *= (freq / n) ** freq - - prev1 = 1.0 - prev2 = 0.0 - n_fact = fact(n) - for r1, r2 in composition(n): - prev2 += (n_fact / (fact(r1) * fact(r2))) * \ - ((r1 / n) ** r1) * ((r2 / n) ** r2) - - for k in range(1, K - 1): - temp = prev2 + n * prev1 / k - prev1, prev2 = prev2, temp - - loglikelihood = 0.0 - for freq in freqs.itervalues(): - loglikelihood += freq * (log2(n) - log2(freq)) - - sc = loglikelihood + log2(prev2) - return sc - - -def approx_normalizing_term(m, n): - res = ((m - 1) / 2) * log(n / (2 * pi)) + \ - (m / 2) * log(pi) - gammaln(m / 2) - res = res * log(e, 2) - return res + n = len(X) + ndistinct_vals = ndistinct_vals or len(freqs) + data_cost = 0.0 + for freq in freqs.values(): + data_cost += freq * (log2(n) - log2(freq)) + return data_cost + model_cost(ndistinct_vals, n) if __name__ == "__main__": - L = 10 - for n in xrange(10000, 11000): - print multinomial_with_recurrence(L, n), approx_normalizing_term(L, n) + print(sc([1, 2, 3, 2, 1, 2])) diff --git a/test_real.py b/test_real.py index ce90f9d..95b721e 100644 --- a/test_real.py +++ b/test_real.py @@ -1,27 +1,50 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- - -"""Tests on real-world data. +"""This module assess the performance of various discrete causal inference +methods on real-world discrete cause-effect pairs. """ from __future__ import division -import csv +import enum import os import numpy as np import pandas as pd +from anm import anm from cisc import cisc -from crisp import crisp -from entropic import entropic -from acid import acid -from dr import dr from dc import dc -from utils import dc_compat +from entropic import entropic +from formatter import to_nested +from measures import ChiSquaredTest, Entropy, StochasticComplexity -def test_car(): - print "testing car dataset" +class ScoreType(enum.Enum): + PVAL = 1, + INFO = 2 + + +def pprint(method_name, score, score_type, llabel, rlabel): level = 0.05 + arrow = "~" + if score_type == ScoreType.INFO: + if score[0] < score[1]: + arrow = "⇒" + elif score[0] > score[1]: + arrow = "⇐" + conf = abs(score[0] - score[1]) + out_str = "%s:: %s %s %s\t Δ=%.2f" % (method_name, + llabel, arrow, rlabel, conf) + elif score_type == ScoreType.PVAL: + if score[0] > level and score[1] < level: + arrow = "⇒" + elif score[0] < level and score[1] > level: + arrow = "⇐" + out_str = "%s:: %s %s %s" % (method_name, llabel, arrow, rlabel) + print(out_str) + + +def test_car(): + print("evaluating on car dataset") car_dir = os.path.join(os.path.dirname(__file__), "data", "car") car_dat_path = os.path.join(car_dir, "car.dat") @@ -31,215 +54,75 @@ def test_car(): data = np.loadtxt(car_dat_path) nattr = data.shape[1] Y = data[:, nattr - 1] - for i in xrange(nattr - 2, nattr - 1): + for i in range(nattr - 2, nattr - 1): X = data[:, i] + dc_score = dc(to_nested(X), to_nested(Y)) + ent_score = entropic(pd.DataFrame(np.column_stack((X, Y)))) + dr_score = anm(X, Y, ChiSquaredTest) cisc_score = cisc(X, Y) - acid_score = acid(X, Y) - dr_score = dr(X.tolist(), Y.tolist(), level) - dc_score = dc(dc_compat(X), dc_compat(Y)) - entropic_score = entropic(pd.DataFrame(np.column_stack((X, Y)))) - - print "CISC::", - if cisc_score[0] < cisc_score[1]: - print "%s ⇒ %s" % (X_labels[i], Y_label), - elif cisc_score[0] > cisc_score[1]: - print "%s ⇐ %s" % (X_labels[i], Y_label), - else: - print 
"%s ~ %s" % (X_labels[i], Y_label), - print abs(cisc_score[0] - cisc_score[1]), - print - - print "ACID::", - if acid_score[0] < acid_score[1]: - print "%s ⇒ %s" % (X_labels[i], Y_label), - elif acid_score[0] > acid_score[1]: - print "%s ⇐ %s" % (X_labels[i], Y_label), - else: - print "%s ~ %s" % (X_labels[i], Y_label), - print acid_score, - print - - print "DC::", - if dc_score[0] < dc_score[1]: - print "%s ⇒ %s" % (X_labels[i], Y_label), - elif dc_score[0] > dc_score[1]: - print "%s ⇐ %s" % (X_labels[i], Y_label), - else: - print "%s ~ %s" % (X_labels[i], Y_label), - print + acid_score = anm(X, Y, Entropy) + crisp_score = anm(X, Y, StochasticComplexity, enc_func=True) - print "DR::", - if dr_score[0] > level and dr_score[1] < level: - print "%s ⇒ %s" % (X_labels[i], Y_label), - elif dr_score[0] < level and dr_score[1] > level: - print "%s ⇐ %s" % (X_labels[i], Y_label), - else: - print "%s ~ %s" % (X_labels[i], Y_label), - print - - print "ENTROPIC::", - if entropic_score[0] < entropic_score[1]: - print "%s ⇒ %s" % (X_labels[i], Y_label), - elif acid_score[0] > acid_score[1]: - print "%s ⇐ %s" % (X_labels[i], Y_label), - else: - print "%s ~ %s" % (X_labels[i], Y_label), - print - print + pprint(" dc", dc_score, ScoreType.INFO, X_labels[i], Y_label) + pprint(" ent", ent_score, ScoreType.INFO, X_labels[i], Y_label) + pprint(" dr", dr_score, ScoreType.PVAL, X_labels[i], Y_label) + pprint(" cisc", cisc_score, ScoreType.INFO, X_labels[i], Y_label) + pprint(" acid", acid_score, ScoreType.INFO, X_labels[i], Y_label) + pprint("crisp", crisp_score, ScoreType.INFO, X_labels[i], Y_label) def test_abalone(): # We do not discretise the variables just as what DR paper does - print "testing cisc on abalone dataset" - level = 0.05 + print("evaluating on abalone dataset") abalone_dir = os.path.join(os.path.dirname(__file__), "data", "abalone") abalone_dat_path = os.path.join(abalone_dir, "abalone.dat") data = np.loadtxt(abalone_dat_path) ncols = data.shape[1] - sex = data[:, 0] + X = data[:, 0] - colnames = ["Sex", "Length", "Diameter", "Height"] - for i in xrange(1, ncols): + colnames = ["sex", "length", "diameter", "height"] + for i in range(1, ncols): Y = data[:, i] - cisc_score = cisc(sex, Y) - crisp_score = crisp(sex, Y) - acid_score = acid(sex, Y) - dr_score = dr(sex.tolist(), Y.tolist(), level) - dc_score = dc(dc_compat(sex), dc_compat(Y)) - entropic_score = entropic(pd.DataFrame(np.column_stack((sex, Y)))) - - print "CISC::", - if cisc_score[0] < cisc_score[1]: - print "%s ⇒ %s" % ("Sex", colnames[i]), - elif cisc_score[0] > cisc_score[1]: - print "%s ⇐ %s" % ("Sex", colnames[i]), - else: - print "%s ~ %s" % ("Sex", colnames[i]), - print - - print "DC::", - if dc_score[0] < dc_score[1]: - print "%s ⇒ %s" % ("Sex", colnames[i]), - elif dc_score[0] > dc_score[1]: - print "%s ⇐ %s" % ("Sex", colnames[i]), - else: - print "%s ~ %s" % ("Sex", colnames[i]), - print - - print "DR::", - if dr_score[0] > level and dr_score[1] < level: - print "%s ⇒ %s" % ("Sex", colnames[i]), - elif dr_score[0] < level and dr_score[1] > level: - print "%s ⇐ %s" % ("Sex", colnames[i]), - else: - print "%s ~ %s" % ("Sex", colnames[i]), - print - - print "CRISP::", - if crisp_score[0] < crisp_score[1]: - print "%s ⇒ %s" % ("Sex", colnames[i]), - elif crisp_score[0] > crisp_score[1]: - print "%s ⇐ %s" % ("Sex", colnames[i]), - else: - print "%s ~ %s" % ("Sex", colnames[i]), - print - - print "ACID::", - if acid_score[0] < acid_score[1]: - print "%s ⇒ %s" % ("Sex", colnames[i]), - elif acid_score[0] > acid_score[1]: - print 
"%s ⇐ %s" % ("Sex", colnames[i]), - else: - print "%s ~ %s" % ("Sex", colnames[i]), - print data.shape[0] * acid_score[0], data.shape[0] * acid_score[1], - print + dc_score = dc(to_nested(X), to_nested(Y)) + ent_score = entropic(pd.DataFrame(np.column_stack((X, Y)))) + dr_score = anm(X, Y, ChiSquaredTest) + cisc_score = cisc(X, Y) + acid_score = anm(X, Y, Entropy) + crisp_score = anm(X, Y, StochasticComplexity, enc_func=True) - print "ENTROPIC::", - if entropic_score[0] < entropic_score[1]: - print "%s ⇒ %s" % ("Sex", colnames[i]), - elif acid_score[0] > acid_score[1]: - print "%s ⇐ %s" % ("Sex", colnames[i]), - else: - print "%s ~ %s" % ("Sex", colnames[i]), - print - print + pprint(" dc", dc_score, ScoreType.INFO, colnames[0], colnames[i]) + pprint(" ent", ent_score, ScoreType.INFO, colnames[0], colnames[i]) + pprint(" dr", dr_score, ScoreType.PVAL, colnames[0], colnames[i]) + pprint(" cisc", cisc_score, ScoreType.INFO, colnames[0], colnames[i]) + pprint(" acid", acid_score, ScoreType.INFO, colnames[0], colnames[i]) + pprint("crisp", crisp_score, ScoreType.INFO, colnames[0], colnames[i]) def test_nlschools(): - print "testing nlschools dataset" - level = 0.05 + print("evaluating on nlschools dataset") abalone_dir = os.path.join(os.path.dirname(__file__), "data", "nlschools") abalone_dat_path = os.path.join(abalone_dir, "nlschools.dat") data = np.loadtxt(abalone_dat_path) - score = data[:, 0] - status = data[:, 1] - - cisc_score = cisc(score, status) - crisp_score = crisp(score, status) - acid_score = acid(score, status) - dr_score = dr(score.tolist(), status.tolist(), level) - dc_score = dc(dc_compat(score), dc_compat(status)) - entropic_score = entropic(pd.DataFrame(np.column_stack((score, status)))) - - print "CISC::", - if cisc_score[0] < cisc_score[1]: - print "%s ⇒ %s" % ("score", "status"), - elif cisc_score[0] > cisc_score[1]: - print "%s ⇐ %s" % ("score", "status"), - else: - print "%s ~ %s" % ("score", "status"), - print - - print "ACID::", - if acid_score[0] < acid_score[1]: - print "%s ⇒ %s" % ("score", "status"), - elif acid_score[0] > acid_score[1]: - print "%s ⇐ %s" % ("score", "status"), - else: - print "%s ~ %s" % ("score", "status"), - print acid_score, - print + X = data[:, 0] + Y = data[:, 1] - print "DC::", - if dc_score[0] < dc_score[1]: - print "%s ⇒ %s" % ("score", "status"), - elif dc_score[0] > dc_score[1]: - print "%s ⇐ %s" % ("score", "status"), - else: - print "%s ~ %s" % ("score", "status"), - print + dc_score = dc(to_nested(X), to_nested(Y)) + ent_score = entropic(pd.DataFrame(np.column_stack((X, Y)))) + dr_score = anm(X, Y, ChiSquaredTest) + cisc_score = cisc(X, Y) + acid_score = anm(X, Y, Entropy) + crisp_score = anm(X, Y, StochasticComplexity, enc_func=True) - print "DR::", - if dr_score[0] > level and dr_score[1] < level: - print "%s ⇒ %s" % ("score", "status"), - elif dr_score[0] < level and dr_score[1] > level: - print "%s ⇐ %s" % ("score", "status"), - else: - print "%s ~ %s" % ("score", "status"), - print - - print "CRISP::", - if crisp_score[0] < crisp_score[1]: - print "%s ⇒ %s" % ("score", "status"), - elif crisp_score[0] > crisp_score[1]: - print "%s ⇐ %s" % ("score", "status"), - else: - print "%s ~ %s" % ("score", "status"), - print - - print "ENTROPIC::", - if entropic_score[0] < entropic_score[1]: - print "%s ⇒ %s" % ("score", "status"), - elif acid_score[0] > acid_score[1]: - print "%s ⇐ %s" % ("score", "status"), - else: - print "%s ~ %s" % ("score", "status"), - print + pprint(" dc", dc_score, ScoreType.INFO, "score", "status") + pprint(" 
ent", ent_score, ScoreType.INFO, "score", "status") + pprint(" dr", dr_score, ScoreType.PVAL, "score", "status") + pprint(" cisc", cisc_score, ScoreType.INFO, "score", "status") + pprint(" acid", acid_score, ScoreType.INFO, "score", "status") + pprint("crisp", crisp_score, ScoreType.INFO, "score", "status") def test_horse_colic(): - print "testing on horse colic dataset" - level = 0.05 + print("evaluating on horse colic dataset") horse_dir = os.path.join(os.path.dirname(__file__), "data", "horse") def get_data(fname): @@ -256,51 +139,21 @@ def get_data(fname): abdomen_test, surgical_test = get_data("horse_test.dat") abdomen = np.concatenate((abdomen_train, abdomen_test)) surgical = np.concatenate((surgical_train, surgical_test)) - assert len(abdomen) == len(surgical) - print len(surgical) - cisc_score = cisc(abdomen, surgical) - print "CISC::", - if cisc_score[0] < cisc_score[1]: - print "%s ⇒ %s" % ("abdomen", "surgical"), - elif cisc_score[0] > cisc_score[1]: - print "%s ⇐ %s" % ("abdomen", "surgical"), - else: - print "%s ~ %s" % ("abdomen", "surgical"), - print cisc_score, - print - - dr_score = dr(abdomen.tolist(), surgical.tolist(), level) - print "DR::", - if dr_score[0] > level and dr_score[1] < level: - print "%s ⇒ %s" % ("abdomen", "surgical"), - elif dr_score[0] < level and dr_score[1] > level: - print "%s ⇐ %s" % ("abdomen", "surgical"), - else: - print "%s ~ %s" % ("abdomen", "surgical"), - print - acid_score = acid(abdomen, surgical) - print "ACID::", - if acid_score[0] < acid_score[1]: - print "%s ⇒ %s" % ("abdomen", "surgical"), - elif acid_score[0] > acid_score[1]: - print "%s ⇐ %s" % ("abdomen", "surgical"), - else: - print "%s ~ %s" % ("abdomen", "surgical"), - print len(surgical) * acid_score[0], len(surgical) * acid_score[1] - print - - entropic_score = entropic(pd.DataFrame( - np.column_stack((abdomen, surgical)))) - print "ENTROPIC::", - if entropic_score[0] < entropic_score[1]: - print "%s ⇒ %s" % ("abdomen", "surgical"), - elif acid_score[0] > acid_score[1]: - print "%s ⇐ %s" % ("abdomen", "surgical"), - else: - print "%s ~ %s" % ("abdomen", "surgical"), - print + dc_score = dc(to_nested(abdomen), to_nested(surgical)) + ent_score = entropic(pd.DataFrame(np.column_stack((abdomen, surgical)))) + dr_score = anm(abdomen, surgical, ChiSquaredTest) + cisc_score = cisc(abdomen, surgical) + acid_score = anm(abdomen, surgical, Entropy) + crisp_score = anm(abdomen, surgical, StochasticComplexity, enc_func=True) + + pprint(" dc", dc_score, ScoreType.INFO, "abdomen", "surgical") + pprint(" ent", ent_score, ScoreType.INFO, "abdomen", "surgical") + pprint(" dr", dr_score, ScoreType.PVAL, "abdomen", "surgical") + pprint(" cisc", cisc_score, ScoreType.INFO, "abdomen", "surgical") + pprint(" acid", acid_score, ScoreType.INFO, "abdomen", "surgical") + pprint("crisp", crisp_score, ScoreType.INFO, "abdomen", "surgical") if __name__ == "__main__": diff --git a/test_synth.py b/test_synth.py index 67f4ee5..63910d8 100644 --- a/test_synth.py +++ b/test_synth.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- - -"""Here, we test the accuracy of various causal inference techniques in identifying ANMs with different ranges of noise term. +"""This module assess the performance of various discrete causal inference +methods on synthetic cause-effect pairs. 
""" from __future__ import division import math @@ -12,63 +12,65 @@ import numpy as np import pandas as pd +from anm import anm from cisc import cisc -from acid import acid from dc import dc -from dr import dr from entropic import entropic -from utils import reverse_argsort, dc_compat +from formatter import to_nested +from measures import ChiSquaredTest, Entropy, StochasticComplexity + +def reverse_argsort(sequence): + indices = range(len(sequence)) + indices.sort(key=sequence.__getitem__, reverse=True) + return indices -def map_randomly(Xd, fXd): - f = dict() - for x in Xd: - y = random.choice(fXd) - f[x] = y +def map_randomly(dom_f, img_f): + f = dict((x, random.choice(img_f)) for x in dom_f) # ensure that f is not a constant function if len(set(f.values())) == 1: - f = map_randomly(Xd, fXd) + f = map_randomly(dom_f, img_f) assert len(set(f.values())) != 1 return f -def generate_X(src, size): - if src == "uniform": +def generate_X(type, size): + if type == "uniform": maxX = random.randint(2, 10) - X = [random.randint(1, maxX) for i in xrange(size)] - elif src == "multinomial": - p_nums = [random.randint(1, 10) for i in xrange(random.randint(2, 11))] + X = [random.randint(1, maxX) for i in range(size)] + elif type == "multinomial": + p_nums = [random.randint(1, 10) for i in range(random.randint(2, 11))] p_vals = [v / sum(p_nums) for v in p_nums] X = np.random.multinomial(size, p_vals, size=1)[0].tolist() X = [[i + 1] * f for i, f in enumerate(X)] X = [j for sublist in X for j in sublist] - elif src == "binomial": + elif type == "binomial": n = random.randint(1, 40) p = random.uniform(0.1, 0.9) X = np.random.binomial(n, p, size).tolist() - elif src == "geometric": + elif type == "geometric": p = random.uniform(0.1, 0.9) X = np.random.geometric(p, size).tolist() - elif src == "hypergeometric": + elif type == "hypergeometric": ngood = random.randint(1, 40) nbad = random.randint(1, 40) nsample = random.randint(1, min(40, ngood + nbad - 1)) X = np.random.hypergeometric(ngood, nbad, nsample, size).tolist() - elif src == "poisson": - lam = random.randint(1, 10) - X = np.random.poisson(lam, size).tolist() - elif src == "negativeBinomial": + elif type == "poisson": + rate = random.randint(1, 10) + X = np.random.poisson(rate, size).tolist() + elif type == "negativeBinomial": n = random.randint(1, 40) p = random.uniform(0.1, 0.9) X = np.random.negative_binomial(n, p, size).tolist() return X -def generate_additive_N(size): +def generate_additive_N(n): t = random.randint(1, 7) suppN = range(-t, t + 1) - N = [random.choice(suppN) for i in xrange(size)] + N = [random.choice(suppN) for i in range(n)] return N @@ -84,20 +86,20 @@ def are_disjoint(sets): return disjoint -def identifiable(suppX, f, N): - # check if f(x) + supp N are disjoint for x in domx - suppN = set(N) +def identifiable(dom_f, f, N): + # check if f(x) + supp N are disjoint for x in domain of f + supp_N = set(N) decomps = [] - for x in suppX: - fx = f[x] - sum_fx_suppN = set([fx + n for n in suppN]) - decomps.append(sum_fx_suppN) + for x in dom_f: + u = f[x] + supp_addition = set([u + n for n in supp_N]) + decomps.append(supp_addition) non_overlapping_noise = are_disjoint(decomps) return not non_overlapping_noise def calibrate_dr_alpha(): - print "testing various critical thresholds for DR" + print("testing various critical thresholds for DR") sys.stdout.flush() nsim = 1000 size = 1000 @@ -105,9 +107,9 @@ def calibrate_dr_alpha(): suppfX = range(-7, 8) srcX = "geometric" levels = [0.05, 0.04, 0.03, 0.02, 0.01, 0.005, 0.001] - print "-" * 
24 - print "%8s%15s" % ("ALPHA", "ACCURACY(DR)") - print "-" * 24 + print("-" * 24) + print("%8s%15s" % ("ALPHA", "ACCURACY(DR)")) + print("-" * 24) for level in levels: nsamples = 0 ncorrect = 0 @@ -116,74 +118,88 @@ def calibrate_dr_alpha(): suppX = list(set(X)) f = map_randomly(suppX, suppfX) N = generate_additive_N(size) - Y = [f[X[i]] + N[i] for i in xrange(size)] + Y = [f[X[i]] + N[i] for i in range(size)] if not identifiable(suppX, f, N): continue nsamples += 1 - dr_score = dr(X, Y, level) + dr_score = anm(X, Y, ChiSquaredTest, 100) ncorrect += int(dr_score[0] > level and dr_score[1] < level) assert nsamples == nsim acc = ncorrect * 100 / nsim - print "%8.3f%15.2f" % (level, acc) + print("%8.3f%15.2f" % (level, acc)) sys.stdout.flush() - print "-" * 24 + print("-" * 24) def test_accuracy_data_type(): nsim = 1000 sample_size = 1000 level = 0.05 - suppfX = range(-7, 8) + img_f = range(-7, 8) srcsX = ["uniform", "binomial", "negativeBinomial", "geometric", "hypergeometric", "poisson", "multinomial"] - print "-" * 70 - print "%18s%10s%10s%10s%10s%10s" % ("TYPE_X", "DC", "DR", "CISC", "ACID", "ENTROPIC") - print "-" * 70 + print("-" * 80) + print("%18s%10s%10s%10s%10s%10s%10s" % + ("X", "DC", "ENT", "DR", "CISC", "ACID", "CRISP")) + print("-" * 80) sys.stdout.flush() fp = open("results/acc-dtype.dat", "w") - fp.write("%s\t%s\t%s\t%s\t%s\t%s\n" % - ("dtype", "dc", "dr", "cisc", "acid", "entropic")) + fp.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % + ("dtype", "dc", "ent", "dr", "cisc", "acid", "crisp")) for srcX in srcsX: nsamples = 0 - nc_dc, nc_dr, nc_cisc, nc_acid, nc_entropic = 0, 0, 0, 0, 0 + nc_dc, nc_ent, nc_dr, nc_cisc, nc_acid, nc_crisp = 0, 0, 0, 0, 0, 0 while nsamples < nsim: X = generate_X(srcX, sample_size) - suppX = list(set(X)) - f = map_randomly(suppX, suppfX) + dom_f = list(set(X)) + f = map_randomly(dom_f, img_f) N = generate_additive_N(sample_size) - Y = [f[X[i]] + N[i] for i in xrange(sample_size)] + Y = [f[X[i]] + N[i] for i in range(sample_size)] - if not identifiable(suppX, f, N): + assert len(X) == len(Y) == len(N) + + if not identifiable(dom_f, f, N): continue nsamples += 1 - dc_score = dc(dc_compat(X), dc_compat(Y)) - dr_score = dr(X, Y, level) + dc_score = dc(to_nested(X), to_nested(Y)) + ent_score = entropic(pd.DataFrame(np.column_stack((X, Y)))) + dr_score = anm(X, Y, ChiSquaredTest, 100) cisc_score = cisc(X, Y) - acid_score = acid(X, Y) - entropic_score = entropic(pd.DataFrame(np.column_stack((X, Y)))) + acid_score = anm(X, Y, Entropy, 100) + crisp_score = anm(X, Y, StochasticComplexity, 100, True) + # dc_score = (0, 0) + # dr_score = (0, 0) + # cisc_score = (0, 0) + # acid_score = (0, 0) + # crisp_score = (0, 0) nc_dc += int(dc_score[0] < dc_score[1]) + nc_ent += int(ent_score[0] < ent_score[1]) nc_dr += int(dr_score[0] > level and dr_score[1] < level) nc_cisc += int(cisc_score[0] < cisc_score[1]) nc_acid += int(acid_score[0] < acid_score[1]) - nc_entropic += int(entropic_score[0] < entropic_score[1]) + nc_crisp += int(crisp_score[0] < crisp_score[1]) assert nsamples == nsim acc_dc = nc_dc * 100 / nsim + acc_ent = nc_ent * 100 / nsim acc_dr = nc_dr * 100 / nsim acc_cisc = nc_cisc * 100 / nsim acc_acid = nc_acid * 100 / nsim - acc_entropic = nc_entropic * 100 / nsim - print "%18s%10.2f%10.2f%10.2f%10.2f%10.2f" % (srcX, acc_dc, acc_dr, acc_cisc, acc_acid, acc_entropic) + acc_crisp = nc_crisp * 100 / nsim + print("%18s%10.2f%10.2f%10.2f%10.2f%10.2f%10.2f" % + (srcX, acc_dc, acc_ent, acc_dr, acc_cisc, acc_acid, acc_crisp)) sys.stdout.flush() - 
fp.write("%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n" % - (srcX, acc_dc, acc_dr, acc_cisc, acc_acid, acc_entropic)) - print "-" * 70 + fp.write( + "%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n" % + (srcX, acc_dc, acc_ent, acc_dr, acc_cisc, acc_acid, acc_crisp) + ) + print("-" * 80) sys.stdout.flush() fp.close() @@ -191,51 +207,59 @@ def test_accuracy_data_type(): def test_accuracy_sample_size(): nsim = 1000 level = 0.05 - sizes = [400, 700, 1000, 1300, 1600, 2000, 10000] + sizes = [100, 400, 700, 1000, 1300, 1600, 2000, 10000] suppfX = range(-7, 8) srcX = "geometric" fp = open("results/acc-size.dat", "w") - diffs = [] fp.write("%s\t%s\t%s\t%s\t%s\t%s\n" % ("size", "dc", "dr", "cisc", "acid", "entropic")) - print "%s\t%s\t%s\t%s\t%s\t%s" % ("size", "dc", "dr", "cisc", "acid", "entropic") + print("%s\t%s\t%s\t%s\t%s\t%s" % + ("size", "dc", "dr", "cisc", "acid", "entropic")) sys.stdout.flush() for k, size in enumerate(sizes): nsamples = 0 - nc_dc, nc_dr, nc_cisc, nc_acid, nc_entropic = 0, 0, 0, 0, 0 + nc_dc, nc_ent, nc_dr, nc_cisc, nc_acid, nc_crisp = 0, 0, 0, 0, 0, 0 while nsamples < nsim: X = generate_X(srcX, size) suppX = list(set(X)) f = map_randomly(suppX, suppfX) N = generate_additive_N(size) - Y = [f[X[i]] + N[i] for i in xrange(size)] + Y = [f[X[i]] + N[i] for i in range(size)] if not identifiable(suppX, f, N): continue nsamples += 1 - dc_score = dc(dc_compat(X), dc_compat(Y)) - dr_score = dr(X, Y, level) + dc_score = dc(to_nested(X), to_nested(Y)) + ent_score = entropic(pd.DataFrame(np.column_stack((X, Y)))) + dr_score = anm(X, Y, ChiSquaredTest, 100) cisc_score = cisc(X, Y) - acid_score = acid(X, Y) - entropic_score = entropic(pd.DataFrame(np.column_stack((X, Y)))) + acid_score = anm(X, Y, Entropy, 100) + crisp_score = anm(X, Y, StochasticComplexity, 100, True) nc_dc += int(dc_score[0] < dc_score[1]) + nc_ent += int(ent_score[0] < ent_score[1]) nc_dr += int(dr_score[0] > level and dr_score[1] < level) nc_cisc += int(cisc_score[0] < cisc_score[1]) nc_acid += int(acid_score[0] < acid_score[1]) - nc_entropic += int(entropic_score[0] < entropic_score[1]) + nc_crisp += int(crisp_score[0] < crisp_score[1]) + + assert nsamples == nsim acc_dc = nc_dc * 100 / nsim + acc_ent = nc_ent * 100 / nsim acc_dr = nc_dr * 100 / nsim acc_cisc = nc_cisc * 100 / nsim acc_acid = nc_acid * 100 / nsim - acc_entropic = nc_entropic * 100 / nsim - print "%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f" % (size, acc_dc, acc_dr, acc_cisc, acc_acid, acc_entropic) + acc_crisp = nc_crisp * 100 / nsim + + print("%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f" % + (size, acc_dc, acc_ent, acc_dr, acc_cisc, acc_acid, acc_crisp)) sys.stdout.flush() - fp.write("%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n" % - (size, acc_dc, acc_dr, acc_cisc, acc_acid, acc_entropic)) + fp.write( + "%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n" % + (size, acc_dc, acc_ent, acc_dr, acc_cisc, acc_acid, acc_crisp)) fp.close() @@ -255,13 +279,13 @@ def test_hypercompression(): suppX = list(set(X)) f = map_randomly(suppX, suppfX) N = generate_additive_N(size) - Y = [f[X[i]] + N[i] for i in xrange(size)] + Y = [f[X[i]] + N[i] for i in range(size)] if not identifiable(suppX, f, N): continue nsamples += 1 - acid_score = acid(X, Y) + acid_score = anm(X, Y, Entropy, 100) diff = size * abs(acid_score[0] - acid_score[1]) if acid_score[0] < acid_score[1]: @@ -273,6 +297,7 @@ def test_hypercompression(): diffs.append(int(diff)) decisions.append(decision) + sorted_diffs_indices = reverse_argsort(diffs) diffs = [diffs[idx] for idx in sorted_diffs_indices] decisions = [decisions[idx] for idx in 
sorted_diffs_indices] @@ -323,38 +348,39 @@ def test_domain_runtime_acid(): supps = [20, 40, 80, 160] for supp in supps: pool = range(supp) - X = [random.choice(pool) for i in xrange(size)] - Y = [random.choice(pool) for i in xrange(size)] + X = [random.choice(pool) for i in range(size)] + Y = [random.choice(pool) for i in range(size)] acid_t = 0 - for i in xrange(nloop): + for i in range(nloop): tstart = time.time() - acid(X, Y) + anm(X, Y, Entropy, 100) tend = time.time() acid_t += tend - tstart - print supp, acid_t / nloop + print(supp, acid_t / nloop) sys.stdout.flush() def test_size_runtime_acid(): + pool = range(-7, 8) nloop = 5 sizes = [10000, 20000, 40000, 80000, 160000, 320000, 640000, 1280000] for size in sizes: - X = [random.choice(suppX) for i in xrange(size)] - Y = [random.choice(suppY) for i in xrange(size)] + X = [random.choice(pool) for i in range(size)] + Y = [random.choice(pool) for i in range(size)] acid_t = 0 - for i in xrange(nloop): + for i in range(nloop): tstart = time.time() - acid(X, Y) + anm(X, Y, Entropy, 100) tend = time.time() acid_t += tend - tstart - print size, acid_t / nloop + print(size, acid_t / nloop) sys.stdout.flush() if __name__ == "__main__": - calibrate_dr_alpha() + # calibrate_dr_alpha() test_accuracy_data_type() - test_accuracy_sample_size() - test_hypercompression() - test_size_runtime_acid() - test_domain_runtime_acid() + # test_accuracy_sample_size() + # test_hypercompression() + # test_size_runtime_acid() + # test_domain_runtime_acid()
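
Note (not part of the patch): a minimal usage sketch of the refactored interface, assuming the modules in this patch (anm.py, cisc.py, dc.py, formatter.py, measures.py) are importable from the working directory; the toy data below is illustrative only.

import random

from anm import anm
from cisc import cisc
from dc import dc
from formatter import to_nested
from measures import ChiSquaredTest, Entropy, StochasticComplexity

# toy additive-noise pair: Y = f(X) + N with f(x) = 2 * x
X = [random.randint(1, 5) for _ in range(1000)]
Y = [2 * x + random.choice([-1, 0, 1]) for x in X]

# information-theoretic scores (ACID, CRISP, CISC): the direction with the
# smaller score is the inferred causal direction
print(anm(X, Y, Entropy))
print(anm(X, Y, StochasticComplexity, enc_func=True))
print(cisc(X, Y))

# DR-style p-values: X -> Y if the first p-value is above the significance
# level and the second is below it
print(anm(X, Y, ChiSquaredTest))

# distance correlation operates on the nested representation
print(dc(to_nested(X), to_nested(Y)))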