198 lines
5.8 KiB
Python
198 lines
5.8 KiB
Python
import numpy as np
|
|
from numpy.testing import (assert_equal, assert_array_equal,
|
|
assert_array_almost_equal, assert_approx_equal, assert_allclose)
|
|
from pytest import raises as assert_raises
|
|
|
|
from scipy.special import xlogy
|
|
from scipy.stats.contingency import margins, expected_freq, chi2_contingency
|
|
|
|
|
|
def test_margins():
|
|
a = np.array([1])
|
|
m = margins(a)
|
|
assert_equal(len(m), 1)
|
|
m0 = m[0]
|
|
assert_array_equal(m0, np.array([1]))
|
|
|
|
a = np.array([[1]])
|
|
m0, m1 = margins(a)
|
|
expected0 = np.array([[1]])
|
|
expected1 = np.array([[1]])
|
|
assert_array_equal(m0, expected0)
|
|
assert_array_equal(m1, expected1)
|
|
|
|
a = np.arange(12).reshape(2, 6)
|
|
m0, m1 = margins(a)
|
|
expected0 = np.array([[15], [51]])
|
|
expected1 = np.array([[6, 8, 10, 12, 14, 16]])
|
|
assert_array_equal(m0, expected0)
|
|
assert_array_equal(m1, expected1)
|
|
|
|
a = np.arange(24).reshape(2, 3, 4)
|
|
m0, m1, m2 = margins(a)
|
|
expected0 = np.array([[[66]], [[210]]])
|
|
expected1 = np.array([[[60], [92], [124]]])
|
|
expected2 = np.array([[[60, 66, 72, 78]]])
|
|
assert_array_equal(m0, expected0)
|
|
assert_array_equal(m1, expected1)
|
|
assert_array_equal(m2, expected2)
|
|
|
|
|
|
def test_expected_freq():
|
|
assert_array_equal(expected_freq([1]), np.array([1.0]))
|
|
|
|
observed = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]])
|
|
e = expected_freq(observed)
|
|
assert_array_equal(e, np.ones_like(observed))
|
|
|
|
observed = np.array([[10, 10, 20], [20, 20, 20]])
|
|
e = expected_freq(observed)
|
|
correct = np.array([[12., 12., 16.], [18., 18., 24.]])
|
|
assert_array_almost_equal(e, correct)
|
|
|
|
|
|
def test_chi2_contingency_trivial():
|
|
# Some very simple tests for chi2_contingency.
|
|
|
|
# A trivial case
|
|
obs = np.array([[1, 2], [1, 2]])
|
|
chi2, p, dof, expected = chi2_contingency(obs, correction=False)
|
|
assert_equal(chi2, 0.0)
|
|
assert_equal(p, 1.0)
|
|
assert_equal(dof, 1)
|
|
assert_array_equal(obs, expected)
|
|
|
|
# A *really* trivial case: 1-D data.
|
|
obs = np.array([1, 2, 3])
|
|
chi2, p, dof, expected = chi2_contingency(obs, correction=False)
|
|
assert_equal(chi2, 0.0)
|
|
assert_equal(p, 1.0)
|
|
assert_equal(dof, 0)
|
|
assert_array_equal(obs, expected)
|
|
|
|
|
|
def test_chi2_contingency_R():
|
|
# Some test cases that were computed independently, using R.
|
|
|
|
# Rcode = \
|
|
# """
|
|
# # Data vector.
|
|
# data <- c(
|
|
# 12, 34, 23, 4, 47, 11,
|
|
# 35, 31, 11, 34, 10, 18,
|
|
# 12, 32, 9, 18, 13, 19,
|
|
# 12, 12, 14, 9, 33, 25
|
|
# )
|
|
#
|
|
# # Create factor tags:r=rows, c=columns, t=tiers
|
|
# r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
|
|
# c <- factor(gl(3, 1, 2*3*4, labels=c("c1", "c2", "c3")))
|
|
# t <- factor(gl(2, 3, 2*3*4, labels=c("t1", "t2")))
|
|
#
|
|
# # 3-way Chi squared test of independence
|
|
# s = summary(xtabs(data~r+c+t))
|
|
# print(s)
|
|
# """
|
|
# Routput = \
|
|
# """
|
|
# Call: xtabs(formula = data ~ r + c + t)
|
|
# Number of cases in table: 478
|
|
# Number of factors: 3
|
|
# Test for independence of all factors:
|
|
# Chisq = 102.17, df = 17, p-value = 3.514e-14
|
|
# """
|
|
obs = np.array(
|
|
[[[12, 34, 23],
|
|
[35, 31, 11],
|
|
[12, 32, 9],
|
|
[12, 12, 14]],
|
|
[[4, 47, 11],
|
|
[34, 10, 18],
|
|
[18, 13, 19],
|
|
[9, 33, 25]]])
|
|
chi2, p, dof, expected = chi2_contingency(obs)
|
|
assert_approx_equal(chi2, 102.17, significant=5)
|
|
assert_approx_equal(p, 3.514e-14, significant=4)
|
|
assert_equal(dof, 17)
|
|
|
|
# Rcode = \
|
|
# """
|
|
# # Data vector.
|
|
# data <- c(
|
|
# #
|
|
# 12, 17,
|
|
# 11, 16,
|
|
# #
|
|
# 11, 12,
|
|
# 15, 16,
|
|
# #
|
|
# 23, 15,
|
|
# 30, 22,
|
|
# #
|
|
# 14, 17,
|
|
# 15, 16
|
|
# )
|
|
#
|
|
# # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
|
|
# r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2")))
|
|
# c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2")))
|
|
# d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2")))
|
|
# t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2")))
|
|
#
|
|
# # 4-way Chi squared test of independence
|
|
# s = summary(xtabs(data~r+c+d+t))
|
|
# print(s)
|
|
# """
|
|
# Routput = \
|
|
# """
|
|
# Call: xtabs(formula = data ~ r + c + d + t)
|
|
# Number of cases in table: 262
|
|
# Number of factors: 4
|
|
# Test for independence of all factors:
|
|
# Chisq = 8.758, df = 11, p-value = 0.6442
|
|
# """
|
|
obs = np.array(
|
|
[[[[12, 17],
|
|
[11, 16]],
|
|
[[11, 12],
|
|
[15, 16]]],
|
|
[[[23, 15],
|
|
[30, 22]],
|
|
[[14, 17],
|
|
[15, 16]]]])
|
|
chi2, p, dof, expected = chi2_contingency(obs)
|
|
assert_approx_equal(chi2, 8.758, significant=4)
|
|
assert_approx_equal(p, 0.6442, significant=4)
|
|
assert_equal(dof, 11)
|
|
|
|
|
|
def test_chi2_contingency_g():
|
|
c = np.array([[15, 60], [15, 90]])
|
|
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=False)
|
|
assert_allclose(g, 2*xlogy(c, c/e).sum())
|
|
|
|
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=True)
|
|
c_corr = c + np.array([[-0.5, 0.5], [0.5, -0.5]])
|
|
assert_allclose(g, 2*xlogy(c_corr, c_corr/e).sum())
|
|
|
|
c = np.array([[10, 12, 10], [12, 10, 10]])
|
|
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood')
|
|
assert_allclose(g, 2*xlogy(c, c/e).sum())
|
|
|
|
|
|
def test_chi2_contingency_bad_args():
|
|
# Test that "bad" inputs raise a ValueError.
|
|
|
|
# Negative value in the array of observed frequencies.
|
|
obs = np.array([[-1, 10], [1, 2]])
|
|
assert_raises(ValueError, chi2_contingency, obs)
|
|
|
|
# The zeros in this will result in zeros in the array
|
|
# of expected frequencies.
|
|
obs = np.array([[0, 1], [0, 1]])
|
|
assert_raises(ValueError, chi2_contingency, obs)
|
|
|
|
# A degenerate case: `observed` has size 0.
|
|
obs = np.empty((0, 8))
|
|
assert_raises(ValueError, chi2_contingency, obs)
|
|
|