148 lines
5.7 KiB
Python
148 lines
5.7 KiB
Python
from collections import namedtuple
|
|
import numpy as np
|
|
import warnings
|
|
from ._continuous_distns import chi2
|
|
from . import _wilcoxon_data
|
|
|
|
|
|
Epps_Singleton_2sampResult = namedtuple('Epps_Singleton_2sampResult',
                                        ('statistic', 'pvalue'))


def epps_singleton_2samp(x, y, t=(0.4, 0.8)):
    """
    Compute the Epps-Singleton (ES) test statistic.

    Test the null hypothesis that two samples have the same underlying
    probability distribution.

    Parameters
    ----------
    x, y : array-like
        The two samples of observations to be tested. Input must not have more
        than one dimension. Samples can have different lengths.
    t : array-like, optional
        The points (t1, ..., tn) where the empirical characteristic function is
        to be evaluated. It should be positive distinct numbers. The default
        value (0.4, 0.8) is proposed in [1]_. Input must not have more than
        one dimension. A scalar is treated as a single evaluation point.

    Returns
    -------
    statistic : float
        The test statistic.
    pvalue : float
        The associated p-value based on the asymptotic chi2-distribution.

    Raises
    ------
    ValueError
        If `x`, `y` or `t` has more than one dimension, if `x` or `y` has
        fewer than 5 elements or contains nonfinite values, or if `t`
        contains an element that is not positive.

    Warns
    -----
    UserWarning
        If the estimated covariance matrix does not have full rank.

    See Also
    --------
    ks_2samp, anderson_ksamp

    Notes
    -----
    Testing whether two samples are generated by the same underlying
    distribution is a classical question in statistics. A widely used test is
    the Kolmogorov-Smirnov (KS) test which relies on the empirical
    distribution function. Epps and Singleton introduce a test based on the
    empirical characteristic function in [1]_.

    One advantage of the ES test compared to the KS test is that it does
    not assume a continuous distribution. In [1]_, the authors conclude
    that the test also has a higher power than the KS test in many
    examples. They recommend the use of the ES test for discrete samples as
    well as continuous samples with at least 25 observations each, whereas
    `anderson_ksamp` is recommended for smaller sample sizes in the
    continuous case.

    The p-value is computed from the asymptotic distribution of the test
    statistic which follows a `chi2` distribution. If the sample size of both
    `x` and `y` is below 25, the small sample correction proposed in [1]_ is
    applied to the test statistic.

    The default values of `t` are determined in [1]_ by considering
    various distributions and finding good values that lead to a high power
    of the test in general. Table III in [1]_ gives the optimal values for
    the distributions tested in that study. The values of `t` are scaled by
    the semi-interquartile range in the implementation, see [1]_.

    References
    ----------
    .. [1] T. W. Epps and K. J. Singleton, "An omnibus test for the two-sample
       problem using the empirical characteristic function", Journal of
       Statistical Computation and Simulation 26, p. 177--203, 1986.

    .. [2] S. J. Goerg and J. Kaiser, "Nonparametric testing of distributions
       - the Epps-Singleton two-sample test using the empirical characteristic
       function", The Stata Journal 9(3), p. 454--465, 2009.

    """

    x, y = np.asarray(x), np.asarray(y)
    # promote a scalar t to a length-1 array: a 0-d t passes the ndim check
    # below but would otherwise fail late at len(t)
    t = np.atleast_1d(np.asarray(t))

    # check if x and y are valid inputs
    if x.ndim > 1:
        raise ValueError('x must be 1d, but x.ndim equals {}.'.format(x.ndim))
    if y.ndim > 1:
        raise ValueError('y must be 1d, but y.ndim equals {}.'.format(y.ndim))
    nx, ny = len(x), len(y)
    if (nx < 5) or (ny < 5):
        raise ValueError('x and y should have at least 5 elements, but len(x) '
                         '= {} and len(y) = {}.'.format(nx, ny))
    if not np.isfinite(x).all():
        raise ValueError('x must not contain nonfinite values.')
    if not np.isfinite(y).all():
        raise ValueError('y must not contain nonfinite values.')
    n = nx + ny

    # check if t is valid
    if t.ndim > 1:
        raise ValueError('t must be 1d, but t.ndim equals {}.'.format(t.ndim))
    if np.less_equal(t, 0).any():
        raise ValueError('t must contain positive elements only.')

    # rescale t with semi-iqr as proposed in [1]; import iqr here to avoid
    # circular import
    from scipy.stats import iqr
    sigma = iqr(np.hstack((x, y))) / 2
    ts = np.reshape(t, (-1, 1)) / sigma

    # covariance estimation of ES test
    gx = np.vstack((np.cos(ts*x), np.sin(ts*x))).T  # shape = (nx, 2*len(t))
    gy = np.vstack((np.cos(ts*y), np.sin(ts*y))).T
    cov_x = np.cov(gx.T, bias=True)  # the test uses biased cov-estimate
    cov_y = np.cov(gy.T, bias=True)
    est_cov = (n/nx)*cov_x + (n/ny)*cov_y
    est_cov_inv = np.linalg.pinv(est_cov)
    r = np.linalg.matrix_rank(est_cov_inv)
    if r < 2*len(t):
        # stacklevel=2 attributes the warning to the caller's line
        warnings.warn('Estimated covariance matrix does not have full rank. '
                      'This indicates a bad choice of the input t and the '
                      'test might not be consistent.',  # see p. 183 in [1]_
                      stacklevel=2)

    # compute test statistic w distributed asympt. as chisquare with df=r
    g_diff = np.mean(gx, axis=0) - np.mean(gy, axis=0)
    w = n*np.dot(g_diff.T, np.dot(est_cov_inv, g_diff))

    # apply small-sample correction
    if (max(nx, ny) < 25):
        corr = 1.0/(1.0 + n**(-0.45) + 10.1*(nx**(-1.7) + ny**(-1.7)))
        w = corr * w

    p = chi2.sf(w, r)

    return Epps_Singleton_2sampResult(w, p)
|
|
|
|
|
|
def _get_wilcoxon_distr(n):
    """
    Look up the tabulated distribution of the Wilcoxon statistic r_plus
    (sum of ranks of positive differences) for sample size `n`.

    The counts stored under key `n` in ``_wilcoxon_data.COUNTS`` are
    returned as an integer array holding the counts/frequencies of all the
    possible ranks r = 0, ..., n*(n+1)/2.

    Raises a ValueError if the table has no entry for `n`.
    """
    counts = _wilcoxon_data.COUNTS.get(n)

    # success path first: a tabulated entry is converted and returned
    if counts is not None:
        return np.array(counts, dtype=int)

    raise ValueError("The exact distribution of the Wilcoxon test "
                     "statistic is not implemented for n={}".format(n))
|