Fixed database typo and removed unnecessary class identifier.
This commit is contained in:
parent
00ad49a143
commit
45fb349a7d
5098 changed files with 952558 additions and 85 deletions
148
venv/Lib/site-packages/scipy/stats/_hypotests.py
Normal file
148
venv/Lib/site-packages/scipy/stats/_hypotests.py
Normal file
|
@ -0,0 +1,148 @@
|
|||
from collections import namedtuple
|
||||
import numpy as np
|
||||
import warnings
|
||||
from ._continuous_distns import chi2
|
||||
from . import _wilcoxon_data
|
||||
|
||||
|
||||
Epps_Singleton_2sampResult = namedtuple('Epps_Singleton_2sampResult',
                                        ('statistic', 'pvalue'))


def epps_singleton_2samp(x, y, t=(0.4, 0.8)):
    """
    Compute the Epps-Singleton (ES) test statistic.

    Test the null hypothesis that two samples have the same underlying
    probability distribution.

    Parameters
    ----------
    x, y : array-like
        The two samples of observations to be tested. Input must not have more
        than one dimension. Samples can have different lengths, but each must
        contain at least 5 finite values.
    t : array-like, optional
        The points (t1, ..., tn) where the empirical characteristic function is
        to be evaluated. It should be positive distinct numbers. The default
        value (0.4, 0.8) is proposed in [1]_. Input must not have more than
        one dimension; a single scalar is treated as one evaluation point.

    Returns
    -------
    statistic : float
        The test statistic.
    pvalue : float
        The associated p-value based on the asymptotic chi2-distribution.

    Raises
    ------
    ValueError
        If `x`, `y` or `t` has more than one dimension, if `x` or `y` has
        fewer than 5 elements or contains nonfinite values, if `t` is empty
        or contains nonpositive elements, or if the interquartile range of
        the pooled sample is zero (`t` cannot be rescaled in that case).

    See Also
    --------
    ks_2samp, anderson_ksamp

    Notes
    -----
    Testing whether two samples are generated by the same underlying
    distribution is a classical question in statistics. A widely used test is
    the Kolmogorov-Smirnov (KS) test which relies on the empirical
    distribution function. Epps and Singleton introduce a test based on the
    empirical characteristic function in [1]_.

    One advantage of the ES test compared to the KS test is that it does
    not assume a continuous distribution. In [1]_, the authors conclude
    that the test also has a higher power than the KS test in many
    examples. They recommend the use of the ES test for discrete samples as
    well as continuous samples with at least 25 observations each, whereas
    `anderson_ksamp` is recommended for smaller sample sizes in the
    continuous case.

    The p-value is computed from the asymptotic distribution of the test
    statistic which follows a `chi2` distribution. If the sample size of both
    `x` and `y` is below 25, the small sample correction proposed in [1]_ is
    applied to the test statistic.

    The default values of `t` are determined in [1]_ by considering
    various distributions and finding good values that lead to a high power
    of the test in general. Table III in [1]_ gives the optimal values for
    the distributions tested in that study. The values of `t` are scaled by
    the semi-interquartile range in the implementation, see [1]_.

    If the estimated covariance matrix does not have full rank, a warning is
    issued and the degrees of freedom of the `chi2` distribution are reduced
    to the estimated rank.

    References
    ----------
    .. [1] T. W. Epps and K. J. Singleton, "An omnibus test for the two-sample
       problem using the empirical characteristic function", Journal of
       Statistical Computation and Simulation 26, p. 177--203, 1986.

    .. [2] S. J. Goerg and J. Kaiser, "Nonparametric testing of distributions
       - the Epps-Singleton two-sample test using the empirical characteristic
       function", The Stata Journal 9(3), p. 454--465, 2009.

    """

    x, y, t = np.asarray(x), np.asarray(y), np.asarray(t)
    # check if x and y are valid inputs
    if x.ndim > 1:
        raise ValueError('x must be 1d, but x.ndim equals {}.'.format(x.ndim))
    if y.ndim > 1:
        raise ValueError('y must be 1d, but y.ndim equals {}.'.format(y.ndim))
    # .size instead of len(): identical for 1d input, but a 0-d (scalar)
    # input is then rejected by the length check below instead of failing
    # with a TypeError from len()
    nx, ny = x.size, y.size
    if (nx < 5) or (ny < 5):
        raise ValueError('x and y should have at least 5 elements, but len(x) '
                         '= {} and len(y) = {}.'.format(nx, ny))
    if not np.isfinite(x).all():
        raise ValueError('x must not contain nonfinite values.')
    if not np.isfinite(y).all():
        raise ValueError('y must not contain nonfinite values.')
    n = nx + ny

    # check if t is valid
    if t.ndim > 1:
        raise ValueError('t must be 1d, but t.ndim equals {}.'.format(t.ndim))
    if t.size == 0:
        raise ValueError('t must contain at least one element.')
    if np.less_equal(t, 0).any():
        raise ValueError('t must contain positive elements only.')
    nt = t.size  # number of evaluation points; also valid for scalar t

    # rescale t with semi-iqr as proposed in [1]; import iqr here to avoid
    # circular import
    from scipy.stats import iqr
    sigma = iqr(np.hstack((x, y))) / 2
    if not sigma > 0:
        # dividing by a zero semi-iqr would turn every scaled point into
        # inf/nan and make the covariance estimate meaningless
        raise ValueError('The interquartile range of the pooled sample is '
                         'zero, so t cannot be rescaled.')
    ts = np.reshape(t, (-1, 1)) / sigma

    # covariance estimation of ES test
    gx = np.vstack((np.cos(ts*x), np.sin(ts*x))).T  # shape = (nx, 2*nt)
    gy = np.vstack((np.cos(ts*y), np.sin(ts*y))).T
    cov_x = np.cov(gx.T, bias=True)  # the test uses biased cov-estimate
    cov_y = np.cov(gy.T, bias=True)
    est_cov = (n/nx)*cov_x + (n/ny)*cov_y
    est_cov_inv = np.linalg.pinv(est_cov)
    r = np.linalg.matrix_rank(est_cov_inv)
    if r < 2*nt:
        warnings.warn('Estimated covariance matrix does not have full rank. '
                      'This indicates a bad choice of the input t and the '
                      'test might not be consistent.',
                      stacklevel=2)  # see p. 183 in [1]_

    # compute test statistic w distributed asympt. as chisquare with df=r
    g_diff = np.mean(gx, axis=0) - np.mean(gy, axis=0)
    w = n*np.dot(g_diff.T, np.dot(est_cov_inv, g_diff))

    # apply small-sample correction
    if (max(nx, ny) < 25):
        corr = 1.0/(1.0 + n**(-0.45) + 10.1*(nx**(-1.7) + ny**(-1.7)))
        w = corr * w

    p = chi2.sf(w, r)

    return Epps_Singleton_2sampResult(w, p)
|
||||
|
||||
|
||||
def _get_wilcoxon_distr(n):
    """
    Look up the tabulated distribution of the Wilcoxon statistic r_plus
    (sum of ranks of positive differences) for sample size `n`.

    The counts are read from ``_wilcoxon_data.COUNTS`` and returned as an
    integer array holding the counts/frequencies of all the possible values
    r = 0, ..., n*(n+1)/2 of the statistic.

    Raises a ValueError if no table is stored for the requested `n`.
    """
    frequencies = _wilcoxon_data.COUNTS.get(n)

    # happy path first: a table exists for this n
    if frequencies is not None:
        return np.array(frequencies, dtype=int)

    message = ("The exact distribution of the Wilcoxon test "
               "statistic is not implemented for n={}")
    raise ValueError(message.format(n))
|
Loading…
Add table
Add a link
Reference in a new issue