Fixed database typo and removed unnecessary class identifier.

2020-10-14 10:10:37 -04:00 · 2020-10-14 10:10:37 -04:00 · 45fb349a7d
commit 45fb349a7d
parent 00ad49a143
5098 changed files with 952558 additions and 85 deletions
--- a/venv/Lib/site-packages/scipy/stats/_hypotests.py
+++ b/venv/Lib/site-packages/scipy/stats/_hypotests.py
@ -0,0 +1,148 @@
+from collections import namedtuple
+import numpy as np
+import warnings
+from ._continuous_distns import chi2
+from . import _wilcoxon_data
+
+
+# Result container returned by `epps_singleton_2samp`: the test statistic and
+# its p-value, accessible by field name or by position.
+Epps_Singleton_2sampResult = namedtuple(
+    'Epps_Singleton_2sampResult', ['statistic', 'pvalue'])
+
+
+def epps_singleton_2samp(x, y, t=(0.4, 0.8)):
+    """
+    Compute the Epps-Singleton (ES) test statistic.
+
+    Test the null hypothesis that two samples have the same underlying
+    probability distribution.
+
+    Parameters
+    ----------
+    x, y : array-like
+        The two samples of observations to be tested. Input must not have more
+        than one dimension. Samples can have different lengths.
+    t : array-like, optional
+        The points (t1, ..., tn) where the empirical characteristic function is
+        to be evaluated. It should be positive distinct numbers. The default
+        value (0.4, 0.8) is proposed in [1]_. Input must not have more than
+        one dimension. A scalar is treated as a single evaluation point.
+
+    Returns
+    -------
+    statistic : float
+        The test statistic.
+    pvalue : float
+        The associated p-value based on the asymptotic chi2-distribution.
+
+    Raises
+    ------
+    ValueError
+        If `x`, `y` or `t` has more than one dimension, if `x` or `y` has
+        fewer than 5 elements or contains nonfinite values, or if `t`
+        contains an element that is not positive.
+
+    Warns
+    -----
+    UserWarning
+        If the estimated covariance matrix does not have full rank.
+
+    See Also
+    --------
+    ks_2samp, anderson_ksamp
+
+    Notes
+    -----
+    Testing whether two samples are generated by the same underlying
+    distribution is a classical question in statistics. A widely used test is
+    the Kolmogorov-Smirnov (KS) test which relies on the empirical
+    distribution function. Epps and Singleton introduce a test based on the
+    empirical characteristic function in [1]_.
+
+    One advantage of the ES test compared to the KS test is that it does
+    not assume a continuous distribution. In [1]_, the authors conclude
+    that the test also has a higher power than the KS test in many
+    examples. They recommend the use of the ES test for discrete samples as
+    well as continuous samples with at least 25 observations each, whereas
+    `anderson_ksamp` is recommended for smaller sample sizes in the
+    continuous case.
+
+    The p-value is computed from the asymptotic distribution of the test
+    statistic which follows a `chi2` distribution. If the sample size of both
+    `x` and `y` is below 25, the small sample correction proposed in [1]_ is
+    applied to the test statistic.
+
+    The default values of `t` are determined in [1]_ by considering
+    various distributions and finding good values that lead to a high power
+    of the test in general. Table III in [1]_ gives the optimal values for
+    the distributions tested in that study. The values of `t` are scaled by
+    the semi-interquartile range in the implementation, see [1]_.
+
+    References
+    ----------
+    .. [1] T. W. Epps and K. J. Singleton, "An omnibus test for the two-sample
+       problem using the empirical characteristic function", Journal of
+       Statistical Computation and Simulation 26, p. 177--203, 1986.
+
+    .. [2] S. J. Goerg and J. Kaiser, "Nonparametric testing of distributions
+       - the Epps-Singleton two-sample test using the empirical characteristic
+       function", The Stata Journal 9(3), p. 454--465, 2009.
+
+    """
+
+    x, y, t = np.asarray(x), np.asarray(y), np.asarray(t)
+    # check if x and y are valid inputs
+    if x.ndim > 1:
+        raise ValueError('x must be 1d, but x.ndim equals {}.'.format(x.ndim))
+    if y.ndim > 1:
+        raise ValueError('y must be 1d, but y.ndim equals {}.'.format(y.ndim))
+    nx, ny = len(x), len(y)
+    if (nx < 5) or (ny < 5):
+        raise ValueError('x and y should have at least 5 elements, but len(x) '
+                         '= {} and len(y) = {}.'.format(nx, ny))
+    if not np.isfinite(x).all():
+        raise ValueError('x must not contain nonfinite values.')
+    if not np.isfinite(y).all():
+        raise ValueError('y must not contain nonfinite values.')
+    n = nx + ny
+
+    # check if t is valid
+    if t.ndim > 1:
+        raise ValueError('t must be 1d, but t.ndim equals {}.'.format(t.ndim))
+    if np.less_equal(t, 0).any():
+        raise ValueError('t must contain positive elements only.')
+    # number of evaluation points; t.size (unlike len(t)) is also defined
+    # for a 0-d t, which is let through by the ndim check above
+    nt = t.size
+
+    # rescale t with semi-iqr as proposed in [1]; import iqr here to avoid
+    # circular import
+    from scipy.stats import iqr
+    # NOTE(review): sigma is 0 whenever the pooled IQR is 0, which makes ts
+    # non-finite -- confirm whether such input should be rejected explicitly
+    sigma = iqr(np.hstack((x, y))) / 2
+    ts = np.reshape(t, (-1, 1)) / sigma
+
+    # covariance estimation of ES test
+    gx = np.vstack((np.cos(ts*x), np.sin(ts*x))).T  # shape = (nx, 2*len(t))
+    gy = np.vstack((np.cos(ts*y), np.sin(ts*y))).T
+    cov_x = np.cov(gx.T, bias=True)  # the test uses biased cov-estimate
+    cov_y = np.cov(gy.T, bias=True)
+    est_cov = (n/nx)*cov_x + (n/ny)*cov_y
+    est_cov_inv = np.linalg.pinv(est_cov)
+    r = np.linalg.matrix_rank(est_cov_inv)
+    if r < 2*nt:
+        # stacklevel=2 attributes the warning to the caller of this function
+        warnings.warn('Estimated covariance matrix does not have full rank. '
+                      'This indicates a bad choice of the input t and the '
+                      'test might not be consistent.',  # see p. 183 in [1]_
+                      stacklevel=2)
+
+    # compute test statistic w distributed asympt. as chisquare with df=r
+    g_diff = np.mean(gx, axis=0) - np.mean(gy, axis=0)
+    w = n*np.dot(g_diff.T, np.dot(est_cov_inv, g_diff))
+
+    # apply small-sample correction
+    if (max(nx, ny) < 25):
+        corr = 1.0/(1.0 + n**(-0.45) + 10.1*(nx**(-1.7) + ny**(-1.7)))
+        w = corr * w
+
+    p = chi2.sf(w, r)
+
+    return Epps_Singleton_2sampResult(w, p)
+
+
+def _get_wilcoxon_distr(n):
+    """
+    Look up the exact distribution of the Wilcoxon ranksum statistic r_plus
+    (sum of ranks of positive differences) for sample size `n`.
+
+    The tabulated entry for `n` in ``_wilcoxon_data.COUNTS`` is returned as
+    an integer array holding the counts/frequencies of all the possible
+    ranks r = 0, ..., n*(n+1)/2.
+
+    Raises a ValueError if there is no tabulated entry for `n`.
+    """
+    frequencies = _wilcoxon_data.COUNTS.get(n)
+
+    # happy path first: a tabulated entry exists for this n
+    if frequencies is not None:
+        return np.array(frequencies, dtype=int)
+
+    message = ("The exact distribution of the Wilcoxon test "
+               "statistic is not implemented for n={}")
+    raise ValueError(message.format(n))