Fixed database typo and removed unnecessary class identifier.

This commit is contained in:
Batuhan Berk Başoğlu 2020-10-14 10:10:37 -04:00
parent 00ad49a143
commit 45fb349a7d
5098 changed files with 952558 additions and 85 deletions

View file

@ -0,0 +1,401 @@
"""
.. _statsrefmanual:
==========================================
Statistical functions (:mod:`scipy.stats`)
==========================================
.. currentmodule:: scipy.stats
This module contains a large number of probability distributions as
well as a growing library of statistical functions.
Each univariate distribution is an instance of a subclass of `rv_continuous`
(`rv_discrete` for discrete distributions):
.. autosummary::
:toctree: generated/
rv_continuous
rv_discrete
rv_histogram
Continuous distributions
========================
.. autosummary::
:toctree: generated/
alpha -- Alpha
anglit -- Anglit
arcsine -- Arcsine
argus -- Argus
beta -- Beta
betaprime -- Beta Prime
bradford -- Bradford
burr -- Burr (Type III)
burr12 -- Burr (Type XII)
cauchy -- Cauchy
chi -- Chi
chi2 -- Chi-squared
cosine -- Cosine
crystalball -- Crystalball
dgamma -- Double Gamma
dweibull -- Double Weibull
erlang -- Erlang
expon -- Exponential
exponnorm -- Exponentially Modified Normal
exponweib -- Exponentiated Weibull
exponpow -- Exponential Power
f -- F (Snecdor F)
fatiguelife -- Fatigue Life (Birnbaum-Saunders)
fisk -- Fisk
foldcauchy -- Folded Cauchy
foldnorm -- Folded Normal
frechet_r -- Deprecated. Alias for weibull_min
frechet_l -- Deprecated. Alias for weibull_max
genlogistic -- Generalized Logistic
gennorm -- Generalized normal
genpareto -- Generalized Pareto
genexpon -- Generalized Exponential
genextreme -- Generalized Extreme Value
gausshyper -- Gauss Hypergeometric
gamma -- Gamma
gengamma -- Generalized gamma
genhalflogistic -- Generalized Half Logistic
geninvgauss -- Generalized Inverse Gaussian
gilbrat -- Gilbrat
gompertz -- Gompertz (Truncated Gumbel)
gumbel_r -- Right Sided Gumbel, Log-Weibull, Fisher-Tippett, Extreme Value Type I
gumbel_l -- Left Sided Gumbel, etc.
halfcauchy -- Half Cauchy
halflogistic -- Half Logistic
halfnorm -- Half Normal
halfgennorm -- Generalized Half Normal
hypsecant -- Hyperbolic Secant
invgamma -- Inverse Gamma
invgauss -- Inverse Gaussian
invweibull -- Inverse Weibull
johnsonsb -- Johnson SB
johnsonsu -- Johnson SU
kappa4 -- Kappa 4 parameter
kappa3 -- Kappa 3 parameter
ksone -- Distribution of Kolmogorov-Smirnov one-sided test statistic
kstwo -- Distribution of Kolmogorov-Smirnov two-sided test statistic
kstwobign -- Limiting Distribution of scaled Kolmogorov-Smirnov two-sided test statistic.
laplace -- Laplace
levy -- Levy
levy_l
levy_stable
logistic -- Logistic
loggamma -- Log-Gamma
loglaplace -- Log-Laplace (Log Double Exponential)
lognorm -- Log-Normal
loguniform -- Log-Uniform
lomax -- Lomax (Pareto of the second kind)
maxwell -- Maxwell
mielke -- Mielke's Beta-Kappa
moyal -- Moyal
nakagami -- Nakagami
ncx2 -- Non-central chi-squared
ncf -- Non-central F
nct -- Non-central Student's T
norm -- Normal (Gaussian)
norminvgauss -- Normal Inverse Gaussian
pareto -- Pareto
pearson3 -- Pearson type III
powerlaw -- Power-function
powerlognorm -- Power log normal
powernorm -- Power normal
rdist -- R-distribution
rayleigh -- Rayleigh
rice -- Rice
recipinvgauss -- Reciprocal Inverse Gaussian
semicircular -- Semicircular
skewnorm -- Skew normal
t -- Student's T
trapz -- Trapezoidal
triang -- Triangular
truncexpon -- Truncated Exponential
truncnorm -- Truncated Normal
tukeylambda -- Tukey-Lambda
uniform -- Uniform
vonmises -- Von-Mises (Circular)
vonmises_line -- Von-Mises (Line)
wald -- Wald
weibull_min -- Minimum Weibull (see Frechet)
weibull_max -- Maximum Weibull (see Frechet)
wrapcauchy -- Wrapped Cauchy
Multivariate distributions
==========================
.. autosummary::
:toctree: generated/
multivariate_normal -- Multivariate normal distribution
matrix_normal -- Matrix normal distribution
dirichlet -- Dirichlet
wishart -- Wishart
invwishart -- Inverse Wishart
multinomial -- Multinomial distribution
special_ortho_group -- SO(N) group
ortho_group -- O(N) group
unitary_group -- U(N) group
random_correlation -- random correlation matrices
Discrete distributions
======================
.. autosummary::
:toctree: generated/
bernoulli -- Bernoulli
betabinom -- Beta-Binomial
binom -- Binomial
boltzmann -- Boltzmann (Truncated Discrete Exponential)
dlaplace -- Discrete Laplacian
geom -- Geometric
hypergeom -- Hypergeometric
logser -- Logarithmic (Log-Series, Series)
nbinom -- Negative Binomial
planck -- Planck (Discrete Exponential)
poisson -- Poisson
randint -- Discrete Uniform
skellam -- Skellam
zipf -- Zipf
yulesimon -- Yule-Simon
An overview of statistical functions is given below.
Several of these functions have a similar version in
`scipy.stats.mstats` which work for masked arrays.
Summary statistics
==================
.. autosummary::
:toctree: generated/
describe -- Descriptive statistics
gmean -- Geometric mean
hmean -- Harmonic mean
kurtosis -- Fisher or Pearson kurtosis
mode -- Modal value
moment -- Central moment
skew -- Skewness
kstat --
kstatvar --
tmean -- Truncated arithmetic mean
tvar -- Truncated variance
tmin --
tmax --
tstd --
tsem --
variation -- Coefficient of variation
find_repeats
trim_mean
gstd -- Geometric Standard Deviation
iqr
sem
bayes_mvs
mvsdist
entropy
median_absolute_deviation
median_abs_deviation
Frequency statistics
====================
.. autosummary::
:toctree: generated/
cumfreq
itemfreq
percentileofscore
scoreatpercentile
relfreq
.. autosummary::
:toctree: generated/
binned_statistic -- Compute a binned statistic for a set of data.
binned_statistic_2d -- Compute a 2-D binned statistic for a set of data.
binned_statistic_dd -- Compute a d-D binned statistic for a set of data.
Correlation functions
=====================
.. autosummary::
:toctree: generated/
f_oneway
pearsonr
spearmanr
pointbiserialr
kendalltau
weightedtau
linregress
siegelslopes
theilslopes
multiscale_graphcorr
Statistical tests
=================
.. autosummary::
:toctree: generated/
ttest_1samp
ttest_ind
ttest_ind_from_stats
ttest_rel
chisquare
power_divergence
kstest
ks_1samp
ks_2samp
epps_singleton_2samp
mannwhitneyu
tiecorrect
rankdata
ranksums
wilcoxon
kruskal
friedmanchisquare
brunnermunzel
combine_pvalues
jarque_bera
.. autosummary::
:toctree: generated/
ansari
bartlett
levene
shapiro
anderson
anderson_ksamp
binom_test
fligner
median_test
mood
skewtest
kurtosistest
normaltest
Transformations
===============
.. autosummary::
:toctree: generated/
boxcox
boxcox_normmax
boxcox_llf
yeojohnson
yeojohnson_normmax
yeojohnson_llf
obrientransform
sigmaclip
trimboth
trim1
zmap
zscore
Statistical distances
=====================
.. autosummary::
:toctree: generated/
wasserstein_distance
energy_distance
Random variate generation
=========================
.. autosummary::
:toctree: generated/
rvs_ratio_uniforms
Circular statistical functions
==============================
.. autosummary::
:toctree: generated/
circmean
circvar
circstd
Contingency table functions
===========================
.. autosummary::
:toctree: generated/
chi2_contingency
contingency.expected_freq
contingency.margins
fisher_exact
Plot-tests
==========
.. autosummary::
:toctree: generated/
ppcc_max
ppcc_plot
probplot
boxcox_normplot
yeojohnson_normplot
Masked statistics functions
===========================
.. toctree::
stats.mstats
Univariate and multivariate kernel density estimation
=====================================================
.. autosummary::
:toctree: generated/
gaussian_kde
Warnings used in :mod:`scipy.stats`
===================================
.. autosummary::
:toctree: generated/
F_onewayConstantInputWarning
F_onewayBadInputSizesWarning
PearsonRConstantInputWarning
PearsonRNearConstantInputWarning
SpearmanRConstantInputWarning
For many more stat related functions install the software R and the
interface package rpy.
"""
from .stats import *
from .distributions import *
from .morestats import *
from ._binned_statistic import *
from .kde import gaussian_kde
from . import mstats
from .contingency import chi2_contingency
from ._multivariate import *
__all__ = [s for s in dir() if not s.startswith("_")] # Remove dunders.
from scipy._lib._testutils import PytestTester
test = PytestTester(__name__)
del PytestTester

View file

@ -0,0 +1,722 @@
import builtins
import numpy as np
from numpy.testing import suppress_warnings
from operator import index
from collections import namedtuple
__all__ = ['binned_statistic',
'binned_statistic_2d',
'binned_statistic_dd']
BinnedStatisticResult = namedtuple('BinnedStatisticResult',
('statistic', 'bin_edges', 'binnumber'))
def binned_statistic(x, values, statistic='mean',
bins=10, range=None):
"""
Compute a binned statistic for one or more sets of data.
This is a generalization of a histogram function. A histogram divides
the space into bins, and returns the count of the number of points in
each bin. This function allows the computation of the sum, mean, median,
or other statistic of the values (or set of values) within each bin.
Parameters
----------
x : (N,) array_like
A sequence of values to be binned.
values : (N,) array_like or list of (N,) array_like
The data on which the statistic will be computed. This must be
the same shape as `x`, or a set of sequences - each the same shape as
`x`. If `values` is a set of sequences, the statistic will be computed
on each independently.
statistic : string or callable, optional
The statistic to compute (default is 'mean').
The following statistics are available:
* 'mean' : compute the mean of values for points within each bin.
Empty bins will be represented by NaN.
* 'std' : compute the standard deviation within each bin. This
is implicitly calculated with ddof=0.
* 'median' : compute the median of values for points within each
bin. Empty bins will be represented by NaN.
* 'count' : compute the count of points within each bin. This is
identical to an unweighted histogram. `values` array is not
referenced.
* 'sum' : compute the sum of values for points within each bin.
This is identical to a weighted histogram.
* 'min' : compute the minimum of values for points within each bin.
Empty bins will be represented by NaN.
* 'max' : compute the maximum of values for point within each bin.
Empty bins will be represented by NaN.
* function : a user-defined function which takes a 1D array of
values, and outputs a single numerical statistic. This function
will be called on the values in each bin. Empty bins will be
represented by function([]), or NaN if this returns an error.
bins : int or sequence of scalars, optional
If `bins` is an int, it defines the number of equal-width bins in the
given range (10 by default). If `bins` is a sequence, it defines the
bin edges, including the rightmost edge, allowing for non-uniform bin
widths. Values in `x` that are smaller than lowest bin edge are
assigned to bin number 0, values beyond the highest bin are assigned to
``bins[-1]``. If the bin edges are specified, the number of bins will
be, (nx = len(bins)-1).
range : (float, float) or [(float, float)], optional
The lower and upper range of the bins. If not provided, range
is simply ``(x.min(), x.max())``. Values outside the range are
ignored.
Returns
-------
statistic : array
The values of the selected statistic in each bin.
bin_edges : array of dtype float
Return the bin edges ``(length(statistic)+1)``.
binnumber: 1-D ndarray of ints
Indices of the bins (corresponding to `bin_edges`) in which each value
of `x` belongs. Same length as `values`. A binnumber of `i` means the
corresponding value is between (bin_edges[i-1], bin_edges[i]).
See Also
--------
numpy.digitize, numpy.histogram, binned_statistic_2d, binned_statistic_dd
Notes
-----
All but the last (righthand-most) bin is half-open. In other words, if
`bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
but excluding 2) and the second ``[2, 3)``. The last bin, however, is
``[3, 4]``, which *includes* 4.
.. versionadded:: 0.11.0
Examples
--------
>>> from scipy import stats
>>> import matplotlib.pyplot as plt
First some basic examples:
Create two evenly spaced bins in the range of the given sample, and sum the
corresponding values in each of those bins:
>>> values = [1.0, 1.0, 2.0, 1.5, 3.0]
>>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
BinnedStatisticResult(statistic=array([4. , 4.5]),
bin_edges=array([1., 4., 7.]), binnumber=array([1, 1, 1, 2, 2]))
Multiple arrays of values can also be passed. The statistic is calculated
on each set independently:
>>> values = [[1.0, 1.0, 2.0, 1.5, 3.0], [2.0, 2.0, 4.0, 3.0, 6.0]]
>>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
BinnedStatisticResult(statistic=array([[4. , 4.5],
[8. , 9. ]]), bin_edges=array([1., 4., 7.]),
binnumber=array([1, 1, 1, 2, 2]))
>>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean',
... bins=3)
BinnedStatisticResult(statistic=array([1., 2., 4.]),
bin_edges=array([1., 2., 3., 4.]),
binnumber=array([1, 2, 1, 2, 3]))
As a second example, we now generate some random data of sailing boat speed
as a function of wind speed, and then determine how fast our boat is for
certain wind speeds:
>>> windspeed = 8 * np.random.rand(500)
>>> boatspeed = .3 * windspeed**.5 + .2 * np.random.rand(500)
>>> bin_means, bin_edges, binnumber = stats.binned_statistic(windspeed,
... boatspeed, statistic='median', bins=[1,2,3,4,5,6,7])
>>> plt.figure()
>>> plt.plot(windspeed, boatspeed, 'b.', label='raw data')
>>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=5,
... label='binned statistic of data')
>>> plt.legend()
Now we can use ``binnumber`` to select all datapoints with a windspeed
below 1:
>>> low_boatspeed = boatspeed[binnumber == 0]
As a final example, we will use ``bin_edges`` and ``binnumber`` to make a
plot of a distribution that shows the mean and distribution around that
mean per bin, on top of a regular histogram and the probability
distribution function:
>>> x = np.linspace(0, 5, num=500)
>>> x_pdf = stats.maxwell.pdf(x)
>>> samples = stats.maxwell.rvs(size=10000)
>>> bin_means, bin_edges, binnumber = stats.binned_statistic(x, x_pdf,
... statistic='mean', bins=25)
>>> bin_width = (bin_edges[1] - bin_edges[0])
>>> bin_centers = bin_edges[1:] - bin_width/2
>>> plt.figure()
>>> plt.hist(samples, bins=50, density=True, histtype='stepfilled',
... alpha=0.2, label='histogram of data')
>>> plt.plot(x, x_pdf, 'r-', label='analytical pdf')
>>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=2,
... label='binned statistic of data')
>>> plt.plot((binnumber - 0.5) * bin_width, x_pdf, 'g.', alpha=0.5)
>>> plt.legend(fontsize=10)
>>> plt.show()
"""
try:
N = len(bins)
except TypeError:
N = 1
if N != 1:
bins = [np.asarray(bins, float)]
if range is not None:
if len(range) == 2:
range = [range]
medians, edges, binnumbers = binned_statistic_dd(
[x], values, statistic, bins, range)
return BinnedStatisticResult(medians, edges[0], binnumbers)
BinnedStatistic2dResult = namedtuple('BinnedStatistic2dResult',
('statistic', 'x_edge', 'y_edge',
'binnumber'))
def binned_statistic_2d(x, y, values, statistic='mean',
bins=10, range=None, expand_binnumbers=False):
"""
Compute a bidimensional binned statistic for one or more sets of data.
This is a generalization of a histogram2d function. A histogram divides
the space into bins, and returns the count of the number of points in
each bin. This function allows the computation of the sum, mean, median,
or other statistic of the values (or set of values) within each bin.
Parameters
----------
x : (N,) array_like
A sequence of values to be binned along the first dimension.
y : (N,) array_like
A sequence of values to be binned along the second dimension.
values : (N,) array_like or list of (N,) array_like
The data on which the statistic will be computed. This must be
the same shape as `x`, or a list of sequences - each with the same
shape as `x`. If `values` is such a list, the statistic will be
computed on each independently.
statistic : string or callable, optional
The statistic to compute (default is 'mean').
The following statistics are available:
* 'mean' : compute the mean of values for points within each bin.
Empty bins will be represented by NaN.
* 'std' : compute the standard deviation within each bin. This
is implicitly calculated with ddof=0.
* 'median' : compute the median of values for points within each
bin. Empty bins will be represented by NaN.
* 'count' : compute the count of points within each bin. This is
identical to an unweighted histogram. `values` array is not
referenced.
* 'sum' : compute the sum of values for points within each bin.
This is identical to a weighted histogram.
* 'min' : compute the minimum of values for points within each bin.
Empty bins will be represented by NaN.
* 'max' : compute the maximum of values for point within each bin.
Empty bins will be represented by NaN.
* function : a user-defined function which takes a 1D array of
values, and outputs a single numerical statistic. This function
will be called on the values in each bin. Empty bins will be
represented by function([]), or NaN if this returns an error.
bins : int or [int, int] or array_like or [array, array], optional
The bin specification:
* the number of bins for the two dimensions (nx = ny = bins),
* the number of bins in each dimension (nx, ny = bins),
* the bin edges for the two dimensions (x_edge = y_edge = bins),
* the bin edges in each dimension (x_edge, y_edge = bins).
If the bin edges are specified, the number of bins will be,
(nx = len(x_edge)-1, ny = len(y_edge)-1).
range : (2,2) array_like, optional
The leftmost and rightmost edges of the bins along each dimension
(if not specified explicitly in the `bins` parameters):
[[xmin, xmax], [ymin, ymax]]. All values outside of this range will be
considered outliers and not tallied in the histogram.
expand_binnumbers : bool, optional
'False' (default): the returned `binnumber` is a shape (N,) array of
linearized bin indices.
'True': the returned `binnumber` is 'unraveled' into a shape (2,N)
ndarray, where each row gives the bin numbers in the corresponding
dimension.
See the `binnumber` returned value, and the `Examples` section.
.. versionadded:: 0.17.0
Returns
-------
statistic : (nx, ny) ndarray
The values of the selected statistic in each two-dimensional bin.
x_edge : (nx + 1) ndarray
The bin edges along the first dimension.
y_edge : (ny + 1) ndarray
The bin edges along the second dimension.
binnumber : (N,) array of ints or (2,N) ndarray of ints
This assigns to each element of `sample` an integer that represents the
bin in which this observation falls. The representation depends on the
`expand_binnumbers` argument. See `Notes` for details.
See Also
--------
numpy.digitize, numpy.histogram2d, binned_statistic, binned_statistic_dd
Notes
-----
Binedges:
All but the last (righthand-most) bin is half-open. In other words, if
`bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
but excluding 2) and the second ``[2, 3)``. The last bin, however, is
``[3, 4]``, which *includes* 4.
`binnumber`:
This returned argument assigns to each element of `sample` an integer that
represents the bin in which it belongs. The representation depends on the
`expand_binnumbers` argument. If 'False' (default): The returned
`binnumber` is a shape (N,) array of linearized indices mapping each
element of `sample` to its corresponding bin (using row-major ordering).
If 'True': The returned `binnumber` is a shape (2,N) ndarray where
each row indicates bin placements for each dimension respectively. In each
dimension, a binnumber of `i` means the corresponding value is between
(D_edge[i-1], D_edge[i]), where 'D' is either 'x' or 'y'.
.. versionadded:: 0.11.0
Examples
--------
>>> from scipy import stats
Calculate the counts with explicit bin-edges:
>>> x = [0.1, 0.1, 0.1, 0.6]
>>> y = [2.1, 2.6, 2.1, 2.1]
>>> binx = [0.0, 0.5, 1.0]
>>> biny = [2.0, 2.5, 3.0]
>>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny])
>>> ret.statistic
array([[2., 1.],
[1., 0.]])
The bin in which each sample is placed is given by the `binnumber`
returned parameter. By default, these are the linearized bin indices:
>>> ret.binnumber
array([5, 6, 5, 9])
The bin indices can also be expanded into separate entries for each
dimension using the `expand_binnumbers` parameter:
>>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny],
... expand_binnumbers=True)
>>> ret.binnumber
array([[1, 1, 1, 2],
[1, 2, 1, 1]])
Which shows that the first three elements belong in the xbin 1, and the
fourth into xbin 2; and so on for y.
"""
# This code is based on np.histogram2d
try:
N = len(bins)
except TypeError:
N = 1
if N != 1 and N != 2:
xedges = yedges = np.asarray(bins, float)
bins = [xedges, yedges]
medians, edges, binnumbers = binned_statistic_dd(
[x, y], values, statistic, bins, range,
expand_binnumbers=expand_binnumbers)
return BinnedStatistic2dResult(medians, edges[0], edges[1], binnumbers)
BinnedStatisticddResult = namedtuple('BinnedStatisticddResult',
('statistic', 'bin_edges',
'binnumber'))
def binned_statistic_dd(sample, values, statistic='mean',
bins=10, range=None, expand_binnumbers=False,
binned_statistic_result=None):
"""
Compute a multidimensional binned statistic for a set of data.
This is a generalization of a histogramdd function. A histogram divides
the space into bins, and returns the count of the number of points in
each bin. This function allows the computation of the sum, mean, median,
or other statistic of the values within each bin.
Parameters
----------
sample : array_like
Data to histogram passed as a sequence of N arrays of length D, or
as an (N,D) array.
values : (N,) array_like or list of (N,) array_like
The data on which the statistic will be computed. This must be
the same shape as `sample`, or a list of sequences - each with the
same shape as `sample`. If `values` is such a list, the statistic
will be computed on each independently.
statistic : string or callable, optional
The statistic to compute (default is 'mean').
The following statistics are available:
* 'mean' : compute the mean of values for points within each bin.
Empty bins will be represented by NaN.
* 'median' : compute the median of values for points within each
bin. Empty bins will be represented by NaN.
* 'count' : compute the count of points within each bin. This is
identical to an unweighted histogram. `values` array is not
referenced.
* 'sum' : compute the sum of values for points within each bin.
This is identical to a weighted histogram.
* 'std' : compute the standard deviation within each bin. This
is implicitly calculated with ddof=0. If the number of values
within a given bin is 0 or 1, the computed standard deviation value
will be 0 for the bin.
* 'min' : compute the minimum of values for points within each bin.
Empty bins will be represented by NaN.
* 'max' : compute the maximum of values for point within each bin.
Empty bins will be represented by NaN.
* function : a user-defined function which takes a 1D array of
values, and outputs a single numerical statistic. This function
will be called on the values in each bin. Empty bins will be
represented by function([]), or NaN if this returns an error.
bins : sequence or positive int, optional
The bin specification must be in one of the following forms:
* A sequence of arrays describing the bin edges along each dimension.
* The number of bins for each dimension (nx, ny, ... = bins).
* The number of bins for all dimensions (nx = ny = ... = bins).
range : sequence, optional
A sequence of lower and upper bin edges to be used if the edges are
not given explicitly in `bins`. Defaults to the minimum and maximum
values along each dimension.
expand_binnumbers : bool, optional
'False' (default): the returned `binnumber` is a shape (N,) array of
linearized bin indices.
'True': the returned `binnumber` is 'unraveled' into a shape (D,N)
ndarray, where each row gives the bin numbers in the corresponding
dimension.
See the `binnumber` returned value, and the `Examples` section of
`binned_statistic_2d`.
binned_statistic_result : binnedStatisticddResult
Result of a previous call to the function in order to reuse bin edges
and bin numbers with new values and/or a different statistic.
To reuse bin numbers, `expand_binnumbers` must have been set to False
(the default)
.. versionadded:: 0.17.0
Returns
-------
statistic : ndarray, shape(nx1, nx2, nx3,...)
The values of the selected statistic in each two-dimensional bin.
bin_edges : list of ndarrays
A list of D arrays describing the (nxi + 1) bin edges for each
dimension.
binnumber : (N,) array of ints or (D,N) ndarray of ints
This assigns to each element of `sample` an integer that represents the
bin in which this observation falls. The representation depends on the
`expand_binnumbers` argument. See `Notes` for details.
See Also
--------
numpy.digitize, numpy.histogramdd, binned_statistic, binned_statistic_2d
Notes
-----
Binedges:
All but the last (righthand-most) bin is half-open in each dimension. In
other words, if `bins` is ``[1, 2, 3, 4]``, then the first bin is
``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``. The
last bin, however, is ``[3, 4]``, which *includes* 4.
`binnumber`:
This returned argument assigns to each element of `sample` an integer that
represents the bin in which it belongs. The representation depends on the
`expand_binnumbers` argument. If 'False' (default): The returned
`binnumber` is a shape (N,) array of linearized indices mapping each
element of `sample` to its corresponding bin (using row-major ordering).
If 'True': The returned `binnumber` is a shape (D,N) ndarray where
each row indicates bin placements for each dimension respectively. In each
dimension, a binnumber of `i` means the corresponding value is between
(bin_edges[D][i-1], bin_edges[D][i]), for each dimension 'D'.
.. versionadded:: 0.11.0
Examples
--------
>>> from scipy import stats
>>> import matplotlib.pyplot as plt
>>> from mpl_toolkits.mplot3d import Axes3D
Take an array of 600 (x, y) coordinates as an example.
`binned_statistic_dd` can handle arrays of higher dimension `D`. But a plot
of dimension `D+1` is required.
>>> mu = np.array([0., 1.])
>>> sigma = np.array([[1., -0.5],[-0.5, 1.5]])
>>> multinormal = stats.multivariate_normal(mu, sigma)
>>> data = multinormal.rvs(size=600, random_state=235412)
>>> data.shape
(600, 2)
Create bins and count how many arrays fall in each bin:
>>> N = 60
>>> x = np.linspace(-3, 3, N)
>>> y = np.linspace(-3, 4, N)
>>> ret = stats.binned_statistic_dd(data, np.arange(600), bins=[x, y],
... statistic='count')
>>> bincounts = ret.statistic
Set the volume and the location of bars:
>>> dx = x[1] - x[0]
>>> dy = y[1] - y[0]
>>> x, y = np.meshgrid(x[:-1]+dx/2, y[:-1]+dy/2)
>>> z = 0
>>> bincounts = bincounts.ravel()
>>> x = x.ravel()
>>> y = y.ravel()
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111, projection='3d')
>>> with np.errstate(divide='ignore'): # silence random axes3d warning
... ax.bar3d(x, y, z, dx, dy, bincounts)
Reuse bin numbers and bin edges with new values:
>>> ret2 = stats.binned_statistic_dd(data, -np.arange(600),
... binned_statistic_result=ret,
... statistic='mean')
"""
known_stats = ['mean', 'median', 'count', 'sum', 'std', 'min', 'max']
if not callable(statistic) and statistic not in known_stats:
raise ValueError('invalid statistic %r' % (statistic,))
try:
bins = index(bins)
except TypeError:
# bins is not an integer
pass
# If bins was an integer-like object, now it is an actual Python int.
# NOTE: for _bin_edges(), see e.g. gh-11365
if isinstance(bins, int) and not np.isfinite(sample).all():
raise ValueError('%r contains non-finite values.' % (sample,))
# `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`)
# `Dlen` is the length of elements along each dimension.
# This code is based on np.histogramdd
try:
# `sample` is an ND-array.
Dlen, Ndim = sample.shape
except (AttributeError, ValueError):
# `sample` is a sequence of 1D arrays.
sample = np.atleast_2d(sample).T
Dlen, Ndim = sample.shape
# Store initial shape of `values` to preserve it in the output
values = np.asarray(values)
input_shape = list(values.shape)
# Make sure that `values` is 2D to iterate over rows
values = np.atleast_2d(values)
Vdim, Vlen = values.shape
# Make sure `values` match `sample`
if(statistic != 'count' and Vlen != Dlen):
raise AttributeError('The number of `values` elements must match the '
'length of each `sample` dimension.')
try:
M = len(bins)
if M != Ndim:
raise AttributeError('The dimension of bins must be equal '
'to the dimension of the sample x.')
except TypeError:
bins = Ndim * [bins]
if binned_statistic_result is None:
nbin, edges, dedges = _bin_edges(sample, bins, range)
binnumbers = _bin_numbers(sample, nbin, edges, dedges)
else:
edges = binned_statistic_result.bin_edges
nbin = np.array([len(edges[i]) + 1 for i in builtins.range(Ndim)])
# +1 for outlier bins
dedges = [np.diff(edges[i]) for i in builtins.range(Ndim)]
binnumbers = binned_statistic_result.binnumber
result = np.empty([Vdim, nbin.prod()], float)
if statistic == 'mean':
result.fill(np.nan)
flatcount = np.bincount(binnumbers, None)
a = flatcount.nonzero()
for vv in builtins.range(Vdim):
flatsum = np.bincount(binnumbers, values[vv])
result[vv, a] = flatsum[a] / flatcount[a]
elif statistic == 'std':
result.fill(0)
flatcount = np.bincount(binnumbers, None)
a = flatcount.nonzero()
for vv in builtins.range(Vdim):
for i in np.unique(binnumbers):
# NOTE: take std dev by bin, np.std() is 2-pass and stable
binned_data = values[vv, binnumbers == i]
# calc std only when binned data is 2 or more for speed up.
if len(binned_data) >= 2:
result[vv, i] = np.std(binned_data)
elif statistic == 'count':
result.fill(0)
flatcount = np.bincount(binnumbers, None)
a = np.arange(len(flatcount))
result[:, a] = flatcount[np.newaxis, :]
elif statistic == 'sum':
result.fill(0)
for vv in builtins.range(Vdim):
flatsum = np.bincount(binnumbers, values[vv])
a = np.arange(len(flatsum))
result[vv, a] = flatsum
elif statistic == 'median':
result.fill(np.nan)
for i in np.unique(binnumbers):
for vv in builtins.range(Vdim):
result[vv, i] = np.median(values[vv, binnumbers == i])
elif statistic == 'min':
result.fill(np.nan)
for i in np.unique(binnumbers):
for vv in builtins.range(Vdim):
result[vv, i] = np.min(values[vv, binnumbers == i])
elif statistic == 'max':
result.fill(np.nan)
for i in np.unique(binnumbers):
for vv in builtins.range(Vdim):
result[vv, i] = np.max(values[vv, binnumbers == i])
elif callable(statistic):
with np.errstate(invalid='ignore'), suppress_warnings() as sup:
sup.filter(RuntimeWarning)
try:
null = statistic([])
except Exception:
null = np.nan
result.fill(null)
for i in np.unique(binnumbers):
for vv in builtins.range(Vdim):
result[vv, i] = statistic(values[vv, binnumbers == i])
# Shape into a proper matrix
result = result.reshape(np.append(Vdim, nbin))
# Remove outliers (indices 0 and -1 for each bin-dimension).
core = tuple([slice(None)] + Ndim * [slice(1, -1)])
result = result[core]
# Unravel binnumbers into an ndarray, each row the bins for each dimension
if(expand_binnumbers and Ndim > 1):
binnumbers = np.asarray(np.unravel_index(binnumbers, nbin))
if np.any(result.shape[1:] != nbin - 2):
raise RuntimeError('Internal Shape Error')
# Reshape to have output (`result`) match input (`values`) shape
result = result.reshape(input_shape[:-1] + list(nbin-2))
return BinnedStatisticddResult(result, edges, binnumbers)
def _bin_edges(sample, bins=None, range=None):
""" Create edge arrays
"""
Dlen, Ndim = sample.shape
nbin = np.empty(Ndim, int) # Number of bins in each dimension
edges = Ndim * [None] # Bin edges for each dim (will be 2D array)
dedges = Ndim * [None] # Spacing between edges (will be 2D array)
# Select range for each dimension
# Used only if number of bins is given.
if range is None:
smin = np.atleast_1d(np.array(sample.min(axis=0), float))
smax = np.atleast_1d(np.array(sample.max(axis=0), float))
else:
smin = np.zeros(Ndim)
smax = np.zeros(Ndim)
for i in builtins.range(Ndim):
smin[i], smax[i] = range[i]
# Make sure the bins have a finite width.
for i in builtins.range(len(smin)):
if smin[i] == smax[i]:
smin[i] = smin[i] - .5
smax[i] = smax[i] + .5
# Create edge arrays
for i in builtins.range(Ndim):
if np.isscalar(bins[i]):
nbin[i] = bins[i] + 2 # +2 for outlier bins
edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1)
else:
edges[i] = np.asarray(bins[i], float)
nbin[i] = len(edges[i]) + 1 # +1 for outlier bins
dedges[i] = np.diff(edges[i])
nbin = np.asarray(nbin)
return nbin, edges, dedges
def _bin_numbers(sample, nbin, edges, dedges):
"""Compute the bin number each sample falls into, in each dimension
"""
Dlen, Ndim = sample.shape
sampBin = [
np.digitize(sample[:, i], edges[i])
for i in range(Ndim)
]
# Using `digitize`, values that fall on an edge are put in the right bin.
# For the rightmost bin, we want values equal to the right
# edge to be counted in the last bin, and not as an outlier.
for i in range(Ndim):
# Find the rounding precision
dedges_min = dedges[i].min()
if dedges_min == 0:
raise ValueError('The smallest edge difference is numerically 0.')
decimal = int(-np.log10(dedges_min)) + 6
# Find which points are on the rightmost edge.
on_edge = np.where(np.around(sample[:, i], decimal) ==
np.around(edges[i][-1], decimal))[0]
# Shift these points one bin to the left.
sampBin[i][on_edge] -= 1
# Compute the sample indices in the flattened statistic matrix.
binnumbers = np.ravel_multi_index(sampBin, nbin)
return binnumbers

View file

@ -0,0 +1,31 @@
"""
Statistics-related constants.
"""
import numpy as np
# The smallest representable positive number such that 1.0 + _EPS != 1.0.
_EPS = np.finfo(float).eps
# The largest [in magnitude] usable floating value.
_XMAX = np.finfo(float).max
# The log of the largest usable floating value; useful for knowing
# when exp(something) will overflow
_LOGXMAX = np.log(_XMAX)
# The smallest [in magnitude] usable floating value.
_XMIN = np.finfo(float).tiny
# -special.psi(1)
_EULER = 0.577215664901532860606512090082402431042
# special.zeta(3, 1) Apery's constant
_ZETA3 = 1.202056903159594285399738161511449990765
# sqrt(2/pi)
_SQRT_2_OVER_PI = 0.7978845608028654
# log(sqrt(2/pi))
_LOG_SQRT_2_OVER_PI = -0.22579135264472744

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,134 @@
"""
Sane parameters for stats.distributions.
"""
distcont = [
['alpha', (3.5704770516650459,)],
['anglit', ()],
['arcsine', ()],
['argus', (1.0,)],
['beta', (2.3098496451481823, 0.62687954300963677)],
['betaprime', (5, 6)],
['bradford', (0.29891359763170633,)],
['burr', (10.5, 4.3)],
['burr12', (10, 4)],
['cauchy', ()],
['chi', (78,)],
['chi2', (55,)],
['cosine', ()],
['crystalball', (2.0, 3.0)],
['dgamma', (1.1023326088288166,)],
['dweibull', (2.0685080649914673,)],
['erlang', (10,)],
['expon', ()],
['exponnorm', (1.5,)],
['exponpow', (2.697119160358469,)],
['exponweib', (2.8923945291034436, 1.9505288745913174)],
['f', (29, 18)],
['fatiguelife', (29,)], # correction numargs = 1
['fisk', (3.0857548622253179,)],
['foldcauchy', (4.7164673455831894,)],
['foldnorm', (1.9521253373555869,)],
['frechet_l', (3.6279911255583239,)],
['frechet_r', (1.8928171603534227,)],
['gamma', (1.9932305483800778,)],
['gausshyper', (13.763771604130699, 3.1189636648681431,
2.5145980350183019, 5.1811649903971615)], # veryslow
['genexpon', (9.1325976465418908, 16.231956600590632, 3.2819552690843983)],
['genextreme', (-0.1,)],
['gengamma', (4.4162385429431925, 3.1193091679242761)],
['gengamma', (4.4162385429431925, -3.1193091679242761)],
['genhalflogistic', (0.77274727809929322,)],
['geninvgauss', (2.3, 1.5)],
['genlogistic', (0.41192440799679475,)],
['gennorm', (1.2988442399460265,)],
['halfgennorm', (0.6748054997000371,)],
['genpareto', (0.1,)], # use case with finite moments
['gilbrat', ()],
['gompertz', (0.94743713075105251,)],
['gumbel_l', ()],
['gumbel_r', ()],
['halfcauchy', ()],
['halflogistic', ()],
['halfnorm', ()],
['hypsecant', ()],
['invgamma', (4.0668996136993067,)],
['invgauss', (0.14546264555347513,)],
['invweibull', (10.58,)],
['johnsonsb', (4.3172675099141058, 3.1837781130785063)],
['johnsonsu', (2.554395574161155, 2.2482281679651965)],
['kappa4', (0.0, 0.0)],
['kappa4', (-0.1, 0.1)],
['kappa4', (0.0, 0.1)],
['kappa4', (0.1, 0.0)],
['kappa3', (1.0,)],
['ksone', (1000,)], # replace 22 by 100 to avoid failing range, ticket 956
['kstwo', (10,)],
['kstwobign', ()],
['laplace', ()],
['levy', ()],
['levy_l', ()],
['levy_stable', (1.8, -0.5)],
['loggamma', (0.41411931826052117,)],
['logistic', ()],
['loglaplace', (3.2505926592051435,)],
['lognorm', (0.95368226960575331,)],
['loguniform', (0.01, 1)],
['lomax', (1.8771398388773268,)],
['maxwell', ()],
['mielke', (10.4, 4.6)],
['moyal', ()],
['nakagami', (4.9673794866666237,)],
['ncf', (27, 27, 0.41578441799226107)],
['nct', (14, 0.24045031331198066)],
['ncx2', (21, 1.0560465975116415)],
['norm', ()],
['norminvgauss', (1., 0.5)],
['pareto', (2.621716532144454,)],
['pearson3', (0.1,)],
['powerlaw', (1.6591133289905851,)],
['powerlognorm', (2.1413923530064087, 0.44639540782048337)],
['powernorm', (4.4453652254590779,)],
['rayleigh', ()],
['rdist', (1.6,)],
['recipinvgauss', (0.63004267809369119,)],
['reciprocal', (0.01, 1)],
['rice', (0.7749725210111873,)],
['semicircular', ()],
['skewnorm', (4.0,)],
['t', (2.7433514990818093,)],
['trapz', (0.2, 0.8)],
['triang', (0.15785029824528218,)],
['truncexpon', (4.6907725456810478,)],
['truncnorm', (-1.0978730080013919, 2.7306754109031979)],
['truncnorm', (0.1, 2.)],
['tukeylambda', (3.1321477856738267,)],
['uniform', ()],
['vonmises', (3.9939042581071398,)],
['vonmises_line', (3.9939042581071398,)],
['wald', ()],
['weibull_max', (2.8687961709100187,)],
['weibull_min', (1.7866166930421596,)],
['wrapcauchy', (0.031071279018614728,)]]
distdiscrete = [
['bernoulli',(0.3,)],
['betabinom', (5, 2.3, 0.63)],
['binom', (5, 0.4)],
['boltzmann',(1.4, 19)],
['dlaplace', (0.8,)], # 0.5
['geom', (0.5,)],
['hypergeom',(30, 12, 6)],
['hypergeom',(21,3,12)], # numpy.random (3,18,12) numpy ticket:921
['hypergeom',(21,18,11)], # numpy.random (18,3,11) numpy ticket:921
['logser', (0.6,)], # re-enabled, numpy ticket:921
['nbinom', (5, 0.5)],
['nbinom', (0.4, 0.4)], # from tickets: 583
['planck', (0.51,)], # 4.1
['poisson', (0.6,)],
['randint', (7, 31)],
['skellam', (15, 8)],
['zipf', (6.5,)],
['yulesimon',(11.0,)]
]

View file

@ -0,0 +1,148 @@
from collections import namedtuple
import numpy as np
import warnings
from ._continuous_distns import chi2
from . import _wilcoxon_data
Epps_Singleton_2sampResult = namedtuple('Epps_Singleton_2sampResult',
('statistic', 'pvalue'))
def epps_singleton_2samp(x, y, t=(0.4, 0.8)):
"""
Compute the Epps-Singleton (ES) test statistic.
Test the null hypothesis that two samples have the same underlying
probability distribution.
Parameters
----------
x, y : array-like
The two samples of observations to be tested. Input must not have more
than one dimension. Samples can have different lengths.
t : array-like, optional
The points (t1, ..., tn) where the empirical characteristic function is
to be evaluated. It should be positive distinct numbers. The default
value (0.4, 0.8) is proposed in [1]_. Input must not have more than
one dimension.
Returns
-------
statistic : float
The test statistic.
pvalue : float
The associated p-value based on the asymptotic chi2-distribution.
See Also
--------
ks_2samp, anderson_ksamp
Notes
-----
Testing whether two samples are generated by the same underlying
distribution is a classical question in statistics. A widely used test is
the Kolmogorov-Smirnov (KS) test which relies on the empirical
distribution function. Epps and Singleton introduce a test based on the
empirical characteristic function in [1]_.
One advantage of the ES test compared to the KS test is that is does
not assume a continuous distribution. In [1]_, the authors conclude
that the test also has a higher power than the KS test in many
examples. They recommend the use of the ES test for discrete samples as
well as continuous samples with at least 25 observations each, whereas
`anderson_ksamp` is recommended for smaller sample sizes in the
continuous case.
The p-value is computed from the asymptotic distribution of the test
statistic which follows a `chi2` distribution. If the sample size of both
`x` and `y` is below 25, the small sample correction proposed in [1]_ is
applied to the test statistic.
The default values of `t` are determined in [1]_ by considering
various distributions and finding good values that lead to a high power
of the test in general. Table III in [1]_ gives the optimal values for
the distributions tested in that study. The values of `t` are scaled by
the semi-interquartile range in the implementation, see [1]_.
References
----------
.. [1] T. W. Epps and K. J. Singleton, "An omnibus test for the two-sample
problem using the empirical characteristic function", Journal of
Statistical Computation and Simulation 26, p. 177--203, 1986.
.. [2] S. J. Goerg and J. Kaiser, "Nonparametric testing of distributions
- the Epps-Singleton two-sample test using the empirical characteristic
function", The Stata Journal 9(3), p. 454--465, 2009.
"""
x, y, t = np.asarray(x), np.asarray(y), np.asarray(t)
# check if x and y are valid inputs
if x.ndim > 1:
raise ValueError('x must be 1d, but x.ndim equals {}.'.format(x.ndim))
if y.ndim > 1:
raise ValueError('y must be 1d, but y.ndim equals {}.'.format(y.ndim))
nx, ny = len(x), len(y)
if (nx < 5) or (ny < 5):
raise ValueError('x and y should have at least 5 elements, but len(x) '
'= {} and len(y) = {}.'.format(nx, ny))
if not np.isfinite(x).all():
raise ValueError('x must not contain nonfinite values.')
if not np.isfinite(y).all():
raise ValueError('y must not contain nonfinite values.')
n = nx + ny
# check if t is valid
if t.ndim > 1:
raise ValueError('t must be 1d, but t.ndim equals {}.'.format(t.ndim))
if np.less_equal(t, 0).any():
raise ValueError('t must contain positive elements only.')
# rescale t with semi-iqr as proposed in [1]; import iqr here to avoid
# circular import
from scipy.stats import iqr
sigma = iqr(np.hstack((x, y))) / 2
ts = np.reshape(t, (-1, 1)) / sigma
# covariance estimation of ES test
gx = np.vstack((np.cos(ts*x), np.sin(ts*x))).T # shape = (nx, 2*len(t))
gy = np.vstack((np.cos(ts*y), np.sin(ts*y))).T
cov_x = np.cov(gx.T, bias=True) # the test uses biased cov-estimate
cov_y = np.cov(gy.T, bias=True)
est_cov = (n/nx)*cov_x + (n/ny)*cov_y
est_cov_inv = np.linalg.pinv(est_cov)
r = np.linalg.matrix_rank(est_cov_inv)
if r < 2*len(t):
warnings.warn('Estimated covariance matrix does not have full rank. '
'This indicates a bad choice of the input t and the '
'test might not be consistent.') # see p. 183 in [1]_
# compute test statistic w distributed asympt. as chisquare with df=r
g_diff = np.mean(gx, axis=0) - np.mean(gy, axis=0)
w = n*np.dot(g_diff.T, np.dot(est_cov_inv, g_diff))
# apply small-sample correction
if (max(nx, ny) < 25):
corr = 1.0/(1.0 + n**(-0.45) + 10.1*(nx**(-1.7) + ny**(-1.7)))
w = corr * w
p = chi2.sf(w, r)
return Epps_Singleton_2sampResult(w, p)
def _get_wilcoxon_distr(n):
"""
Distribution of counts of the Wilcoxon ranksum statistic r_plus (sum of
ranks of positive differences).
Returns an array with the counts/frequencies of all the possible ranks
r = 0, ..., n*(n+1)/2
"""
cnt = _wilcoxon_data.COUNTS.get(n)
if cnt is None:
raise ValueError("The exact distribution of the Wilcoxon test "
"statistic is not implemented for n={}".format(n))
return np.array(cnt, dtype=int)

View file

@ -0,0 +1,596 @@
# Compute the two-sided one-sample Kolmogorov-Smirnov Prob(Dn <= d) where:
# D_n = sup_x{|F_n(x) - F(x)|},
# F_n(x) is the empirical CDF for a sample of size n {x_i: i=1,...,n},
# F(x) is the CDF of a probability distribution.
#
# Exact methods:
# Prob(D_n >= d) can be computed via a matrix algorithm of Durbin[1]
# or a recursion algorithm due to Pomeranz[2].
# Marsaglia, Tsang & Wang[3] gave a computation-efficient way to perform
# the Durbin algorithm.
# D_n >= d <==> D_n+ >= d or D_n- >= d (the one-sided K-S statistics), hence
# Prob(D_n >= d) = 2*Prob(D_n+ >= d) - Prob(D_n+ >= d and D_n- >= d).
# For d > 0.5, the latter intersection probability is 0.
#
# Approximate methods:
# For d close to 0.5, ignoring that intersection term may still give a
# reasonable approximation.
# Li-Chien[4] and Korolyuk[5] gave an asymptotic formula extending
# Kolmogorov's initial asymptotic, suitable for large d. (See
# scipy.special.kolmogorov for that asymptotic)
# Pelz-Good[6] used the functional equation for Jacobi theta functions to
# transform the Li-Chien/Korolyuk formula produce a computational formula
# suitable for small d.
#
# Simard and L'Ecuyer[7] provided an algorithm to decide when to use each of
# the above approaches and it is that which is used here.
#
# Other approaches:
# Carvalho[8] optimizes Durbin's matrix algorithm for large values of d.
# Moscovich and Nadler[9] use FFTs to compute the convolutions.
# References:
# [1] Durbin J (1968).
# "The Probability that the Sample Distribution Function Lies Between Two
# Parallel Straight Lines."
# Annals of Mathematical Statistics, 39, 398-411.
# [2] Pomeranz J (1974).
# "Exact Cumulative Distribution of the Kolmogorov-Smirnov Statistic for
# Small Samples (Algorithm 487)."
# Communications of the ACM, 17(12), 703-704.
# [3] Marsaglia G, Tsang WW, Wang J (2003).
# "Evaluating Kolmogorov's Distribution."
# Journal of Statistical Software, 8(18), 1-4.
# [4] LI-CHIEN, C. (1956).
# "On the exact distribution of the statistics of A. N. Kolmogorov and
# their asymptotic expansion."
# Acta Matematica Sinica, 6, 55-81.
# [5] KOROLYUK, V. S. (1960).
# "Asymptotic analysis of the distribution of the maximum deviation in
# the Bernoulli scheme."
# Theor. Probability Appl., 4, 339-366.
# [6] Pelz W, Good IJ (1976).
# "Approximating the Lower Tail-areas of the Kolmogorov-Smirnov One-sample
# Statistic."
# Journal of the Royal Statistical Society, Series B, 38(2), 152-156.
# [7] Simard, R., L'Ecuyer, P. (2011)
# "Computing the Two-Sided Kolmogorov-Smirnov Distribution",
# Journal of Statistical Software, Vol 39, 11, 1-18.
# [8] Carvalho, Luis (2015)
# "An Improved Evaluation of Kolmogorov's Distribution"
# Journal of Statistical Software, Code Snippets; Vol 65(3), 1-8.
# [9] Amit Moscovich, Boaz Nadler (2017)
# "Fast calculation of boundary crossing probabilities for Poisson
# processes",
# Statistics & Probability Letters, Vol 123, 177-182.
import numpy as np
import scipy.special
import scipy.special._ufuncs as scu
import scipy.misc
_E128 = 128
_EP128 = np.ldexp(np.longdouble(1), _E128)
_EM128 = np.ldexp(np.longdouble(1), -_E128)
_SQRT2PI = np.sqrt(2 * np.pi)
_LOG_2PI = np.log(2 * np.pi)
_MIN_LOG = -708
_SQRT3 = np.sqrt(3)
_PI_SQUARED = np.pi ** 2
_PI_FOUR = np.pi ** 4
_PI_SIX = np.pi ** 6
# [Lifted from _loggamma.pxd.] If B_m are the Bernoulli numbers,
# then Stirling coeffs are B_{2j}/(2j)/(2j-1) for j=8,...1.
_STIRLING_COEFFS = [-2.955065359477124183e-2, 6.4102564102564102564e-3,
-1.9175269175269175269e-3, 8.4175084175084175084e-4,
-5.952380952380952381e-4, 7.9365079365079365079e-4,
-2.7777777777777777778e-3, 8.3333333333333333333e-2]
def _log_nfactorial_div_n_pow_n(n):
# Computes n! / n**n
# = (n-1)! / n**(n-1)
# Uses Stirling's approximation, but removes n*log(n) up-front to
# avoid subtractive cancellation.
# = log(n)/2 - n + log(sqrt(2pi)) + sum B_{2j}/(2j)/(2j-1)/n**(2j-1)
rn = 1.0/n
return np.log(n)/2 - n + _LOG_2PI/2 + rn * np.polyval(_STIRLING_COEFFS, rn/n)
def _clip_prob(p):
"""clips a probability to range 0<=p<=1."""
return np.clip(p, 0.0, 1.0)
def _select_and_clip_prob(cdfprob, sfprob, cdf=True):
"""Selects either the CDF or SF, and then clips to range 0<=p<=1."""
p = np.where(cdf, cdfprob, sfprob)
return _clip_prob(p)
def _kolmogn_DMTW(n, d, cdf=True):
r"""Computes the Kolmogorov CDF: Pr(D_n <= d) using the MTW approach to
the Durbin matrix algorithm.
Durbin (1968); Marsaglia, Tsang, Wang (2003). [1], [3].
"""
# Write d = (k-h)/n, where k is positive integer and 0 <= h < 1
# Generate initial matrix H of size m*m where m=(2k-1)
# Compute k-th row of (n!/n^n) * H^n, scaling intermediate results.
# Requires memory O(m^2) and computation O(m^2 log(n)).
# Most suitable for small m.
if d >= 1.0:
return _select_and_clip_prob(1.0, 0.0, cdf)
nd = n * d
if nd <= 0.5:
return _select_and_clip_prob(0.0, 1.0, cdf)
k = int(np.ceil(nd))
h = k - nd
m = 2 * k - 1
H = np.zeros([m, m])
# Initialize: v is first column (and last row) of H
# v[j] = (1-h^(j+1)/(j+1)! (except for v[-1])
# w[j] = 1/(j)!
# q = k-th row of H (actually i!/n^i*H^i)
intm = np.arange(1, m + 1)
v = 1.0 - h ** intm
w = np.zeros(m)
fac = 1.0
for j in intm:
w[j - 1] = fac
fac /= j # This might underflow. Isn't a problem.
v[j - 1] *= fac
tt = max(2 * h - 1.0, 0)**m - 2*h**m
v[-1] = (1.0 + tt) * fac
for i in range(1, m):
H[i - 1:, i] = w[:m - i + 1]
H[:, 0] = v
H[-1, :] = np.flip(v, axis=0)
Hpwr = np.eye(np.shape(H)[0]) # Holds intermediate powers of H
nn = n
expnt = 0 # Scaling of Hpwr
Hexpnt = 0 # Scaling of H
while nn > 0:
if nn % 2:
Hpwr = np.matmul(Hpwr, H)
expnt += Hexpnt
H = np.matmul(H, H)
Hexpnt *= 2
# Scale as needed.
if np.abs(H[k - 1, k - 1]) > _EP128:
H /= _EP128
Hexpnt += _E128
nn = nn // 2
p = Hpwr[k - 1, k - 1]
# Multiply by n!/n^n
for i in range(1, n + 1):
p = i * p / n
if np.abs(p) < _EM128:
p *= _EP128
expnt -= _E128
# unscale
if expnt != 0:
p = np.ldexp(p, expnt)
return _select_and_clip_prob(p, 1.0-p, cdf)
def _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf):
"""Compute the endpoints of the interval for row i."""
if i == 0:
j1, j2 = -ll - ceilf - 1, ll + ceilf - 1
else:
# i + 1 = 2*ip1div2 + ip1mod2
ip1div2, ip1mod2 = divmod(i + 1, 2)
if ip1mod2 == 0: # i is odd
if ip1div2 == n + 1:
j1, j2 = n - ll - ceilf - 1, n + ll + ceilf - 1
else:
j1, j2 = ip1div2 - 1 - ll - roundf - 1, ip1div2 + ll - 1 + ceilf - 1
else:
j1, j2 = ip1div2 - 1 - ll - 1, ip1div2 + ll + roundf - 1
return max(j1 + 2, 0), min(j2, n)
def _kolmogn_Pomeranz(n, x, cdf=True):
r"""Computes Pr(D_n <= d) using the Pomeranz recursion algorithm.
Pomeranz (1974) [2]
"""
# V is n*(2n+2) matrix.
# Each row is convolution of the previous row and probabilities from a
# Poisson distribution.
# Desired CDF probability is n! V[n-1, 2n+1] (final entry in final row).
# Only two rows are needed at any given stage:
# - Call them V0 and V1.
# - Swap each iteration
# Only a few (contiguous) entries in each row can be non-zero.
# - Keep track of start and end (j1 and j2 below)
# - V0s and V1s track the start in the two rows
# Scale intermediate results as needed.
# Only a few different Poisson distributions can occur
t = n * x
ll = int(np.floor(t))
f = 1.0 * (t - ll) # fractional part of t
g = min(f, 1.0 - f)
ceilf = (1 if f > 0 else 0)
roundf = (1 if f > 0.5 else 0)
npwrs = 2 * (ll + 1) # Maximum number of powers needed in convolutions
gpower = np.zeros(npwrs) # gpower = (g/n)^m/m!
twogpower = np.zeros(npwrs) # twogpower = (2g/n)^m/m!
onem2gpower = np.zeros(npwrs) # onem2gpower = ((1-2g)/n)^m/m!
# gpower etc are *almost* Poisson probs, just missing normalizing factor.
gpower[0] = 1.0
twogpower[0] = 1.0
onem2gpower[0] = 1.0
expnt = 0
g_over_n, two_g_over_n, one_minus_two_g_over_n = g/n, 2*g/n, (1 - 2*g)/n
for m in range(1, npwrs):
gpower[m] = gpower[m - 1] * g_over_n / m
twogpower[m] = twogpower[m - 1] * two_g_over_n / m
onem2gpower[m] = onem2gpower[m - 1] * one_minus_two_g_over_n / m
V0 = np.zeros([npwrs])
V1 = np.zeros([npwrs])
V1[0] = 1 # first row
V0s, V1s = 0, 0 # start indices of the two rows
j1, j2 = _pomeranz_compute_j1j2(0, n, ll, ceilf, roundf)
for i in range(1, 2 * n + 2):
# Preserve j1, V1, V1s, V0s from last iteration
k1 = j1
V0, V1 = V1, V0
V0s, V1s = V1s, V0s
V1.fill(0.0)
j1, j2 = _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf)
if i == 1 or i == 2 * n + 1:
pwrs = gpower
else:
pwrs = (twogpower if i % 2 else onem2gpower)
ln2 = j2 - k1 + 1
if ln2 > 0:
conv = np.convolve(V0[k1 - V0s:k1 - V0s + ln2], pwrs[:ln2])
conv_start = j1 - k1 # First index to use from conv
conv_len = j2 - j1 + 1 # Number of entries to use from conv
V1[:conv_len] = conv[conv_start:conv_start + conv_len]
# Scale to avoid underflow.
if 0 < np.max(V1) < _EM128:
V1 *= _EP128
expnt -= _E128
V1s = V0s + j1 - k1
# multiply by n!
ans = V1[n - V1s]
for m in range(1, n + 1):
if np.abs(ans) > _EP128:
ans *= _EM128
expnt += _E128
ans *= m
# Undo any intermediate scaling
if expnt != 0:
ans = np.ldexp(ans, expnt)
ans = _select_and_clip_prob(ans, 1.0 - ans, cdf)
return ans
def _kolmogn_PelzGood(n, x, cdf=True):
"""Computes the Pelz-Good approximation to Prob(Dn <= x) with 0<=x<=1.
Start with Li-Chien, Korolyuk approximation:
Prob(Dn <= x) ~ K0(z) + K1(z)/sqrt(n) + K2(z)/n + K3(z)/n**1.5
where z = x*sqrt(n).
Transform each K_(z) using Jacobi theta functions into a form suitable
for small z.
Pelz-Good (1976). [6]
"""
if x <= 0.0:
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
if x >= 1.0:
return _select_and_clip_prob(1.0, 0.0, cdf=cdf)
z = np.sqrt(n) * x
zsquared, zthree, zfour, zsix = z**2, z**3, z**4, z**6
qlog = -_PI_SQUARED / 8 / zsquared
if qlog < _MIN_LOG: # z ~ 0.041743441416853426
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
q = np.exp(qlog)
# Coefficients of terms in the sums for K1, K2 and K3
k1a = -zsquared
k1b = _PI_SQUARED / 4
k2a = 6 * zsix + 2 * zfour
k2b = (2 * zfour - 5 * zsquared) * _PI_SQUARED / 4
k2c = _PI_FOUR * (1 - 2 * zsquared) / 16
k3d = _PI_SIX * (5 - 30 * zsquared) / 64
k3c = _PI_FOUR * (-60 * zsquared + 212 * zfour) / 16
k3b = _PI_SQUARED * (135 * zfour - 96 * zsix) / 4
k3a = -30 * zsix - 90 * z**8
K0to3 = np.zeros(4)
# Use a Horner scheme to evaluate sum c_i q^(i^2)
# Reduces to a sum over odd integers.
maxk = int(np.ceil(16 * z / np.pi))
for k in range(maxk, 0, -1):
m = 2 * k - 1
msquared, mfour, msix = m**2, m**4, m**6
qpower = np.power(q, 8 * k)
coeffs = np.array([1.0,
k1a + k1b*msquared,
k2a + k2b*msquared + k2c*mfour,
k3a + k3b*msquared + k3c*mfour + k3d*msix])
K0to3 *= qpower
K0to3 += coeffs
K0to3 *= q
K0to3 *= _SQRT2PI
# z**10 > 0 as z > 0.04
K0to3 /= np.array([z, 6 * zfour, 72 * z**7, 6480 * z**10])
# Now do the other sum over the other terms, all integers k
# K_2: (pi^2 k^2) q^(k^2),
# K_3: (3pi^2 k^2 z^2 - pi^4 k^4)*q^(k^2)
# Don't expect much subtractive cancellation so use direct calculation
q = np.exp(-_PI_SQUARED / 2 / zsquared)
ks = np.arange(maxk, 0, -1)
ksquared = ks ** 2
sqrt3z = _SQRT3 * z
kspi = np.pi * ks
qpwers = q ** ksquared
k2extra = np.sum(ksquared * qpwers)
k2extra *= _PI_SQUARED * _SQRT2PI/(-36 * zthree)
K0to3[2] += k2extra
k3extra = np.sum((sqrt3z + kspi) * (sqrt3z - kspi) * ksquared * qpwers)
k3extra *= _PI_SQUARED * _SQRT2PI/(216 * zsix)
K0to3[3] += k3extra
powers_of_n = np.power(n * 1.0, np.arange(len(K0to3)) / 2.0)
K0to3 /= powers_of_n
if not cdf:
K0to3 *= -1
K0to3[0] += 1
Ksum = sum(K0to3)
return Ksum
def _kolmogn(n, x, cdf=True):
"""Computes the CDF(or SF) for the two-sided Kolmogorov-Smirnov statistic.
x must be of type float, n of type integer.
Simard & L'Ecuyer (2011) [7].
"""
if np.isnan(n):
return n # Keep the same type of nan
if int(n) != n or n <= 0:
return np.nan
if x >= 1.0:
return _select_and_clip_prob(1.0, 0.0, cdf=cdf)
if x <= 0.0:
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
t = n * x
if t <= 1.0: # Ruben-Gambino: 1/2n <= x <= 1/n
if t <= 0.5:
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
if n <= 140:
prob = np.prod(np.arange(1, n+1) * (1.0/n) * (2*t - 1))
else:
prob = np.exp(_log_nfactorial_div_n_pow_n(n) + n * np.log(2*t-1))
return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
if t >= n - 1: # Ruben-Gambino
prob = 2 * (1.0 - x)**n
return _select_and_clip_prob(1 - prob, prob, cdf=cdf)
if x >= 0.5: # Exact: 2 * smirnov
prob = 2 * scipy.special.smirnov(n, x)
return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)
nxsquared = t * x
if n <= 140:
if nxsquared <= 0.754693:
prob = _kolmogn_DMTW(n, x, cdf=True)
return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
if nxsquared <= 4:
prob = _kolmogn_Pomeranz(n, x, cdf=True)
return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
# Now use Miller approximation of 2*smirnov
prob = 2 * scipy.special.smirnov(n, x)
return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)
# Split CDF and SF as they have different cutoffs on nxsquared.
if not cdf:
if nxsquared >= 370.0:
return 0.0
if nxsquared >= 2.2:
prob = 2 * scipy.special.smirnov(n, x)
return _clip_prob(prob)
# Fall through and compute the SF as 1.0-CDF
if nxsquared >= 18.0:
cdfprob = 1.0
elif n <= 100000 and n * x**1.5 <= 1.4:
cdfprob = _kolmogn_DMTW(n, x, cdf=True)
else:
cdfprob = _kolmogn_PelzGood(n, x, cdf=True)
return _select_and_clip_prob(cdfprob, 1.0 - cdfprob, cdf=cdf)
def _kolmogn_p(n, x):
"""Computes the PDF for the two-sided Kolmogorov-Smirnov statistic.
x must be of type float, n of type integer.
"""
if np.isnan(n):
return n # Keep the same type of nan
if int(n) != n or n <= 0:
return np.nan
if x >= 1.0 or x <= 0:
return 0
t = n * x
if t <= 1.0:
# Ruben-Gambino: n!/n^n * (2t-1)^n -> 2 n!/n^n * n^2 * (2t-1)^(n-1)
if t <= 0.5:
return 0.0
if n <= 140:
prd = np.prod(np.arange(1, n) * (1.0 / n) * (2 * t - 1))
else:
prd = np.exp(_log_nfactorial_div_n_pow_n(n) + (n-1) * np.log(2 * t - 1))
return prd * 2 * n**2
if t >= n - 1:
# Ruben-Gambino : 1-2(1-x)**n -> 2n*(1-x)**(n-1)
return 2 * (1.0 - x) ** (n-1) * n
if x >= 0.5:
return 2 * scipy.stats.ksone.pdf(x, n)
# Just take a small delta.
# Ideally x +/- delta would stay within [i/n, (i+1)/n] for some integer a.
# as the CDF is a piecewise degree n polynomial.
# It has knots at 1/n, 2/n, ... (n-1)/n
# and is not a C-infinity function at the knots
delta = x / 2.0**16
delta = min(delta, x - 1.0/n)
delta = min(delta, 0.5 - x)
def _kk(_x):
return kolmogn(n, _x)
return scipy.misc.derivative(_kk, x, dx=delta, order=5)
def _kolmogni(n, p, q):
"""Computes the PPF/ISF of kolmogn.
n of type integer, n>= 1
p is the CDF, q the SF, p+q=1
"""
if np.isnan(n):
return n # Keep the same type of nan
if int(n) != n or n <= 0:
return np.nan
if p <= 0:
return 1.0/n
if q <= 0:
return 1.0
delta = np.exp((np.log(p) - scipy.special.loggamma(n+1))/n)
if delta <= 1.0/n:
return (delta + 1.0 / n) / 2
x = -np.expm1(np.log(q/2.0)/n)
if x >= 1 - 1.0/n:
return x
x1 = scu._kolmogci(p)/np.sqrt(n)
x1 = min(x1, 1.0 - 1.0/n)
_f = lambda x: _kolmogn(n, x) - p
return scipy.optimize.brentq(_f, 1.0/n, x1, xtol=1e-14)
def kolmogn(n, x, cdf=True):
"""Computes the CDF for the two-sided Kolmogorov-Smirnov distribution.
The two-sided Kolmogorov-Smirnov distribution has as its CDF Pr(D_n <= x),
for a sample of size n drawn from a distribution with CDF F(t), where
D_n &= sup_t |F_n(t) - F(t)|, and
F_n(t) is the Empirical Cumulative Distribution Function of the sample.
Parameters
----------
n : integer, array_like
the number of samples
x : float, array_like
The K-S statistic, float between 0 and 1
cdf : bool, optional
whether to compute the CDF(default=true) or the SF.
Returns
-------
cdf : ndarray
CDF (or SF it cdf is False) at the specified locations.
The return value has shape the result of numpy broadcasting n and x.
"""
it = np.nditer([n, x, cdf, None],
op_dtypes=[None, np.float64, np.bool_, np.float64])
for _n, _x, _cdf, z in it:
if np.isnan(_n):
z[...] = _n
continue
if int(_n) != _n:
raise ValueError(f'n is not integral: {_n}')
z[...] = _kolmogn(int(_n), _x, cdf=_cdf)
result = it.operands[-1]
return result
def kolmognp(n, x):
"""Computes the PDF for the two-sided Kolmogorov-Smirnov distribution.
Parameters
----------
n : integer, array_like
the number of samples
x : float, array_like
The K-S statistic, float between 0 and 1
Returns
-------
pdf : ndarray
The PDF at the specified locations
The return value has shape the result of numpy broadcasting n and x.
"""
it = np.nditer([n, x, None])
for _n, _x, z in it:
if np.isnan(_n):
z[...] = _n
continue
if int(_n) != _n:
raise ValueError(f'n is not integral: {_n}')
z[...] = _kolmogn_p(int(_n), _x)
result = it.operands[-1]
return result
def kolmogni(n, q, cdf=True):
"""Computes the PPF(or ISF) for the two-sided Kolmogorov-Smirnov distribution.
Parameters
----------
n : integer, array_like
the number of samples
q : float, array_like
Probabilities, float between 0 and 1
cdf : bool, optional
whether to compute the PPF(default=true) or the ISF.
Returns
-------
ppf : ndarray
PPF (or ISF if cdf is False) at the specified locations
The return value has shape the result of numpy broadcasting n and x.
"""
it = np.nditer([n, q, cdf, None])
for _n, _q, _cdf, z in it:
if np.isnan(_n):
z[...] = _n
continue
if int(_n) != _n:
raise ValueError(f'n is not integral: {_n}')
_pcdf, _psf = (_q, 1-_q) if _cdf else (1-_q, _q)
z[...] = _kolmogni(int(_n), _pcdf, _psf)
result = it.operands[-1]
return result

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,169 @@
import numpy as np
from scipy._lib._util import check_random_state
def rvs_ratio_uniforms(pdf, umax, vmin, vmax, size=1, c=0, random_state=None):
"""
Generate random samples from a probability density function using the
ratio-of-uniforms method.
Parameters
----------
pdf : callable
A function with signature `pdf(x)` that is proportional to the
probability density function of the distribution.
umax : float
The upper bound of the bounding rectangle in the u-direction.
vmin : float
The lower bound of the bounding rectangle in the v-direction.
vmax : float
The upper bound of the bounding rectangle in the v-direction.
size : int or tuple of ints, optional
Defining number of random variates (default is 1).
c : float, optional.
Shift parameter of ratio-of-uniforms method, see Notes. Default is 0.
random_state : {None, int, `~np.random.RandomState`, `~np.random.Generator`}, optional
If `random_state` is `None` the `~np.random.RandomState` singleton is
used.
If `random_state` is an int, a new ``RandomState`` instance is used,
seeded with random_state.
If `random_state` is already a ``RandomState`` or ``Generator``
instance, then that object is used.
Default is None.
Returns
-------
rvs : ndarray
The random variates distributed according to the probability
distribution defined by the pdf.
Notes
-----
Given a univariate probability density function `pdf` and a constant `c`,
define the set ``A = {(u, v) : 0 < u <= sqrt(pdf(v/u + c))}``.
If `(U, V)` is a random vector uniformly distributed over `A`,
then `V/U + c` follows a distribution according to `pdf`.
The above result (see [1]_, [2]_) can be used to sample random variables
using only the pdf, i.e. no inversion of the cdf is required. Typical
choices of `c` are zero or the mode of `pdf`. The set `A` is a subset of
the rectangle ``R = [0, umax] x [vmin, vmax]`` where
- ``umax = sup sqrt(pdf(x))``
- ``vmin = inf (x - c) sqrt(pdf(x))``
- ``vmax = sup (x - c) sqrt(pdf(x))``
In particular, these values are finite if `pdf` is bounded and
``x**2 * pdf(x)`` is bounded (i.e. subquadratic tails).
One can generate `(U, V)` uniformly on `R` and return
`V/U + c` if `(U, V)` are also in `A` which can be directly
verified.
The algorithm is not changed if one replaces `pdf` by k * `pdf` for any
constant k > 0. Thus, it is often convenient to work with a function
that is proportional to the probability density function by dropping
unneccessary normalization factors.
Intuitively, the method works well if `A` fills up most of the
enclosing rectangle such that the probability is high that `(U, V)`
lies in `A` whenever it lies in `R` as the number of required
iterations becomes too large otherwise. To be more precise, note that
the expected number of iterations to draw `(U, V)` uniformly
distributed on `R` such that `(U, V)` is also in `A` is given by
the ratio ``area(R) / area(A) = 2 * umax * (vmax - vmin) / area(pdf)``,
where `area(pdf)` is the integral of `pdf` (which is equal to one if the
probability density function is used but can take on other values if a
function proportional to the density is used). The equality holds since
the area of `A` is equal to 0.5 * area(pdf) (Theorem 7.1 in [1]_).
If the sampling fails to generate a single random variate after 50000
iterations (i.e. not a single draw is in `A`), an exception is raised.
If the bounding rectangle is not correctly specified (i.e. if it does not
contain `A`), the algorithm samples from a distribution different from
the one given by `pdf`. It is therefore recommended to perform a
test such as `~scipy.stats.kstest` as a check.
References
----------
.. [1] L. Devroye, "Non-Uniform Random Variate Generation",
Springer-Verlag, 1986.
.. [2] W. Hoermann and J. Leydold, "Generating generalized inverse Gaussian
random variates", Statistics and Computing, 24(4), p. 547--557, 2014.
.. [3] A.J. Kinderman and J.F. Monahan, "Computer Generation of Random
Variables Using the Ratio of Uniform Deviates",
ACM Transactions on Mathematical Software, 3(3), p. 257--260, 1977.
Examples
--------
>>> from scipy import stats
Simulate normally distributed random variables. It is easy to compute the
bounding rectangle explicitly in that case. For simplicity, we drop the
normalization factor of the density.
>>> f = lambda x: np.exp(-x**2 / 2)
>>> v_bound = np.sqrt(f(np.sqrt(2))) * np.sqrt(2)
>>> umax, vmin, vmax = np.sqrt(f(0)), -v_bound, v_bound
>>> np.random.seed(12345)
>>> rvs = stats.rvs_ratio_uniforms(f, umax, vmin, vmax, size=2500)
The K-S test confirms that the random variates are indeed normally
distributed (normality is not rejected at 5% significance level):
>>> stats.kstest(rvs, 'norm')[1]
0.33783681428365553
The exponential distribution provides another example where the bounding
rectangle can be determined explicitly.
>>> np.random.seed(12345)
>>> rvs = stats.rvs_ratio_uniforms(lambda x: np.exp(-x), umax=1,
... vmin=0, vmax=2*np.exp(-1), size=1000)
>>> stats.kstest(rvs, 'expon')[1]
0.928454552559516
"""
if vmin >= vmax:
raise ValueError("vmin must be smaller than vmax.")
if umax <= 0:
raise ValueError("umax must be positive.")
size1d = tuple(np.atleast_1d(size))
N = np.prod(size1d) # number of rvs needed, reshape upon return
# start sampling using ratio of uniforms method
rng = check_random_state(random_state)
x = np.zeros(N)
simulated, i = 0, 1
# loop until N rvs have been generated: expected runtime is finite.
# to avoid infinite loop, raise exception if not a single rv has been
# generated after 50000 tries. even if the expected numer of iterations
# is 1000, the probability of this event is (1-1/1000)**50000
# which is of order 10e-22
while simulated < N:
k = N - simulated
# simulate uniform rvs on [0, umax] and [vmin, vmax]
u1 = umax * rng.uniform(size=k)
v1 = rng.uniform(vmin, vmax, size=k)
# apply rejection method
rvs = v1 / u1 + c
accept = (u1**2 <= pdf(rvs))
num_accept = np.sum(accept)
if num_accept > 0:
x[simulated:(simulated + num_accept)] = rvs[accept]
simulated += num_accept
if (simulated == 0) and (i*N >= 50000):
msg = ("Not a single random variate could be generated in {} "
"attempts. The ratio of uniforms method does not appear "
"to work for the provided parameters. Please check the "
"pdf and the bounds.".format(i*N))
raise RuntimeError(msg)
i += 1
return np.reshape(x, size1d)

View file

@ -0,0 +1,404 @@
from collections import namedtuple
import numpy as np
from . import distributions
__all__ = ['_find_repeats', 'linregress', 'theilslopes', 'siegelslopes']
LinregressResult = namedtuple('LinregressResult', ('slope', 'intercept',
'rvalue', 'pvalue',
'stderr'))
def linregress(x, y=None):
"""
Calculate a linear least-squares regression for two sets of measurements.
Parameters
----------
x, y : array_like
Two sets of measurements. Both arrays should have the same length. If
only `x` is given (and ``y=None``), then it must be a two-dimensional
array where one dimension has length 2. The two sets of measurements
are then found by splitting the array along the length-2 dimension. In
the case where ``y=None`` and `x` is a 2x2 array, ``linregress(x)`` is
equivalent to ``linregress(x[0], x[1])``.
Returns
-------
slope : float
Slope of the regression line.
intercept : float
Intercept of the regression line.
rvalue : float
Correlation coefficient.
pvalue : float
Two-sided p-value for a hypothesis test whose null hypothesis is
that the slope is zero, using Wald Test with t-distribution of
the test statistic.
stderr : float
Standard error of the estimated gradient.
See also
--------
:func:`scipy.optimize.curve_fit` : Use non-linear
least squares to fit a function to data.
:func:`scipy.optimize.leastsq` : Minimize the sum of
squares of a set of equations.
Notes
-----
Missing values are considered pair-wise: if a value is missing in `x`,
the corresponding value in `y` is masked.
Examples
--------
>>> import matplotlib.pyplot as plt
>>> from scipy import stats
Generate some data:
>>> np.random.seed(12345678)
>>> x = np.random.random(10)
>>> y = 1.6*x + np.random.random(10)
Perform the linear regression:
>>> slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
>>> print("slope: %f intercept: %f" % (slope, intercept))
slope: 1.944864 intercept: 0.268578
To get coefficient of determination (R-squared):
>>> print("R-squared: %f" % r_value**2)
R-squared: 0.735498
Plot the data along with the fitted line:
>>> plt.plot(x, y, 'o', label='original data')
>>> plt.plot(x, intercept + slope*x, 'r', label='fitted line')
>>> plt.legend()
>>> plt.show()
Example for the case where only x is provided as a 2x2 array:
>>> x = np.array([[0, 1], [0, 2]])
>>> r = stats.linregress(x)
>>> r.slope, r.intercept
(2.0, 0.0)
"""
TINY = 1.0e-20
if y is None: # x is a (2, N) or (N, 2) shaped array_like
x = np.asarray(x)
if x.shape[0] == 2:
x, y = x
elif x.shape[1] == 2:
x, y = x.T
else:
msg = ("If only `x` is given as input, it has to be of shape "
"(2, N) or (N, 2), provided shape was %s" % str(x.shape))
raise ValueError(msg)
else:
x = np.asarray(x)
y = np.asarray(y)
if x.size == 0 or y.size == 0:
raise ValueError("Inputs must not be empty.")
n = len(x)
xmean = np.mean(x, None)
ymean = np.mean(y, None)
# average sum of squares:
ssxm, ssxym, ssyxm, ssym = np.cov(x, y, bias=1).flat
r_num = ssxym
r_den = np.sqrt(ssxm * ssym)
if r_den == 0.0:
r = 0.0
else:
r = r_num / r_den
# test for numerical error propagation
if r > 1.0:
r = 1.0
elif r < -1.0:
r = -1.0
df = n - 2
slope = r_num / ssxm
intercept = ymean - slope*xmean
if n == 2:
# handle case when only two points are passed in
if y[0] == y[1]:
prob = 1.0
else:
prob = 0.0
sterrest = 0.0
else:
t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
prob = 2 * distributions.t.sf(np.abs(t), df)
sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)
return LinregressResult(slope, intercept, r, prob, sterrest)
def theilslopes(y, x=None, alpha=0.95):
r"""
Computes the Theil-Sen estimator for a set of points (x, y).
`theilslopes` implements a method for robust linear regression. It
computes the slope as the median of all slopes between paired values.
Parameters
----------
y : array_like
Dependent variable.
x : array_like or None, optional
Independent variable. If None, use ``arange(len(y))`` instead.
alpha : float, optional
Confidence degree between 0 and 1. Default is 95% confidence.
Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
interpreted as "find the 90% confidence interval".
Returns
-------
medslope : float
Theil slope.
medintercept : float
Intercept of the Theil line, as ``median(y) - medslope*median(x)``.
lo_slope : float
Lower bound of the confidence interval on `medslope`.
up_slope : float
Upper bound of the confidence interval on `medslope`.
See also
--------
siegelslopes : a similar technique using repeated medians
Notes
-----
The implementation of `theilslopes` follows [1]_. The intercept is
not defined in [1]_, and here it is defined as ``median(y) -
medslope*median(x)``, which is given in [3]_. Other definitions of
the intercept exist in the literature. A confidence interval for
the intercept is not given as this question is not addressed in
[1]_.
References
----------
.. [1] P.K. Sen, "Estimates of the regression coefficient based on Kendall's tau",
J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968.
.. [2] H. Theil, "A rank-invariant method of linear and polynomial
regression analysis I, II and III", Nederl. Akad. Wetensch., Proc.
53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950.
.. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed.,
John Wiley and Sons, New York, pp. 493.
Examples
--------
>>> from scipy import stats
>>> import matplotlib.pyplot as plt
>>> x = np.linspace(-5, 5, num=150)
>>> y = x + np.random.normal(size=x.size)
>>> y[11:15] += 10 # add outliers
>>> y[-5:] -= 7
Compute the slope, intercept and 90% confidence interval. For comparison,
also compute the least-squares fit with `linregress`:
>>> res = stats.theilslopes(y, x, 0.90)
>>> lsq_res = stats.linregress(x, y)
Plot the results. The Theil-Sen regression line is shown in red, with the
dashed red lines illustrating the confidence interval of the slope (note
that the dashed red lines are not the confidence interval of the regression
as the confidence interval of the intercept is not included). The green
line shows the least-squares fit for comparison.
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.plot(x, y, 'b.')
>>> ax.plot(x, res[1] + res[0] * x, 'r-')
>>> ax.plot(x, res[1] + res[2] * x, 'r--')
>>> ax.plot(x, res[1] + res[3] * x, 'r--')
>>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
>>> plt.show()
"""
# We copy both x and y so we can use _find_repeats.
y = np.array(y).flatten()
if x is None:
x = np.arange(len(y), dtype=float)
else:
x = np.array(x, dtype=float).flatten()
if len(x) != len(y):
raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))
# Compute sorted slopes only when deltax > 0
deltax = x[:, np.newaxis] - x
deltay = y[:, np.newaxis] - y
slopes = deltay[deltax > 0] / deltax[deltax > 0]
slopes.sort()
medslope = np.median(slopes)
medinter = np.median(y) - medslope * np.median(x)
# Now compute confidence intervals
if alpha > 0.5:
alpha = 1. - alpha
z = distributions.norm.ppf(alpha / 2.)
# This implements (2.6) from Sen (1968)
_, nxreps = _find_repeats(x)
_, nyreps = _find_repeats(y)
nt = len(slopes) # N in Sen (1968)
ny = len(y) # n in Sen (1968)
# Equation 2.6 in Sen (1968):
sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) -
sum(k * (k-1) * (2*k + 5) for k in nxreps) -
sum(k * (k-1) * (2*k + 5) for k in nyreps))
# Find the confidence interval indices in `slopes`
sigma = np.sqrt(sigsq)
Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
delta = slopes[[Rl, Ru]]
return medslope, medinter, delta[0], delta[1]
def _find_repeats(arr):
# This function assumes it may clobber its input.
if len(arr) == 0:
return np.array(0, np.float64), np.array(0, np.intp)
# XXX This cast was previously needed for the Fortran implementation,
# should we ditch it?
arr = np.asarray(arr, np.float64).ravel()
arr.sort()
# Taken from NumPy 1.9's np.unique.
change = np.concatenate(([True], arr[1:] != arr[:-1]))
unique = arr[change]
change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
freq = np.diff(change_idx)
atleast2 = freq > 1
return unique[atleast2], freq[atleast2]
def siegelslopes(y, x=None, method="hierarchical"):
r"""
Computes the Siegel estimator for a set of points (x, y).
`siegelslopes` implements a method for robust linear regression
using repeated medians (see [1]_) to fit a line to the points (x, y).
The method is robust to outliers with an asymptotic breakdown point
of 50%.
Parameters
----------
y : array_like
Dependent variable.
x : array_like or None, optional
Independent variable. If None, use ``arange(len(y))`` instead.
method : {'hierarchical', 'separate'}
If 'hierarchical', estimate the intercept using the estimated
slope ``medslope`` (default option).
If 'separate', estimate the intercept independent of the estimated
slope. See Notes for details.
Returns
-------
medslope : float
Estimate of the slope of the regression line.
medintercept : float
Estimate of the intercept of the regression line.
See also
--------
theilslopes : a similar technique without repeated medians
Notes
-----
With ``n = len(y)``, compute ``m_j`` as the median of
the slopes from the point ``(x[j], y[j])`` to all other `n-1` points.
``medslope`` is then the median of all slopes ``m_j``.
Two ways are given to estimate the intercept in [1]_ which can be chosen
via the parameter ``method``.
The hierarchical approach uses the estimated slope ``medslope``
and computes ``medintercept`` as the median of ``y - medslope*x``.
The other approach estimates the intercept separately as follows: for
each point ``(x[j], y[j])``, compute the intercepts of all the `n-1`
lines through the remaining points and take the median ``i_j``.
``medintercept`` is the median of the ``i_j``.
The implementation computes `n` times the median of a vector of size `n`
which can be slow for large vectors. There are more efficient algorithms
(see [2]_) which are not implemented here.
References
----------
.. [1] A. Siegel, "Robust Regression Using Repeated Medians",
Biometrika, Vol. 69, pp. 242-244, 1982.
.. [2] A. Stein and M. Werman, "Finding the repeated median regression
line", Proceedings of the Third Annual ACM-SIAM Symposium on
Discrete Algorithms, pp. 409-413, 1992.
Examples
--------
>>> from scipy import stats
>>> import matplotlib.pyplot as plt
>>> x = np.linspace(-5, 5, num=150)
>>> y = x + np.random.normal(size=x.size)
>>> y[11:15] += 10 # add outliers
>>> y[-5:] -= 7
Compute the slope and intercept. For comparison, also compute the
least-squares fit with `linregress`:
>>> res = stats.siegelslopes(y, x)
>>> lsq_res = stats.linregress(x, y)
Plot the results. The Siegel regression line is shown in red. The green
line shows the least-squares fit for comparison.
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.plot(x, y, 'b.')
>>> ax.plot(x, res[1] + res[0] * x, 'r-')
>>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
>>> plt.show()
"""
if method not in ['hierarchical', 'separate']:
raise ValueError("method can only be 'hierarchical' or 'separate'")
y = np.asarray(y).ravel()
if x is None:
x = np.arange(len(y), dtype=float)
else:
x = np.asarray(x, dtype=float).ravel()
if len(x) != len(y):
raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))
deltax = x[:, np.newaxis] - x
deltay = y[:, np.newaxis] - y
slopes, intercepts = [], []
for j in range(len(x)):
id_nonzero = deltax[j, :] != 0
slopes_j = deltay[j, id_nonzero] / deltax[j, id_nonzero]
medslope_j = np.median(slopes_j)
slopes.append(medslope_j)
if method == 'separate':
z = y*x[j] - y[j]*x
medintercept_j = np.median(z[id_nonzero] / deltax[j, id_nonzero])
intercepts.append(medintercept_j)
medslope = np.median(np.asarray(slopes))
if method == "separate":
medinter = np.median(np.asarray(intercepts))
else:
medinter = np.median(y - medslope*x)
return medslope, medinter

View file

@ -0,0 +1,199 @@
import numpy as np
from numpy import poly1d
from scipy.special import beta
# The following code was used to generate the Pade coefficients for the
# Tukey Lambda variance function. Version 0.17 of mpmath was used.
#---------------------------------------------------------------------------
# import mpmath as mp
#
# mp.mp.dps = 60
#
# one = mp.mpf(1)
# two = mp.mpf(2)
#
# def mpvar(lam):
# if lam == 0:
# v = mp.pi**2 / three
# else:
# v = (two / lam**2) * (one / (one + two*lam) -
# mp.beta(lam + one, lam + one))
# return v
#
# t = mp.taylor(mpvar, 0, 8)
# p, q = mp.pade(t, 4, 4)
# print("p =", [mp.fp.mpf(c) for c in p])
# print("q =", [mp.fp.mpf(c) for c in q])
#---------------------------------------------------------------------------
# Pade coefficients for the Tukey Lambda variance function.
_tukeylambda_var_pc = [3.289868133696453, 0.7306125098871127,
-0.5370742306855439, 0.17292046290190008,
-0.02371146284628187]
_tukeylambda_var_qc = [1.0, 3.683605511659861, 4.184152498888124,
1.7660926747377275, 0.2643989311168465]
# numpy.poly1d instances for the numerator and denominator of the
# Pade approximation to the Tukey Lambda variance.
_tukeylambda_var_p = poly1d(_tukeylambda_var_pc[::-1])
_tukeylambda_var_q = poly1d(_tukeylambda_var_qc[::-1])
def tukeylambda_variance(lam):
"""Variance of the Tukey Lambda distribution.
Parameters
----------
lam : array_like
The lambda values at which to compute the variance.
Returns
-------
v : ndarray
The variance. For lam < -0.5, the variance is not defined, so
np.nan is returned. For lam = 0.5, np.inf is returned.
Notes
-----
In an interval around lambda=0, this function uses the [4,4] Pade
approximation to compute the variance. Otherwise it uses the standard
formula (https://en.wikipedia.org/wiki/Tukey_lambda_distribution). The
Pade approximation is used because the standard formula has a removable
discontinuity at lambda = 0, and does not produce accurate numerical
results near lambda = 0.
"""
lam = np.asarray(lam)
shp = lam.shape
lam = np.atleast_1d(lam).astype(np.float64)
# For absolute values of lam less than threshold, use the Pade
# approximation.
threshold = 0.075
# Play games with masks to implement the conditional evaluation of
# the distribution.
# lambda < -0.5: var = nan
low_mask = lam < -0.5
# lambda == -0.5: var = inf
neghalf_mask = lam == -0.5
# abs(lambda) < threshold: use Pade approximation
small_mask = np.abs(lam) < threshold
# else the "regular" case: use the explicit formula.
reg_mask = ~(low_mask | neghalf_mask | small_mask)
# Get the 'lam' values for the cases where they are needed.
small = lam[small_mask]
reg = lam[reg_mask]
# Compute the function for each case.
v = np.empty_like(lam)
v[low_mask] = np.nan
v[neghalf_mask] = np.inf
if small.size > 0:
# Use the Pade approximation near lambda = 0.
v[small_mask] = _tukeylambda_var_p(small) / _tukeylambda_var_q(small)
if reg.size > 0:
v[reg_mask] = (2.0 / reg**2) * (1.0 / (1.0 + 2 * reg) -
beta(reg + 1, reg + 1))
v.shape = shp
return v
# The following code was used to generate the Pade coefficients for the
# Tukey Lambda kurtosis function. Version 0.17 of mpmath was used.
#---------------------------------------------------------------------------
# import mpmath as mp
#
# mp.mp.dps = 60
#
# one = mp.mpf(1)
# two = mp.mpf(2)
# three = mp.mpf(3)
# four = mp.mpf(4)
#
# def mpkurt(lam):
# if lam == 0:
# k = mp.mpf(6)/5
# else:
# numer = (one/(four*lam+one) - four*mp.beta(three*lam+one, lam+one) +
# three*mp.beta(two*lam+one, two*lam+one))
# denom = two*(one/(two*lam+one) - mp.beta(lam+one,lam+one))**2
# k = numer / denom - three
# return k
#
# # There is a bug in mpmath 0.17: when we use the 'method' keyword of the
# # taylor function and we request a degree 9 Taylor polynomial, we actually
# # get degree 8.
# t = mp.taylor(mpkurt, 0, 9, method='quad', radius=0.01)
# t = [mp.chop(c, tol=1e-15) for c in t]
# p, q = mp.pade(t, 4, 4)
# print("p =", [mp.fp.mpf(c) for c in p])
# print("q =", [mp.fp.mpf(c) for c in q])
#---------------------------------------------------------------------------
# Pade coefficients for the Tukey Lambda kurtosis function.
_tukeylambda_kurt_pc = [1.2, -5.853465139719495, -22.653447381131077,
0.20601184383406815, 4.59796302262789]
_tukeylambda_kurt_qc = [1.0, 7.171149192233599, 12.96663094361842,
0.43075235247853005, -2.789746758009912]
# numpy.poly1d instances for the numerator and denominator of the
# Pade approximation to the Tukey Lambda kurtosis.
_tukeylambda_kurt_p = poly1d(_tukeylambda_kurt_pc[::-1])
_tukeylambda_kurt_q = poly1d(_tukeylambda_kurt_qc[::-1])
def tukeylambda_kurtosis(lam):
"""Kurtosis of the Tukey Lambda distribution.
Parameters
----------
lam : array_like
The lambda values at which to compute the variance.
Returns
-------
v : ndarray
The variance. For lam < -0.25, the variance is not defined, so
np.nan is returned. For lam = 0.25, np.inf is returned.
"""
lam = np.asarray(lam)
shp = lam.shape
lam = np.atleast_1d(lam).astype(np.float64)
# For absolute values of lam less than threshold, use the Pade
# approximation.
threshold = 0.055
# Use masks to implement the conditional evaluation of the kurtosis.
# lambda < -0.25: kurtosis = nan
low_mask = lam < -0.25
# lambda == -0.25: kurtosis = inf
negqrtr_mask = lam == -0.25
# lambda near 0: use Pade approximation
small_mask = np.abs(lam) < threshold
# else the "regular" case: use the explicit formula.
reg_mask = ~(low_mask | negqrtr_mask | small_mask)
# Get the 'lam' values for the cases where they are needed.
small = lam[small_mask]
reg = lam[reg_mask]
# Compute the function for each case.
k = np.empty_like(lam)
k[low_mask] = np.nan
k[negqrtr_mask] = np.inf
if small.size > 0:
k[small_mask] = _tukeylambda_kurt_p(small) / _tukeylambda_kurt_q(small)
if reg.size > 0:
numer = (1.0 / (4 * reg + 1) - 4 * beta(3 * reg + 1, reg + 1) +
3 * beta(2 * reg + 1, 2 * reg + 1))
denom = 2 * (1.0/(2 * reg + 1) - beta(reg + 1, reg + 1))**2
k[reg_mask] = numer / denom - 3
# The return value will be a numpy array; resetting the shape ensures that
# if `lam` was a scalar, the return value is a 0-d array.
k.shape = shp
return k

View file

@ -0,0 +1,299 @@
import numpy as np
import itertools
# This file contains a dictionary that maps an integer n to the
# distribution of the Wilcoxon signed rank test statistic.
# The dictionary can be generated by the functions
# _generate_wilcoxon_exact_table and _generate_wilcoxon_exact_table_fast.
# The second function is about 20% faster.
def _generate_wilcoxon_exact_table(N):
"""
Generate counts of the Wilcoxon ranksum statistic r_plus (sum of
ranks of positive differences). For fixed n, simulate all possible states
{0, 1}**n and compute the sum of the ranks over the indices that are equal
to one (positive differences).
Return a dictionary that maps n=3,...N to the corresponding list of counts
"""
res_dict = {}
for n in range(1, N+1):
res = []
ranks = np.arange(n) + 1
M = n*(n + 1)/2
for x in itertools.product((0, 1), repeat=n):
# note that by symmetry, given a state x, we can directly compute
# the positive ranksum of the inverted state (i.e. ~x or 1 - x),
# therefore, it is enough to consider sequences starting with a one
if x[0] == 1:
rank_sum = np.sum(x * ranks)
res.append(rank_sum)
res.append(M - rank_sum)
_, cnt = np.unique(res, return_counts=True)
res_dict[n] = list(cnt)
return res_dict
def _generate_wilcoxon_exact_table_fast(N):
"""
Same functionality as _generate_wilcoxon_exact_table, but about 20% faster,
but harder to follow.
"""
res_dict = {}
for n in range(1, N+1):
ranks = np.arange(n) + 1
M = int(n*(n + 1)/2)
res = np.zeros(M + 1, dtype=int)
for x in itertools.product((0, 1), repeat=n):
if x[0] == 1:
rank_sum = int(np.sum(x * ranks))
res[rank_sum] += 1
# flip array to get counts of symmetric sequences starting with 0
res_dict[n] = list(res + np.flip(res))
return res_dict
COUNTS = {
1: [1, 1],
2: [1, 1, 1, 1],
3: [1, 1, 1, 2, 1, 1, 1],
4: [1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1],
5: [1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 2, 2, 1, 1, 1],
6: [1, 1, 1, 2, 2, 3, 4, 4, 4, 5, 5, 5, 5, 4, 4, 4, 3, 2, 2, 1, 1, 1],
7: [1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 8, 8, 8, 8, 7, 7, 6, 5, 5, 4,
3, 2, 2, 1, 1, 1],
8: [1, 1, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13, 14, 13,
13, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 2, 1, 1, 1],
9: [1, 1, 1, 2, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, 15, 17, 18, 19, 21, 21,
22, 23, 23, 23, 23, 22, 21, 21, 19, 18, 17, 15, 13, 12, 10, 9, 8, 6,
5, 4, 3, 2, 2, 1, 1, 1],
10: [1, 1, 1, 2, 2, 3, 4, 5, 6, 8, 10, 11, 13, 15, 17, 20, 22, 24, 27, 29,
31, 33, 35, 36, 38, 39, 39, 40, 40, 39, 39, 38, 36, 35, 33, 31, 29,
27, 24, 22, 20, 17, 15, 13, 11, 10, 8, 6, 5, 4, 3, 2, 2, 1, 1, 1],
11: [1, 1, 1, 2, 2, 3, 4, 5, 6, 8, 10, 12, 14, 16, 19, 22, 25, 28, 32, 35,
39, 43, 46, 49, 53, 56, 59, 62, 64, 66, 68, 69, 69, 70, 69, 69, 68,
66, 64, 62, 59, 56, 53, 49, 46, 43, 39, 35, 32, 28, 25, 22, 19, 16,
14, 12, 10, 8, 6, 5, 4, 3, 2, 2, 1, 1, 1],
12: [1, 1, 1, 2, 2, 3, 4, 5, 6, 8, 10, 12, 15, 17, 20, 24, 27, 31, 36, 40,
45, 51, 56, 61, 67, 72, 78, 84, 89, 94, 100, 104, 108, 113, 115, 118,
121, 122, 123, 124, 123, 122, 121, 118, 115, 113, 108, 104, 100, 94,
89, 84, 78, 72, 67, 61, 56, 51, 45, 40, 36, 31, 27, 24, 20, 17, 15,
12, 10, 8, 6, 5, 4, 3, 2, 2, 1, 1, 1],
13: [1, 1, 1, 2, 2, 3, 4, 5, 6, 8, 10, 12, 15, 18, 21, 25, 29, 33, 39, 44,
50, 57, 64, 71, 79, 87, 95, 104, 113, 121, 131, 140, 148, 158, 166,
174, 182, 189, 195, 202, 207, 211, 215, 218, 219, 221, 221, 219, 218,
215, 211, 207, 202, 195, 189, 182, 174, 166, 158, 148, 140, 131, 121,
113, 104, 95, 87, 79, 71, 64, 57, 50, 44, 39, 33, 29, 25, 21, 18, 15,
12, 10, 8, 6, 5, 4, 3, 2, 2, 1, 1, 1],
14: [1, 1, 1, 2, 2, 3, 4, 5, 6, 8, 10, 12, 15, 18, 22, 26, 30, 35, 41, 47,
54, 62, 70, 79, 89, 99, 110, 122, 134, 146, 160, 173, 187, 202, 216,
231, 246, 260, 274, 289, 302, 315, 328, 339, 350, 361, 369, 377, 384,
389, 393, 396, 397, 397, 396, 393, 389, 384, 377, 369, 361, 350, 339,
328, 315, 302, 289, 274, 260, 246, 231, 216, 202, 187, 173, 160, 146,
134, 122, 110, 99, 89, 79, 70, 62, 54, 47, 41, 35, 30, 26, 22, 18,
15, 12, 10, 8, 6, 5, 4, 3, 2, 2, 1, 1, 1],
15: [1, 1, 1, 2, 2, 3, 4, 5, 6, 8, 10, 12, 15, 18, 22, 27, 31, 36, 43, 49,
57, 66, 75, 85, 97, 109, 122, 137, 152, 168, 186, 203, 222, 243, 263,
285, 308, 330, 353, 378, 401, 425, 450, 473, 496, 521, 542, 564, 586,
605, 624, 642, 657, 671, 685, 695, 704, 712, 716, 719, 722, 719, 716,
712, 704, 695, 685, 671, 657, 642, 624, 605, 586, 564, 542, 521, 496,
473, 450, 425, 401, 378, 353, 330, 308, 285, 263, 243, 222, 203, 186,
168, 152, 137, 122, 109, 97, 85, 75, 66, 57, 49, 43, 36, 31, 27, 22,
18, 15, 12, 10, 8, 6, 5, 4, 3, 2, 2, 1, 1, 1],
16: [1, 1, 1, 2, 2, 3, 4, 5, 6, 8, 10, 12, 15, 18, 22, 27, 32, 37, 44, 51,
59, 69, 79, 90, 103, 117, 132, 149, 167, 186, 208, 230, 253, 279,
306, 334, 365, 396, 428, 463, 498, 534, 572, 610, 648, 689, 728, 767,
808, 848, 887, 927, 965, 1001, 1038, 1073, 1105, 1137, 1166, 1192,
1218, 1240, 1258, 1276, 1290, 1300, 1309, 1313, 1314, 1313, 1309,
1300, 1290, 1276, 1258, 1240, 1218, 1192, 1166, 1137, 1105, 1073,
1038, 1001, 965, 927, 887, 848, 808, 767, 728, 689, 648, 610, 572,
534, 498, 463, 428, 396, 365, 334, 306, 279, 253, 230, 208, 186, 167,
149, 132, 117, 103, 90, 79, 69, 59, 51, 44, 37, 32, 27, 22, 18, 15,
12, 10, 8, 6, 5, 4, 3, 2, 2, 1, 1, 1],
17: [1, 1, 1, 2, 2, 3, 4, 5, 6, 8, 10, 12, 15, 18, 22, 27, 32, 38, 45, 52,
61, 71, 82, 94, 108, 123, 140, 159, 179, 201, 226, 252, 280, 311,
343, 378, 416, 455, 497, 542, 588, 637, 689, 742, 797, 856, 914, 975,
1038, 1101, 1166, 1233, 1299, 1366, 1434, 1501, 1568, 1635, 1700,
1764, 1828, 1888, 1947, 2004, 2057, 2108, 2157, 2200, 2241, 2278,
2310, 2338, 2363, 2381, 2395, 2406, 2410, 2410, 2406, 2395, 2381,
2363, 2338, 2310, 2278, 2241, 2200, 2157, 2108, 2057, 2004, 1947,
1888, 1828, 1764, 1700, 1635, 1568, 1501, 1434, 1366, 1299, 1233,
1166, 1101, 1038, 975, 914, 856, 797, 742, 689, 637, 588, 542, 497,
455, 416, 378, 343, 311, 280, 252, 226, 201, 179, 159, 140, 123, 108,
94, 82, 71, 61, 52, 45, 38, 32, 27, 22, 18, 15, 12, 10, 8, 6, 5, 4,
3, 2, 2, 1, 1, 1],
18: [1, 1, 1, 2, 2, 3, 4, 5, 6, 8, 10, 12, 15, 18, 22, 27, 32, 38, 46, 53,
62, 73, 84, 97, 112, 128, 146, 167, 189, 213, 241, 270, 302, 338,
375, 416, 461, 507, 558, 613, 670, 731, 797, 865, 937, 1015, 1093,
1176, 1264, 1353, 1446, 1544, 1642, 1744, 1850, 1956, 2065, 2177,
2288, 2401, 2517, 2630, 2744, 2860, 2971, 3083, 3195, 3301, 3407,
3511, 3609, 3704, 3797, 3882, 3963, 4041, 4110, 4174, 4234, 4283,
4328, 4367, 4395, 4418, 4435, 4441, 4441, 4435, 4418, 4395, 4367,
4328, 4283, 4234, 4174, 4110, 4041, 3963, 3882, 3797, 3704, 3609,
3511, 3407, 3301, 3195, 3083, 2971, 2860, 2744, 2630, 2517, 2401,
2288, 2177, 2065, 1956, 1850, 1744, 1642, 1544, 1446, 1353, 1264,
1176, 1093, 1015, 937, 865, 797, 731, 670, 613, 558, 507, 461, 416,
375, 338, 302, 270, 241, 213, 189, 167, 146, 128, 112, 97, 84, 73,
62, 53, 46, 38, 32, 27, 22, 18, 15, 12, 10, 8, 6, 5, 4, 3, 2, 2, 1,
1, 1],
19: [1, 1, 1, 2, 2, 3, 4, 5, 6, 8, 10, 12, 15, 18, 22, 27, 32, 38, 46, 54,
63, 74, 86, 99, 115, 132, 151, 173, 197, 223, 253, 285, 320, 360,
402, 448, 499, 553, 611, 675, 743, 815, 894, 977, 1065, 1161, 1260,
1365, 1477, 1594, 1716, 1846, 1980, 2119, 2266, 2417, 2572, 2735,
2901, 3071, 3248, 3427, 3609, 3797, 3986, 4176, 4371, 4565, 4760,
4957, 5153, 5346, 5541, 5732, 5919, 6106, 6287, 6462, 6635, 6800,
6958, 7111, 7255, 7389, 7518, 7636, 7742, 7842, 7929, 8004, 8071,
8125, 8165, 8197, 8215, 8220, 8215, 8197, 8165, 8125, 8071, 8004,
7929, 7842, 7742, 7636, 7518, 7389, 7255, 7111, 6958, 6800, 6635,
6462, 6287, 6106, 5919, 5732, 5541, 5346, 5153, 4957, 4760, 4565,
4371, 4176, 3986, 3797, 3609, 3427, 3248, 3071, 2901, 2735, 2572,
2417, 2266, 2119, 1980, 1846, 1716, 1594, 1477, 1365, 1260, 1161,
1065, 977, 894, 815, 743, 675, 611, 553, 499, 448, 402, 360, 320, 285,
253, 223, 197, 173, 151, 132, 115, 99, 86, 74, 63, 54, 46, 38, 32, 27,
22, 18, 15, 12, 10, 8, 6, 5, 4, 3, 2, 2, 1, 1, 1],
20: [1, 1, 1, 2, 2, 3, 4, 5, 6, 8, 10, 12, 15, 18, 22, 27, 32, 38, 46, 54,
64, 75, 87, 101, 117, 135, 155, 178, 203, 231, 263, 297, 335, 378,
424, 475, 531, 591, 657, 729, 806, 889, 980, 1076, 1180, 1293, 1411,
1538, 1674, 1817, 1969, 2131, 2300, 2479, 2668, 2865, 3071, 3288,
3512, 3746, 3991, 4242, 4503, 4774, 5051, 5337, 5631, 5930, 6237,
6551, 6869, 7192, 7521, 7851, 8185, 8523, 8859, 9197, 9536, 9871,
10206, 10538, 10864, 11186, 11504, 11812, 12113, 12407, 12689, 12961,
13224, 13471, 13706, 13929, 14134, 14326, 14502, 14659, 14800, 14925,
15029, 15115, 15184, 15231, 15260, 15272, 15260, 15231, 15184, 15115,
15029, 14925, 14800, 14659, 14502, 14326, 14134, 13929, 13706, 13471,
13224, 12961, 12689, 12407, 12113, 11812, 11504, 11186, 10864, 10538,
10206, 9871, 9536, 9197, 8859, 8523, 8185, 7851, 7521, 7192, 6869,
6551, 6237, 5930, 5631, 5337, 5051, 4774, 4503, 4242, 3991, 3746,
3512, 3288, 3071, 2865, 2668, 2479, 2300, 2131, 1969, 1817, 1674,
1538, 1411, 1293, 1180, 1076, 980, 889, 806, 729, 657, 591, 531, 475,
424, 378, 335, 297, 263, 231, 203, 178, 155, 135, 117, 101, 87, 75,
64, 54, 46, 38, 32, 27, 22, 18, 15, 12, 10, 8, 6, 5, 4, 3, 2, 2, 1, 1,
1],
21: [1, 1, 1, 2, 2, 3, 4, 5, 6, 8, 10, 12, 15, 18, 22, 27, 32, 38, 46, 54,
64, 76, 88, 102, 119, 137, 158, 182, 208, 237, 271, 307, 347, 393,
442, 497, 558, 623, 695, 775, 860, 953, 1055, 1163, 1281, 1410, 1546,
1693, 1852, 2020, 2200, 2394, 2597, 2814, 3046, 3289, 3546, 3819,
4103, 4403, 4720, 5048, 5392, 5754, 6127, 6517, 6924, 7341, 7775,
8225, 8686, 9161, 9652, 10151, 10664, 11191, 11724, 12268, 12824,
13383, 13952, 14529, 15106, 15689, 16278, 16863, 17450, 18038, 18619,
19198, 19775, 20340, 20898, 21450, 21985, 22511, 23025, 23518, 23997,
24461, 24900, 25321, 25722, 26095, 26446, 26776, 27072, 27344, 27591,
27804, 27990, 28149, 28271, 28365, 28431, 28460, 28460, 28431, 28365,
28271, 28149, 27990, 27804, 27591, 27344, 27072, 26776, 26446, 26095,
25722, 25321, 24900, 24461, 23997, 23518, 23025, 22511, 21985, 21450,
20898, 20340, 19775, 19198, 18619, 18038, 17450, 16863, 16278, 15689,
15106, 14529, 13952, 13383, 12824, 12268, 11724, 11191, 10664, 10151,
9652, 9161, 8686, 8225, 7775, 7341, 6924, 6517, 6127, 5754, 5392,
5048, 4720, 4403, 4103, 3819, 3546, 3289, 3046, 2814, 2597, 2394,
2200, 2020, 1852, 1693, 1546, 1410, 1281, 1163, 1055, 953, 860, 775,
695, 623, 558, 497, 442, 393, 347, 307, 271, 237, 208, 182, 158, 137,
119, 102, 88, 76, 64, 54, 46, 38, 32, 27, 22, 18, 15, 12, 10, 8, 6,
5, 4, 3, 2, 2, 1, 1, 1],
22: [1, 1, 1, 2, 2, 3, 4, 5, 6, 8, 10, 12, 15, 18, 22, 27, 32, 38, 46, 54,
64, 76, 89, 103, 120, 139, 160, 185, 212, 242, 277, 315, 357, 405,
457, 515, 580, 650, 727, 813, 906, 1007, 1119, 1239, 1369, 1512, 1665,
1830, 2010, 2202, 2408, 2631, 2868, 3121, 3393, 3682, 3988, 4316,
4661, 5026, 5415, 5823, 6252, 6707, 7182, 7680, 8205, 8751, 9321,
9918, 10538, 11181, 11852, 12545, 13261, 14005, 14770, 15557, 16370,
17202, 18055, 18932, 19826, 20737, 21670, 22617, 23577, 24555, 25543,
26539, 27550, 28565, 29584, 30611, 31637, 32662, 33689, 34709, 35721,
36729, 37724, 38704, 39674, 40624, 41552, 42465, 43350, 44207, 45041,
45842, 46609, 47347, 48046, 48705, 49329, 49910, 50445, 50942, 51390,
51789, 52146, 52451, 52704, 52912, 53066, 53167, 53222, 53222, 53167,
53066, 52912, 52704, 52451, 52146, 51789, 51390, 50942, 50445, 49910,
49329, 48705, 48046, 47347, 46609, 45842, 45041, 44207, 43350, 42465,
41552, 40624, 39674, 38704, 37724, 36729, 35721, 34709, 33689, 32662,
31637, 30611, 29584, 28565, 27550, 26539, 25543, 24555, 23577, 22617,
21670, 20737, 19826, 18932, 18055, 17202, 16370, 15557, 14770, 14005,
13261, 12545, 11852, 11181, 10538, 9918, 9321, 8751, 8205, 7680, 7182,
6707, 6252, 5823, 5415, 5026, 4661, 4316, 3988, 3682, 3393, 3121,
2868, 2631, 2408, 2202, 2010, 1830, 1665, 1512, 1369, 1239, 1119,
1007, 906, 813, 727, 650, 580, 515, 457, 405, 357, 315, 277, 242, 212,
185, 160, 139, 120, 103, 89, 76, 64, 54, 46, 38, 32, 27, 22, 18, 15,
12, 10, 8, 6, 5, 4, 3, 2, 2, 1, 1, 1],
23: [1, 1, 1, 2, 2, 3, 4, 5, 6, 8, 10, 12, 15, 18, 22, 27, 32, 38, 46, 54,
64, 76, 89, 104, 121, 140, 162, 187, 215, 246, 282, 321, 365, 415,
469, 530, 598, 672, 754, 845, 944, 1053, 1173, 1303, 1445, 1601,
1768, 1950, 2149, 2362, 2593, 2843, 3110, 3398, 3708, 4039, 4393,
4773, 5176, 5606, 6065, 6550, 7065, 7613, 8189, 8799, 9444, 10120,
10833, 11583, 12368, 13191, 14054, 14953, 15892, 16873, 17891, 18950,
20052, 21190, 22371, 23593, 24852, 26152, 27493, 28869, 30284, 31737,
33223, 34744, 36301, 37886, 39502, 41149, 42818, 44514, 46234, 47970,
49726, 51499, 53281, 55074, 56876, 58679, 60484, 62291, 64087, 65877,
67658, 69419, 71164, 72890, 74585, 76255, 77894, 79494, 81056, 82579,
84052, 85478, 86855, 88172, 89433, 90636, 91770, 92841, 93846, 94774,
95632, 96416, 97119, 97745, 98293, 98755, 99136, 99436, 99647, 99774,
99820, 99774, 99647, 99436, 99136, 98755, 98293, 97745, 97119, 96416,
95632, 94774, 93846, 92841, 91770, 90636, 89433, 88172, 86855, 85478,
84052, 82579, 81056, 79494, 77894, 76255, 74585, 72890, 71164, 69419,
67658, 65877, 64087, 62291, 60484, 58679, 56876, 55074, 53281, 51499,
49726, 47970, 46234, 44514, 42818, 41149, 39502, 37886, 36301, 34744,
33223, 31737, 30284, 28869, 27493, 26152, 24852, 23593, 22371, 21190,
20052, 18950, 17891, 16873, 15892, 14953, 14054, 13191, 12368, 11583,
10833, 10120, 9444, 8799, 8189, 7613, 7065, 6550, 6065, 5606, 5176,
4773, 4393, 4039, 3708, 3398, 3110, 2843, 2593, 2362, 2149, 1950,
1768, 1601, 1445, 1303, 1173, 1053, 944, 845, 754, 672, 598, 530, 469,
415, 365, 321, 282, 246, 215, 187, 162, 140, 121, 104, 89, 76, 64, 54,
46, 38, 32, 27, 22, 18, 15, 12, 10, 8, 6, 5, 4, 3, 2, 2, 1, 1, 1],
24: [1, 1, 1, 2, 2, 3, 4, 5, 6, 8, 10, 12, 15, 18, 22, 27, 32, 38, 46, 54,
64, 76, 89, 104, 122, 141, 163, 189, 217, 249, 286, 326, 371, 423,
479, 542, 613, 690, 776, 872, 976, 1091, 1219, 1357, 1509, 1677, 1857,
2054, 2270, 2502, 2755, 3030, 3325, 3644, 3990, 4360, 4758, 5188,
5645, 6136, 6663, 7222, 7819, 8458, 9133, 9852, 10617, 11423, 12278,
13184, 14136, 15141, 16203, 17315, 18485, 19716, 21001, 22348, 23760,
25229, 26764, 28366, 30028, 31758, 33558, 35419, 37349, 39350, 41412,
43543, 45745, 48006, 50335, 52732, 55186, 57705, 60288, 62923, 65618,
68372, 71172, 74024, 76928, 79869, 82855, 85884, 88939, 92029, 95151,
98288, 101448, 104627, 107808, 110999, 114195, 117380, 120558, 123728,
126870, 129992, 133089, 136142, 139159, 142135, 145051, 147915,
150722, 153453, 156116, 158707, 161206, 163622, 165951, 168174,
170300, 172326, 174232, 176029, 177714, 179268, 180703, 182015,
183188, 184233, 185148, 185917, 186552, 187052, 187402, 187615,
187692, 187615, 187402, 187052, 186552, 185917, 185148, 184233,
183188, 182015, 180703, 179268, 177714, 176029, 174232, 172326,
170300, 168174, 165951, 163622, 161206, 158707, 156116, 153453,
150722, 147915, 145051, 142135, 139159, 136142, 133089, 129992,
126870, 123728, 120558, 117380, 114195, 110999, 107808, 104627,
101448, 98288, 95151, 92029, 88939, 85884, 82855, 79869, 76928,
74024, 71172, 68372, 65618, 62923, 60288, 57705, 55186, 52732, 50335,
48006, 45745, 43543, 41412, 39350, 37349, 35419, 33558, 31758, 30028,
28366, 26764, 25229, 23760, 22348, 21001, 19716, 18485, 17315, 16203,
15141, 14136, 13184, 12278, 11423, 10617, 9852, 9133, 8458, 7819,
7222, 6663, 6136, 5645, 5188, 4758, 4360, 3990, 3644, 3325, 3030,
2755, 2502, 2270, 2054, 1857, 1677, 1509, 1357, 1219, 1091, 976, 872,
776, 690, 613, 542, 479, 423, 371, 326, 286, 249, 217, 189, 163, 141,
122, 104, 89, 76, 64, 54, 46, 38, 32, 27, 22, 18, 15, 12, 10, 8, 6,
5, 4, 3, 2, 2, 1, 1, 1],
25: [1, 1, 1, 2, 2, 3, 4, 5, 6, 8, 10, 12, 15, 18, 22, 27, 32, 38, 46, 54,
64, 76, 89, 104, 122, 142, 164, 190, 219, 251, 289, 330, 376, 429,
487, 552, 625, 705, 794, 894, 1003, 1123, 1257, 1403, 1563, 1741,
1933, 2143, 2374, 2624, 2896, 3193, 3514, 3861, 4239, 4646, 5084,
5559, 6068, 6615, 7205, 7835, 8509, 9234, 10005, 10828, 11708, 12642,
13635, 14693, 15813, 16998, 18257, 19585, 20987, 22471, 24031, 25673,
27404, 29219, 31124, 33124, 35216, 37403, 39694, 42082, 44571, 47169,
49870, 52676, 55597, 58623, 61758, 65010, 68370, 71841, 75429, 79126,
82933, 86857, 90888, 95025, 99276, 103629, 108084, 112648, 117305,
122057, 126909, 131846, 136867, 141976, 147158, 152411, 157738,
163125, 168564, 174063, 179602, 185178, 190794, 196430, 202082,
207753, 213423, 219087, 224746, 230381, 235985, 241562, 247090,
252561, 257980, 263325, 268588, 273774, 278859, 283837, 288713,
293463, 298083, 302573, 306916, 311103, 315140, 319006, 322694,
326211, 329537, 332666, 335607, 338337, 340855, 343168, 345259,
347123, 348770, 350184, 351362, 352315, 353029, 353500, 353743,
353743, 353500, 353029, 352315, 351362, 350184, 348770, 347123,
345259, 343168, 340855, 338337, 335607, 332666, 329537, 326211,
322694, 319006, 315140, 311103, 306916, 302573, 298083, 293463,
288713, 283837, 278859, 273774, 268588, 263325, 257980, 252561,
247090, 241562, 235985, 230381, 224746, 219087, 213423, 207753,
202082, 196430, 190794, 185178, 179602, 174063, 168564, 163125,
157738, 152411, 147158, 141976, 136867, 131846, 126909, 122057,
117305, 112648, 108084, 103629, 99276, 95025, 90888, 86857, 82933,
79126, 75429, 71841, 68370, 65010, 61758, 58623, 55597, 52676, 49870,
47169, 44571, 42082, 39694, 37403, 35216, 33124, 31124, 29219, 27404,
25673, 24031, 22471, 20987, 19585, 18257, 16998, 15813, 14693, 13635,
12642, 11708, 10828, 10005, 9234, 8509, 7835, 7205, 6615, 6068, 5559,
5084, 4646, 4239, 3861, 3514, 3193, 2896, 2624, 2374, 2143, 1933,
1741, 1563, 1403, 1257, 1123, 1003, 894, 794, 705, 625, 552, 487,
429, 376, 330, 289, 251, 219, 190, 164, 142, 122, 104, 89, 76, 64,
54, 46, 38, 32, 27, 22, 18, 15, 12, 10, 8, 6, 5, 4, 3, 2, 2, 1, 1, 1]
}

View file

@ -0,0 +1,273 @@
"""Some functions for working with contingency tables (i.e. cross tabulations).
"""
from functools import reduce
import numpy as np
from .stats import power_divergence
__all__ = ['margins', 'expected_freq', 'chi2_contingency']
def margins(a):
"""Return a list of the marginal sums of the array `a`.
Parameters
----------
a : ndarray
The array for which to compute the marginal sums.
Returns
-------
margsums : list of ndarrays
A list of length `a.ndim`. `margsums[k]` is the result
of summing `a` over all axes except `k`; it has the same
number of dimensions as `a`, but the length of each axis
except axis `k` will be 1.
Examples
--------
>>> a = np.arange(12).reshape(2, 6)
>>> a
array([[ 0, 1, 2, 3, 4, 5],
[ 6, 7, 8, 9, 10, 11]])
>>> from scipy.stats.contingency import margins
>>> m0, m1 = margins(a)
>>> m0
array([[15],
[51]])
>>> m1
array([[ 6, 8, 10, 12, 14, 16]])
>>> b = np.arange(24).reshape(2,3,4)
>>> m0, m1, m2 = margins(b)
>>> m0
array([[[ 66]],
[[210]]])
>>> m1
array([[[ 60],
[ 92],
[124]]])
>>> m2
array([[[60, 66, 72, 78]]])
"""
margsums = []
ranged = list(range(a.ndim))
for k in ranged:
marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k])
margsums.append(marg)
return margsums
def expected_freq(observed):
"""
Compute the expected frequencies from a contingency table.
Given an n-dimensional contingency table of observed frequencies,
compute the expected frequencies for the table based on the marginal
sums under the assumption that the groups associated with each
dimension are independent.
Parameters
----------
observed : array_like
The table of observed frequencies. (While this function can handle
a 1-D array, that case is trivial. Generally `observed` is at
least 2-D.)
Returns
-------
expected : ndarray of float64
The expected frequencies, based on the marginal sums of the table.
Same shape as `observed`.
Examples
--------
>>> observed = np.array([[10, 10, 20],[20, 20, 20]])
>>> from scipy.stats.contingency import expected_freq
>>> expected_freq(observed)
array([[ 12., 12., 16.],
[ 18., 18., 24.]])
"""
# Typically `observed` is an integer array. If `observed` has a large
# number of dimensions or holds large values, some of the following
# computations may overflow, so we first switch to floating point.
observed = np.asarray(observed, dtype=np.float64)
# Create a list of the marginal sums.
margsums = margins(observed)
# Create the array of expected frequencies. The shapes of the
# marginal sums returned by apply_over_axes() are just what we
# need for broadcasting in the following product.
d = observed.ndim
expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
return expected
def chi2_contingency(observed, correction=True, lambda_=None):
"""Chi-square test of independence of variables in a contingency table.
This function computes the chi-square statistic and p-value for the
hypothesis test of independence of the observed frequencies in the
contingency table [1]_ `observed`. The expected frequencies are computed
based on the marginal sums under the assumption of independence; see
`scipy.stats.contingency.expected_freq`. The number of degrees of
freedom is (expressed using numpy functions and attributes)::
dof = observed.size - sum(observed.shape) + observed.ndim - 1
Parameters
----------
observed : array_like
The contingency table. The table contains the observed frequencies
(i.e. number of occurrences) in each category. In the two-dimensional
case, the table is often described as an "R x C table".
correction : bool, optional
If True, *and* the degrees of freedom is 1, apply Yates' correction
for continuity. The effect of the correction is to adjust each
observed value by 0.5 towards the corresponding expected value.
lambda_ : float or str, optional.
By default, the statistic computed in this test is Pearson's
chi-squared statistic [2]_. `lambda_` allows a statistic from the
Cressie-Read power divergence family [3]_ to be used instead. See
`power_divergence` for details.
Returns
-------
chi2 : float
The test statistic.
p : float
The p-value of the test
dof : int
Degrees of freedom
expected : ndarray, same shape as `observed`
The expected frequencies, based on the marginal sums of the table.
See Also
--------
contingency.expected_freq
fisher_exact
chisquare
power_divergence
Notes
-----
An often quoted guideline for the validity of this calculation is that
the test should be used only if the observed and expected frequencies
in each cell are at least 5.
This is a test for the independence of different categories of a
population. The test is only meaningful when the dimension of
`observed` is two or more. Applying the test to a one-dimensional
table will always result in `expected` equal to `observed` and a
chi-square statistic equal to 0.
This function does not handle masked arrays, because the calculation
does not make sense with missing values.
Like stats.chisquare, this function computes a chi-square statistic;
the convenience this function provides is to figure out the expected
frequencies and degrees of freedom from the given contingency table.
If these were already known, and if the Yates' correction was not
required, one could use stats.chisquare. That is, if one calls::
chi2, p, dof, ex = chi2_contingency(obs, correction=False)
then the following is true::
(chi2, p) == stats.chisquare(obs.ravel(), f_exp=ex.ravel(),
ddof=obs.size - 1 - dof)
The `lambda_` argument was added in version 0.13.0 of scipy.
References
----------
.. [1] "Contingency table",
https://en.wikipedia.org/wiki/Contingency_table
.. [2] "Pearson's chi-squared test",
https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
.. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
pp. 440-464.
Examples
--------
A two-way example (2 x 3):
>>> from scipy.stats import chi2_contingency
>>> obs = np.array([[10, 10, 20], [20, 20, 20]])
>>> chi2_contingency(obs)
(2.7777777777777777,
0.24935220877729619,
2,
array([[ 12., 12., 16.],
[ 18., 18., 24.]]))
Perform the test using the log-likelihood ratio (i.e. the "G-test")
instead of Pearson's chi-squared statistic.
>>> g, p, dof, expctd = chi2_contingency(obs, lambda_="log-likelihood")
>>> g, p
(2.7688587616781319, 0.25046668010954165)
A four-way example (2 x 2 x 2 x 2):
>>> obs = np.array(
... [[[[12, 17],
... [11, 16]],
... [[11, 12],
... [15, 16]]],
... [[[23, 15],
... [30, 22]],
... [[14, 17],
... [15, 16]]]])
>>> chi2_contingency(obs)
(8.7584514426741897,
0.64417725029295503,
11,
array([[[[ 14.15462386, 14.15462386],
[ 16.49423111, 16.49423111]],
[[ 11.2461395 , 11.2461395 ],
[ 13.10500554, 13.10500554]]],
[[[ 19.5591166 , 19.5591166 ],
[ 22.79202844, 22.79202844]],
[[ 15.54012004, 15.54012004],
[ 18.10873492, 18.10873492]]]]))
"""
observed = np.asarray(observed)
if np.any(observed < 0):
raise ValueError("All values in `observed` must be nonnegative.")
if observed.size == 0:
raise ValueError("No data; `observed` has size 0.")
expected = expected_freq(observed)
if np.any(expected == 0):
# Include one of the positions where expected is zero in
# the exception message.
zeropos = list(zip(*np.nonzero(expected == 0)))[0]
raise ValueError("The internally computed table of expected "
"frequencies has a zero element at %s." % (zeropos,))
# The degrees of freedom
dof = expected.size - sum(expected.shape) + expected.ndim - 1
if dof == 0:
# Degenerate case; this occurs when `observed` is 1D (or, more
# generally, when it has only one nontrivial dimension). In this
# case, we also have observed == expected, so chi2 is 0.
chi2 = 0.0
p = 1.0
else:
if dof == 1 and correction:
# Adjust `observed` according to Yates' correction for continuity.
observed = observed + 0.5 * np.sign(expected - observed)
chi2, p = power_divergence(observed, expected,
ddof=observed.size - 1 - dof, axis=None,
lambda_=lambda_)
return chi2, p, dof, expected

View file

@ -0,0 +1,22 @@
#
# Author: Travis Oliphant 2002-2011 with contributions from
# SciPy Developers 2004-2011
#
# NOTE: To look at history using `git blame`, use `git blame -M -C -C`
# instead of `git blame -Lxxx,+x`.
#
from ._distn_infrastructure import (entropy, rv_discrete, rv_continuous,
rv_frozen)
from . import _continuous_distns
from . import _discrete_distns
from ._continuous_distns import *
from ._discrete_distns import *
# For backwards compatibility e.g. pymc expects distributions.__all__.
__all__ = ['entropy', 'rv_discrete', 'rv_continuous', 'rv_histogram']
# Add only the distribution names, not the *_gen names.
__all__ += _continuous_distns._distn_names
__all__ += _discrete_distns._distn_names

View file

@ -0,0 +1,639 @@
#-------------------------------------------------------------------------------
#
# Define classes for (uni/multi)-variate kernel density estimation.
#
# Currently, only Gaussian kernels are implemented.
#
# Written by: Robert Kern
#
# Date: 2004-08-09
#
# Modified: 2005-02-10 by Robert Kern.
# Contributed to SciPy
# 2005-10-07 by Robert Kern.
# Some fixes to match the new scipy_core
#
# Copyright 2004-2005 by Enthought, Inc.
#
#-------------------------------------------------------------------------------
# Standard library imports.
import warnings
# SciPy imports.
from scipy import linalg, special
from scipy.special import logsumexp
from scipy._lib._util import check_random_state
from numpy import (asarray, atleast_2d, reshape, zeros, newaxis, dot, exp, pi,
sqrt, ravel, power, atleast_1d, squeeze, sum, transpose,
ones, cov)
import numpy as np
# Local imports.
from . import mvn
from ._stats import gaussian_kernel_estimate
__all__ = ['gaussian_kde']
class gaussian_kde(object):
"""Representation of a kernel-density estimate using Gaussian kernels.
Kernel density estimation is a way to estimate the probability density
function (PDF) of a random variable in a non-parametric way.
`gaussian_kde` works for both uni-variate and multi-variate data. It
includes automatic bandwidth determination. The estimation works best for
a unimodal distribution; bimodal or multi-modal distributions tend to be
oversmoothed.
Parameters
----------
dataset : array_like
Datapoints to estimate from. In case of univariate data this is a 1-D
array, otherwise a 2-D array with shape (# of dims, # of data).
bw_method : str, scalar or callable, optional
The method used to calculate the estimator bandwidth. This can be
'scott', 'silverman', a scalar constant or a callable. If a scalar,
this will be used directly as `kde.factor`. If a callable, it should
take a `gaussian_kde` instance as only parameter and return a scalar.
If None (default), 'scott' is used. See Notes for more details.
weights : array_like, optional
weights of datapoints. This must be the same shape as dataset.
If None (default), the samples are assumed to be equally weighted
Attributes
----------
dataset : ndarray
The dataset with which `gaussian_kde` was initialized.
d : int
Number of dimensions.
n : int
Number of datapoints.
neff : int
Effective number of datapoints.
.. versionadded:: 1.2.0
factor : float
The bandwidth factor, obtained from `kde.covariance_factor`, with which
the covariance matrix is multiplied.
covariance : ndarray
The covariance matrix of `dataset`, scaled by the calculated bandwidth
(`kde.factor`).
inv_cov : ndarray
The inverse of `covariance`.
Methods
-------
evaluate
__call__
integrate_gaussian
integrate_box_1d
integrate_box
integrate_kde
pdf
logpdf
resample
set_bandwidth
covariance_factor
Notes
-----
Bandwidth selection strongly influences the estimate obtained from the KDE
(much more so than the actual shape of the kernel). Bandwidth selection
can be done by a "rule of thumb", by cross-validation, by "plug-in
methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
uses a rule of thumb, the default is Scott's Rule.
Scott's Rule [1]_, implemented as `scotts_factor`, is::
n**(-1./(d+4)),
with ``n`` the number of data points and ``d`` the number of dimensions.
In the case of unequally weighted points, `scotts_factor` becomes::
neff**(-1./(d+4)),
with ``neff`` the effective number of datapoints.
Silverman's Rule [2]_, implemented as `silverman_factor`, is::
(n * (d + 2) / 4.)**(-1. / (d + 4)).
or in the case of unequally weighted points::
(neff * (d + 2) / 4.)**(-1. / (d + 4)).
Good general descriptions of kernel density estimation can be found in [1]_
and [2]_, the mathematics for this multi-dimensional implementation can be
found in [1]_.
With a set of weighted samples, the effective number of datapoints ``neff``
is defined by::
neff = sum(weights)^2 / sum(weights^2)
as detailed in [5]_.
References
----------
.. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
Visualization", John Wiley & Sons, New York, Chicester, 1992.
.. [2] B.W. Silverman, "Density Estimation for Statistics and Data
Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
Chapman and Hall, London, 1986.
.. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
.. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
conditional density estimation", Computational Statistics & Data
Analysis, Vol. 36, pp. 279-298, 2001.
.. [5] Gray P. G., 1969, Journal of the Royal Statistical Society.
Series A (General), 132, 272
Examples
--------
Generate some random two-dimensional data:
>>> from scipy import stats
>>> def measure(n):
... "Measurement model, return two coupled measurements."
... m1 = np.random.normal(size=n)
... m2 = np.random.normal(scale=0.5, size=n)
... return m1+m2, m1-m2
>>> m1, m2 = measure(2000)
>>> xmin = m1.min()
>>> xmax = m1.max()
>>> ymin = m2.min()
>>> ymax = m2.max()
Perform a kernel density estimate on the data:
>>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
>>> positions = np.vstack([X.ravel(), Y.ravel()])
>>> values = np.vstack([m1, m2])
>>> kernel = stats.gaussian_kde(values)
>>> Z = np.reshape(kernel(positions).T, X.shape)
Plot the results:
>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots()
>>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
... extent=[xmin, xmax, ymin, ymax])
>>> ax.plot(m1, m2, 'k.', markersize=2)
>>> ax.set_xlim([xmin, xmax])
>>> ax.set_ylim([ymin, ymax])
>>> plt.show()
"""
def __init__(self, dataset, bw_method=None, weights=None):
self.dataset = atleast_2d(asarray(dataset))
if not self.dataset.size > 1:
raise ValueError("`dataset` input should have multiple elements.")
self.d, self.n = self.dataset.shape
if weights is not None:
self._weights = atleast_1d(weights).astype(float)
self._weights /= sum(self._weights)
if self.weights.ndim != 1:
raise ValueError("`weights` input should be one-dimensional.")
if len(self._weights) != self.n:
raise ValueError("`weights` input should be of length n")
self._neff = 1/sum(self._weights**2)
self.set_bandwidth(bw_method=bw_method)
def evaluate(self, points):
"""Evaluate the estimated pdf on a set of points.
Parameters
----------
points : (# of dimensions, # of points)-array
Alternatively, a (# of dimensions,) vector can be passed in and
treated as a single point.
Returns
-------
values : (# of points,)-array
The values at each point.
Raises
------
ValueError : if the dimensionality of the input points is different than
the dimensionality of the KDE.
"""
points = atleast_2d(asarray(points))
d, m = points.shape
if d != self.d:
if d == 1 and m == self.d:
# points was passed in as a row vector
points = reshape(points, (self.d, 1))
m = 1
else:
msg = "points have dimension %s, dataset has dimension %s" % (d,
self.d)
raise ValueError(msg)
output_dtype = np.common_type(self.covariance, points)
itemsize = np.dtype(output_dtype).itemsize
if itemsize == 4:
spec = 'float'
elif itemsize == 8:
spec = 'double'
elif itemsize in (12, 16):
spec = 'long double'
else:
raise TypeError('%s has unexpected item size %d' %
(output_dtype, itemsize))
result = gaussian_kernel_estimate[spec](self.dataset.T, self.weights[:, None],
points.T, self.inv_cov, output_dtype)
return result[:, 0]
__call__ = evaluate
def integrate_gaussian(self, mean, cov):
"""
Multiply estimated density by a multivariate Gaussian and integrate
over the whole space.
Parameters
----------
mean : aray_like
A 1-D array, specifying the mean of the Gaussian.
cov : array_like
A 2-D array, specifying the covariance matrix of the Gaussian.
Returns
-------
result : scalar
The value of the integral.
Raises
------
ValueError
If the mean or covariance of the input Gaussian differs from
the KDE's dimensionality.
"""
mean = atleast_1d(squeeze(mean))
cov = atleast_2d(cov)
if mean.shape != (self.d,):
raise ValueError("mean does not have dimension %s" % self.d)
if cov.shape != (self.d, self.d):
raise ValueError("covariance does not have dimension %s" % self.d)
# make mean a column vector
mean = mean[:, newaxis]
sum_cov = self.covariance + cov
# This will raise LinAlgError if the new cov matrix is not s.p.d
# cho_factor returns (ndarray, bool) where bool is a flag for whether
# or not ndarray is upper or lower triangular
sum_cov_chol = linalg.cho_factor(sum_cov)
diff = self.dataset - mean
tdiff = linalg.cho_solve(sum_cov_chol, diff)
sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
energies = sum(diff * tdiff, axis=0) / 2.0
result = sum(exp(-energies)*self.weights, axis=0) / norm_const
return result
def integrate_box_1d(self, low, high):
"""
Computes the integral of a 1D pdf between two bounds.
Parameters
----------
low : scalar
Lower bound of integration.
high : scalar
Upper bound of integration.
Returns
-------
value : scalar
The result of the integral.
Raises
------
ValueError
If the KDE is over more than one dimension.
"""
if self.d != 1:
raise ValueError("integrate_box_1d() only handles 1D pdfs")
stdev = ravel(sqrt(self.covariance))[0]
normalized_low = ravel((low - self.dataset) / stdev)
normalized_high = ravel((high - self.dataset) / stdev)
value = np.sum(self.weights*(
special.ndtr(normalized_high) -
special.ndtr(normalized_low)))
return value
def integrate_box(self, low_bounds, high_bounds, maxpts=None):
"""Computes the integral of a pdf over a rectangular interval.
Parameters
----------
low_bounds : array_like
A 1-D array containing the lower bounds of integration.
high_bounds : array_like
A 1-D array containing the upper bounds of integration.
maxpts : int, optional
The maximum number of points to use for integration.
Returns
-------
value : scalar
The result of the integral.
"""
if maxpts is not None:
extra_kwds = {'maxpts': maxpts}
else:
extra_kwds = {}
value, inform = mvn.mvnun_weighted(low_bounds, high_bounds,
self.dataset, self.weights,
self.covariance, **extra_kwds)
if inform:
msg = ('An integral in mvn.mvnun requires more points than %s' %
(self.d * 1000))
warnings.warn(msg)
return value
def integrate_kde(self, other):
"""
Computes the integral of the product of this kernel density estimate
with another.
Parameters
----------
other : gaussian_kde instance
The other kde.
Returns
-------
value : scalar
The result of the integral.
Raises
------
ValueError
If the KDEs have different dimensionality.
"""
if other.d != self.d:
raise ValueError("KDEs are not the same dimensionality")
# we want to iterate over the smallest number of points
if other.n < self.n:
small = other
large = self
else:
small = self
large = other
sum_cov = small.covariance + large.covariance
sum_cov_chol = linalg.cho_factor(sum_cov)
result = 0.0
for i in range(small.n):
mean = small.dataset[:, i, newaxis]
diff = large.dataset - mean
tdiff = linalg.cho_solve(sum_cov_chol, diff)
energies = sum(diff * tdiff, axis=0) / 2.0
result += sum(exp(-energies)*large.weights, axis=0)*small.weights[i]
sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
result /= norm_const
return result
def resample(self, size=None, seed=None):
"""
Randomly sample a dataset from the estimated pdf.
Parameters
----------
size : int, optional
The number of samples to draw. If not provided, then the size is
the same as the effective number of samples in the underlying
dataset.
seed : {None, int, `~np.random.RandomState`, `~np.random.Generator`}, optional
This parameter defines the object to use for drawing random
variates.
If `seed` is `None` the `~np.random.RandomState` singleton is used.
If `seed` is an int, a new ``RandomState`` instance is used, seeded
with seed.
If `seed` is already a ``RandomState`` or ``Generator`` instance,
then that object is used.
Default is None.
Specify `seed` for reproducible drawing of random variates.
Returns
-------
resample : (self.d, `size`) ndarray
The sampled dataset.
"""
if size is None:
size = int(self.neff)
random_state = check_random_state(seed)
norm = transpose(random_state.multivariate_normal(
zeros((self.d,), float), self.covariance, size=size
))
indices = random_state.choice(self.n, size=size, p=self.weights)
means = self.dataset[:, indices]
return means + norm
def scotts_factor(self):
"""Compute Scott's factor.
Returns
-------
s : float
Scott's factor.
"""
return power(self.neff, -1./(self.d+4))
def silverman_factor(self):
"""Compute the Silverman factor.
Returns
-------
s : float
The silverman factor.
"""
return power(self.neff*(self.d+2.0)/4.0, -1./(self.d+4))
# Default method to calculate bandwidth, can be overwritten by subclass
covariance_factor = scotts_factor
covariance_factor.__doc__ = """Computes the coefficient (`kde.factor`) that
multiplies the data covariance matrix to obtain the kernel covariance
matrix. The default is `scotts_factor`. A subclass can overwrite this
method to provide a different method, or set it through a call to
`kde.set_bandwidth`."""
def set_bandwidth(self, bw_method=None):
"""Compute the estimator bandwidth with given method.
The new bandwidth calculated after a call to `set_bandwidth` is used
for subsequent evaluations of the estimated density.
Parameters
----------
bw_method : str, scalar or callable, optional
The method used to calculate the estimator bandwidth. This can be
'scott', 'silverman', a scalar constant or a callable. If a
scalar, this will be used directly as `kde.factor`. If a callable,
it should take a `gaussian_kde` instance as only parameter and
return a scalar. If None (default), nothing happens; the current
`kde.covariance_factor` method is kept.
Notes
-----
.. versionadded:: 0.11
Examples
--------
>>> import scipy.stats as stats
>>> x1 = np.array([-7, -5, 1, 4, 5.])
>>> kde = stats.gaussian_kde(x1)
>>> xs = np.linspace(-10, 10, num=50)
>>> y1 = kde(xs)
>>> kde.set_bandwidth(bw_method='silverman')
>>> y2 = kde(xs)
>>> kde.set_bandwidth(bw_method=kde.factor / 3.)
>>> y3 = kde(xs)
>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots()
>>> ax.plot(x1, np.full(x1.shape, 1 / (4. * x1.size)), 'bo',
... label='Data points (rescaled)')
>>> ax.plot(xs, y1, label='Scott (default)')
>>> ax.plot(xs, y2, label='Silverman')
>>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
>>> ax.legend()
>>> plt.show()
"""
if bw_method is None:
pass
elif bw_method == 'scott':
self.covariance_factor = self.scotts_factor
elif bw_method == 'silverman':
self.covariance_factor = self.silverman_factor
elif np.isscalar(bw_method) and not isinstance(bw_method, str):
self._bw_method = 'use constant'
self.covariance_factor = lambda: bw_method
elif callable(bw_method):
self._bw_method = bw_method
self.covariance_factor = lambda: self._bw_method(self)
else:
msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
"or a callable."
raise ValueError(msg)
self._compute_covariance()
def _compute_covariance(self):
"""Computes the covariance matrix for each Gaussian kernel using
covariance_factor().
"""
self.factor = self.covariance_factor()
# Cache covariance and inverse covariance of the data
if not hasattr(self, '_data_inv_cov'):
self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
bias=False,
aweights=self.weights))
self._data_inv_cov = linalg.inv(self._data_covariance)
self.covariance = self._data_covariance * self.factor**2
self.inv_cov = self._data_inv_cov / self.factor**2
self._norm_factor = sqrt(linalg.det(2*pi*self.covariance))
def pdf(self, x):
"""
Evaluate the estimated pdf on a provided set of points.
Notes
-----
This is an alias for `gaussian_kde.evaluate`. See the ``evaluate``
docstring for more details.
"""
return self.evaluate(x)
def logpdf(self, x):
"""
Evaluate the log of the estimated pdf on a provided set of points.
"""
points = atleast_2d(x)
d, m = points.shape
if d != self.d:
if d == 1 and m == self.d:
# points was passed in as a row vector
points = reshape(points, (self.d, 1))
m = 1
else:
msg = "points have dimension %s, dataset has dimension %s" % (d,
self.d)
raise ValueError(msg)
if m >= self.n:
# there are more points than data, so loop over data
energy = zeros((self.n, m), dtype=float)
for i in range(self.n):
diff = self.dataset[:, i, newaxis] - points
tdiff = dot(self.inv_cov, diff)
energy[i] = sum(diff*tdiff, axis=0) / 2.0
result = logsumexp(-energy.T,
b=self.weights / self._norm_factor, axis=1)
else:
# loop over points
result = zeros((m,), dtype=float)
for i in range(m):
diff = self.dataset - points[:, i, newaxis]
tdiff = dot(self.inv_cov, diff)
energy = sum(diff * tdiff, axis=0) / 2.0
result[i] = logsumexp(-energy, b=self.weights /
self._norm_factor)
return result
@property
def weights(self):
try:
return self._weights
except AttributeError:
self._weights = ones(self.n)/self.n
return self._weights
@property
def neff(self):
try:
return self._neff
except AttributeError:
self._neff = 1/sum(self.weights**2)
return self._neff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,135 @@
"""
===================================================================
Statistical functions for masked arrays (:mod:`scipy.stats.mstats`)
===================================================================
.. currentmodule:: scipy.stats.mstats
This module contains a large number of statistical functions that can
be used with masked arrays.
Most of these functions are similar to those in `scipy.stats` but might
have small differences in the API or in the algorithm used. Since this
is a relatively new package, some API changes are still possible.
Summary statistics
==================
.. autosummary::
:toctree: generated/
describe
gmean
hmean
kurtosis
mode
mquantiles
hdmedian
hdquantiles
hdquantiles_sd
idealfourths
plotting_positions
meppf
moment
skew
tmean
tvar
tmin
tmax
tsem
variation
find_repeats
sem
trimmed_mean
trimmed_mean_ci
trimmed_std
trimmed_var
Frequency statistics
====================
.. autosummary::
:toctree: generated/
scoreatpercentile
Correlation functions
=====================
.. autosummary::
:toctree: generated/
f_oneway
pearsonr
spearmanr
pointbiserialr
kendalltau
kendalltau_seasonal
linregress
siegelslopes
theilslopes
sen_seasonal_slopes
Statistical tests
=================
.. autosummary::
:toctree: generated/
ttest_1samp
ttest_onesamp
ttest_ind
ttest_rel
chisquare
kstest
ks_2samp
ks_1samp
ks_twosamp
mannwhitneyu
rankdata
kruskal
kruskalwallis
friedmanchisquare
brunnermunzel
skewtest
kurtosistest
normaltest
Transformations
===============
.. autosummary::
:toctree: generated/
obrientransform
trim
trima
trimmed_stde
trimr
trimtail
trimboth
winsorize
zmap
zscore
Other
=====
.. autosummary::
:toctree: generated/
argstoarray
count_tied_groups
msign
compare_medians_ms
median_cihs
mjci
mquantiles_cimj
rsh
"""
from .mstats_basic import *
from .mstats_extras import *
# Functions that support masked array input in stats but need to be kept in the
# mstats namespace for backwards compatibility:
from scipy.stats import gmean, hmean, zmap, zscore, chisquare

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,474 @@
"""
Additional statistics functions with support for masked arrays.
"""
# Original author (2007): Pierre GF Gerard-Marchant
__all__ = ['compare_medians_ms',
'hdquantiles', 'hdmedian', 'hdquantiles_sd',
'idealfourths',
'median_cihs','mjci','mquantiles_cimj',
'rsh',
'trimmed_mean_ci',]
import numpy as np
from numpy import float_, int_, ndarray
import numpy.ma as ma
from numpy.ma import MaskedArray
from . import mstats_basic as mstats
from scipy.stats.distributions import norm, beta, t, binom
def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
"""
Computes quantile estimates with the Harrell-Davis method.
The quantile estimates are calculated as a weighted linear combination
of order statistics.
Parameters
----------
data : array_like
Data array.
prob : sequence, optional
Sequence of quantiles to compute.
axis : int or None, optional
Axis along which to compute the quantiles. If None, use a flattened
array.
var : bool, optional
Whether to return the variance of the estimate.
Returns
-------
hdquantiles : MaskedArray
A (p,) array of quantiles (if `var` is False), or a (2,p) array of
quantiles and variances (if `var` is True), where ``p`` is the
number of quantiles.
See Also
--------
hdquantiles_sd
"""
def _hd_1D(data,prob,var):
"Computes the HD quantiles for a 1D array. Returns nan for invalid data."
xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
# Don't use length here, in case we have a numpy scalar
n = xsorted.size
hd = np.empty((2,len(prob)), float_)
if n < 2:
hd.flat = np.nan
if var:
return hd
return hd[0]
v = np.arange(n+1) / float(n)
betacdf = beta.cdf
for (i,p) in enumerate(prob):
_w = betacdf(v, (n+1)*p, (n+1)*(1-p))
w = _w[1:] - _w[:-1]
hd_mean = np.dot(w, xsorted)
hd[0,i] = hd_mean
#
hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)
#
hd[0, prob == 0] = xsorted[0]
hd[0, prob == 1] = xsorted[-1]
if var:
hd[1, prob == 0] = hd[1, prob == 1] = np.nan
return hd
return hd[0]
# Initialization & checks
data = ma.array(data, copy=False, dtype=float_)
p = np.array(prob, copy=False, ndmin=1)
# Computes quantiles along axis (or globally)
if (axis is None) or (data.ndim == 1):
result = _hd_1D(data, p, var)
else:
if data.ndim > 2:
raise ValueError("Array 'data' must be at most two dimensional, "
"but got data.ndim = %d" % data.ndim)
result = ma.apply_along_axis(_hd_1D, axis, data, p, var)
return ma.fix_invalid(result, copy=False)
def hdmedian(data, axis=-1, var=False):
"""
Returns the Harrell-Davis estimate of the median along the given axis.
Parameters
----------
data : ndarray
Data array.
axis : int, optional
Axis along which to compute the quantiles. If None, use a flattened
array.
var : bool, optional
Whether to return the variance of the estimate.
Returns
-------
hdmedian : MaskedArray
The median values. If ``var=True``, the variance is returned inside
the masked array. E.g. for a 1-D array the shape change from (1,) to
(2,).
"""
result = hdquantiles(data,[0.5], axis=axis, var=var)
return result.squeeze()
def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
"""
The standard error of the Harrell-Davis quantile estimates by jackknife.
Parameters
----------
data : array_like
Data array.
prob : sequence, optional
Sequence of quantiles to compute.
axis : int, optional
Axis along which to compute the quantiles. If None, use a flattened
array.
Returns
-------
hdquantiles_sd : MaskedArray
Standard error of the Harrell-Davis quantile estimates.
See Also
--------
hdquantiles
"""
def _hdsd_1D(data, prob):
"Computes the std error for 1D arrays."
xsorted = np.sort(data.compressed())
n = len(xsorted)
hdsd = np.empty(len(prob), float_)
if n < 2:
hdsd.flat = np.nan
vv = np.arange(n) / float(n-1)
betacdf = beta.cdf
for (i,p) in enumerate(prob):
_w = betacdf(vv, (n+1)*p, (n+1)*(1-p))
w = _w[1:] - _w[:-1]
mx_ = np.fromiter([np.dot(w,xsorted[np.r_[list(range(0,k)),
list(range(k+1,n))].astype(int_)])
for k in range(n)], dtype=float_)
mx_var = np.array(mx_.var(), copy=False, ndmin=1) * n / float(n-1)
hdsd[i] = float(n-1) * np.sqrt(np.diag(mx_var).diagonal() / float(n))
return hdsd
# Initialization & checks
data = ma.array(data, copy=False, dtype=float_)
p = np.array(prob, copy=False, ndmin=1)
# Computes quantiles along axis (or globally)
if (axis is None):
result = _hdsd_1D(data, p)
else:
if data.ndim > 2:
raise ValueError("Array 'data' must be at most two dimensional, "
"but got data.ndim = %d" % data.ndim)
result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
return ma.fix_invalid(result, copy=False).ravel()
def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
alpha=0.05, axis=None):
"""
Selected confidence interval of the trimmed mean along the given axis.
Parameters
----------
data : array_like
Input data.
limits : {None, tuple}, optional
None or a two item tuple.
Tuple of the percentages to cut on each side of the array, with respect
to the number of unmasked data, as floats between 0. and 1. If ``n``
is the number of unmasked data before trimming, then
(``n * limits[0]``)th smallest data and (``n * limits[1]``)th
largest data are masked. The total number of unmasked data after
trimming is ``n * (1. - sum(limits))``.
The value of one limit can be set to None to indicate an open interval.
Defaults to (0.2, 0.2).
inclusive : (2,) tuple of boolean, optional
If relative==False, tuple indicating whether values exactly equal to
the absolute limits are allowed.
If relative==True, tuple indicating whether the number of data being
masked on each side should be rounded (True) or truncated (False).
Defaults to (True, True).
alpha : float, optional
Confidence level of the intervals.
Defaults to 0.05.
axis : int, optional
Axis along which to cut. If None, uses a flattened version of `data`.
Defaults to None.
Returns
-------
trimmed_mean_ci : (2,) ndarray
The lower and upper confidence intervals of the trimmed data.
"""
data = ma.array(data, copy=False)
trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
tmean = trimmed.mean(axis)
tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)
df = trimmed.count(axis) - 1
tppf = t.ppf(1-alpha/2.,df)
return np.array((tmean - tppf*tstde, tmean+tppf*tstde))
def mjci(data, prob=[0.25,0.5,0.75], axis=None):
"""
Returns the Maritz-Jarrett estimators of the standard error of selected
experimental quantiles of the data.
Parameters
----------
data : ndarray
Data array.
prob : sequence, optional
Sequence of quantiles to compute.
axis : int or None, optional
Axis along which to compute the quantiles. If None, use a flattened
array.
"""
def _mjci_1D(data, p):
data = np.sort(data.compressed())
n = data.size
prob = (np.array(p) * n + 0.5).astype(int_)
betacdf = beta.cdf
mj = np.empty(len(prob), float_)
x = np.arange(1,n+1, dtype=float_) / n
y = x - 1./n
for (i,m) in enumerate(prob):
W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
C1 = np.dot(W,data)
C2 = np.dot(W,data**2)
mj[i] = np.sqrt(C2 - C1**2)
return mj
data = ma.array(data, copy=False)
if data.ndim > 2:
raise ValueError("Array 'data' must be at most two dimensional, "
"but got data.ndim = %d" % data.ndim)
p = np.array(prob, copy=False, ndmin=1)
# Computes quantiles along axis (or globally)
if (axis is None):
return _mjci_1D(data, p)
else:
return ma.apply_along_axis(_mjci_1D, axis, data, p)
def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
"""
Computes the alpha confidence interval for the selected quantiles of the
data, with Maritz-Jarrett estimators.
Parameters
----------
data : ndarray
Data array.
prob : sequence, optional
Sequence of quantiles to compute.
alpha : float, optional
Confidence level of the intervals.
axis : int or None, optional
Axis along which to compute the quantiles.
If None, use a flattened array.
Returns
-------
ci_lower : ndarray
The lower boundaries of the confidence interval. Of the same length as
`prob`.
ci_upper : ndarray
The upper boundaries of the confidence interval. Of the same length as
`prob`.
"""
alpha = min(alpha, 1 - alpha)
z = norm.ppf(1 - alpha/2.)
xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
smj = mjci(data, prob, axis=axis)
return (xq - z * smj, xq + z * smj)
def median_cihs(data, alpha=0.05, axis=None):
"""
Computes the alpha-level confidence interval for the median of the data.
Uses the Hettmasperger-Sheather method.
Parameters
----------
data : array_like
Input data. Masked values are discarded. The input should be 1D only,
or `axis` should be set to None.
alpha : float, optional
Confidence level of the intervals.
axis : int or None, optional
Axis along which to compute the quantiles. If None, use a flattened
array.
Returns
-------
median_cihs
Alpha level confidence interval.
"""
def _cihs_1D(data, alpha):
data = np.sort(data.compressed())
n = len(data)
alpha = min(alpha, 1-alpha)
k = int(binom._ppf(alpha/2., n, 0.5))
gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
if gk < 1-alpha:
k -= 1
gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
I = (gk - 1 + alpha)/(gk - gkk)
lambd = (n-k) * I / float(k + (n-2*k)*I)
lims = (lambd*data[k] + (1-lambd)*data[k-1],
lambd*data[n-k-1] + (1-lambd)*data[n-k])
return lims
data = ma.array(data, copy=False)
# Computes quantiles along axis (or globally)
if (axis is None):
result = _cihs_1D(data, alpha)
else:
if data.ndim > 2:
raise ValueError("Array 'data' must be at most two dimensional, "
"but got data.ndim = %d" % data.ndim)
result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
return result
def compare_medians_ms(group_1, group_2, axis=None):
"""
Compares the medians from two independent groups along the given axis.
The comparison is performed using the McKean-Schrader estimate of the
standard error of the medians.
Parameters
----------
group_1 : array_like
First dataset. Has to be of size >=7.
group_2 : array_like
Second dataset. Has to be of size >=7.
axis : int, optional
Axis along which the medians are estimated. If None, the arrays are
flattened. If `axis` is not None, then `group_1` and `group_2`
should have the same shape.
Returns
-------
compare_medians_ms : {float, ndarray}
If `axis` is None, then returns a float, otherwise returns a 1-D
ndarray of floats with a length equal to the length of `group_1`
along `axis`.
"""
(med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
(std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
mstats.stde_median(group_2, axis=axis))
W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
return 1 - norm.cdf(W)
def idealfourths(data, axis=None):
"""
Returns an estimate of the lower and upper quartiles.
Uses the ideal fourths algorithm.
Parameters
----------
data : array_like
Input array.
axis : int, optional
Axis along which the quartiles are estimated. If None, the arrays are
flattened.
Returns
-------
idealfourths : {list of floats, masked array}
Returns the two internal values that divide `data` into four parts
using the ideal fourths algorithm either along the flattened array
(if `axis` is None) or along `axis` of `data`.
"""
def _idf(data):
x = data.compressed()
n = len(x)
if n < 3:
return [np.nan,np.nan]
(j,h) = divmod(n/4. + 5/12.,1)
j = int(j)
qlo = (1-h)*x[j-1] + h*x[j]
k = n - j
qup = (1-h)*x[k] + h*x[k-1]
return [qlo, qup]
data = ma.sort(data, axis=axis).view(MaskedArray)
if (axis is None):
return _idf(data)
else:
return ma.apply_along_axis(_idf, axis, data)
def rsh(data, points=None):
"""
Evaluates Rosenblatt's shifted histogram estimators for each data point.
Rosenblatt's estimator is a centered finite-difference approximation to the
derivative of the empirical cumulative distribution function.
Parameters
----------
data : sequence
Input data, should be 1-D. Masked values are ignored.
points : sequence or None, optional
Sequence of points where to evaluate Rosenblatt shifted histogram.
If None, use the data.
"""
data = ma.array(data, copy=False)
if points is None:
points = data
else:
points = np.array(points, copy=False, ndmin=1)
if data.ndim != 1:
raise AttributeError("The input array should be 1D only !")
n = data.count()
r = idealfourths(data, axis=None)
h = 1.2 * (r[-1]-r[0]) / n**(1./5)
nhi = (data[:,None] <= points[None,:] + h).sum(0)
nlo = (data[:,None] < points[None,:] - h).sum(0)
return (nhi-nlo) / (2.*n*h)

Binary file not shown.

View file

@ -0,0 +1,36 @@
from os.path import join
def configuration(parent_package='',top_path=None):
from numpy.distutils.misc_util import Configuration
config = Configuration('stats', parent_package, top_path)
config.add_data_dir('tests')
statlib_src = [join('statlib', '*.f')]
config.add_library('statlib', sources=statlib_src)
# add statlib module
config.add_extension('statlib',
sources=['statlib.pyf'],
f2py_options=['--no-wrap-functions'],
libraries=['statlib'],
depends=statlib_src
)
# add _stats module
config.add_extension('_stats',
sources=['_stats.c'],
)
# add mvn module
config.add_extension('mvn',
sources=['mvn.pyf','mvndst.f'],
)
return config
if __name__ == '__main__':
from numpy.distutils.core import setup
setup(**configuration(top_path='').todict())

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,324 @@
import pickle
import numpy as np
import numpy.testing as npt
from numpy.testing import assert_allclose, assert_equal, suppress_warnings
from pytest import raises as assert_raises
import numpy.ma.testutils as ma_npt
from scipy._lib._util import getfullargspec_no_self as _getfullargspec
from scipy import stats
def check_named_results(res, attributes, ma=False):
for i, attr in enumerate(attributes):
if ma:
ma_npt.assert_equal(res[i], getattr(res, attr))
else:
npt.assert_equal(res[i], getattr(res, attr))
def check_normalization(distfn, args, distname):
norm_moment = distfn.moment(0, *args)
npt.assert_allclose(norm_moment, 1.0)
# this is a temporary plug: either ncf or expect is problematic;
# best be marked as a knownfail, but I've no clue how to do it.
if distname == "ncf":
atol, rtol = 1e-5, 0
else:
atol, rtol = 1e-7, 1e-7
normalization_expect = distfn.expect(lambda x: 1, args=args)
npt.assert_allclose(normalization_expect, 1.0, atol=atol, rtol=rtol,
err_msg=distname, verbose=True)
_a, _b = distfn.support(*args)
normalization_cdf = distfn.cdf(_b, *args)
npt.assert_allclose(normalization_cdf, 1.0)
def check_moment(distfn, arg, m, v, msg):
m1 = distfn.moment(1, *arg)
m2 = distfn.moment(2, *arg)
if not np.isinf(m):
npt.assert_almost_equal(m1, m, decimal=10, err_msg=msg +
' - 1st moment')
else: # or np.isnan(m1),
npt.assert_(np.isinf(m1),
msg + ' - 1st moment -infinite, m1=%s' % str(m1))
if not np.isinf(v):
npt.assert_almost_equal(m2 - m1 * m1, v, decimal=10, err_msg=msg +
' - 2ndt moment')
else: # or np.isnan(m2),
npt.assert_(np.isinf(m2),
msg + ' - 2nd moment -infinite, m2=%s' % str(m2))
def check_mean_expect(distfn, arg, m, msg):
if np.isfinite(m):
m1 = distfn.expect(lambda x: x, arg)
npt.assert_almost_equal(m1, m, decimal=5, err_msg=msg +
' - 1st moment (expect)')
def check_var_expect(distfn, arg, m, v, msg):
if np.isfinite(v):
m2 = distfn.expect(lambda x: x*x, arg)
npt.assert_almost_equal(m2, v + m*m, decimal=5, err_msg=msg +
' - 2st moment (expect)')
def check_skew_expect(distfn, arg, m, v, s, msg):
if np.isfinite(s):
m3e = distfn.expect(lambda x: np.power(x-m, 3), arg)
npt.assert_almost_equal(m3e, s * np.power(v, 1.5),
decimal=5, err_msg=msg + ' - skew')
else:
npt.assert_(np.isnan(s))
def check_kurt_expect(distfn, arg, m, v, k, msg):
if np.isfinite(k):
m4e = distfn.expect(lambda x: np.power(x-m, 4), arg)
npt.assert_allclose(m4e, (k + 3.) * np.power(v, 2), atol=1e-5, rtol=1e-5,
err_msg=msg + ' - kurtosis')
elif not np.isposinf(k):
npt.assert_(np.isnan(k))
def check_entropy(distfn, arg, msg):
ent = distfn.entropy(*arg)
npt.assert_(not np.isnan(ent), msg + 'test Entropy is nan')
def check_private_entropy(distfn, args, superclass):
# compare a generic _entropy with the distribution-specific implementation
npt.assert_allclose(distfn._entropy(*args),
superclass._entropy(distfn, *args))
def check_entropy_vect_scale(distfn, arg):
# check 2-d
sc = np.asarray([[1, 2], [3, 4]])
v_ent = distfn.entropy(*arg, scale=sc)
s_ent = [distfn.entropy(*arg, scale=s) for s in sc.ravel()]
s_ent = np.asarray(s_ent).reshape(v_ent.shape)
assert_allclose(v_ent, s_ent, atol=1e-14)
# check invalid value, check cast
sc = [1, 2, -3]
v_ent = distfn.entropy(*arg, scale=sc)
s_ent = [distfn.entropy(*arg, scale=s) for s in sc]
s_ent = np.asarray(s_ent).reshape(v_ent.shape)
assert_allclose(v_ent, s_ent, atol=1e-14)
def check_edge_support(distfn, args):
# Make sure that x=self.a and self.b are handled correctly.
x = distfn.support(*args)
if isinstance(distfn, stats.rv_discrete):
x = x[0]-1, x[1]
npt.assert_equal(distfn.cdf(x, *args), [0.0, 1.0])
npt.assert_equal(distfn.sf(x, *args), [1.0, 0.0])
if distfn.name not in ('skellam', 'dlaplace'):
# with a = -inf, log(0) generates warnings
npt.assert_equal(distfn.logcdf(x, *args), [-np.inf, 0.0])
npt.assert_equal(distfn.logsf(x, *args), [0.0, -np.inf])
npt.assert_equal(distfn.ppf([0.0, 1.0], *args), x)
npt.assert_equal(distfn.isf([0.0, 1.0], *args), x[::-1])
# out-of-bounds for isf & ppf
npt.assert_(np.isnan(distfn.isf([-1, 2], *args)).all())
npt.assert_(np.isnan(distfn.ppf([-1, 2], *args)).all())
def check_named_args(distfn, x, shape_args, defaults, meths):
## Check calling w/ named arguments.
# check consistency of shapes, numargs and _parse signature
signature = _getfullargspec(distfn._parse_args)
npt.assert_(signature.varargs is None)
npt.assert_(signature.varkw is None)
npt.assert_(not signature.kwonlyargs)
npt.assert_(list(signature.defaults) == list(defaults))
shape_argnames = signature.args[:-len(defaults)] # a, b, loc=0, scale=1
if distfn.shapes:
shapes_ = distfn.shapes.replace(',', ' ').split()
else:
shapes_ = ''
npt.assert_(len(shapes_) == distfn.numargs)
npt.assert_(len(shapes_) == len(shape_argnames))
# check calling w/ named arguments
shape_args = list(shape_args)
vals = [meth(x, *shape_args) for meth in meths]
npt.assert_(np.all(np.isfinite(vals)))
names, a, k = shape_argnames[:], shape_args[:], {}
while names:
k.update({names.pop(): a.pop()})
v = [meth(x, *a, **k) for meth in meths]
npt.assert_array_equal(vals, v)
if 'n' not in k.keys():
# `n` is first parameter of moment(), so can't be used as named arg
npt.assert_equal(distfn.moment(1, *a, **k),
distfn.moment(1, *shape_args))
# unknown arguments should not go through:
k.update({'kaboom': 42})
assert_raises(TypeError, distfn.cdf, x, **k)
def check_random_state_property(distfn, args):
# check the random_state attribute of a distribution *instance*
# This test fiddles with distfn.random_state. This breaks other tests,
# hence need to save it and then restore.
rndm = distfn.random_state
# baseline: this relies on the global state
np.random.seed(1234)
distfn.random_state = None
r0 = distfn.rvs(*args, size=8)
# use an explicit instance-level random_state
distfn.random_state = 1234
r1 = distfn.rvs(*args, size=8)
npt.assert_equal(r0, r1)
distfn.random_state = np.random.RandomState(1234)
r2 = distfn.rvs(*args, size=8)
npt.assert_equal(r0, r2)
# check that np.random.Generator can be used (numpy >= 1.17)
if hasattr(np.random, 'default_rng'):
# obtain a np.random.Generator object
rng = np.random.default_rng(1234)
distfn.rvs(*args, size=1, random_state=rng)
# can override the instance-level random_state for an individual .rvs call
distfn.random_state = 2
orig_state = distfn.random_state.get_state()
r3 = distfn.rvs(*args, size=8, random_state=np.random.RandomState(1234))
npt.assert_equal(r0, r3)
# ... and that does not alter the instance-level random_state!
npt.assert_equal(distfn.random_state.get_state(), orig_state)
# finally, restore the random_state
distfn.random_state = rndm
def check_meth_dtype(distfn, arg, meths):
q0 = [0.25, 0.5, 0.75]
x0 = distfn.ppf(q0, *arg)
x_cast = [x0.astype(tp) for tp in
(np.int_, np.float16, np.float32, np.float64)]
for x in x_cast:
# casting may have clipped the values, exclude those
distfn._argcheck(*arg)
x = x[(distfn.a < x) & (x < distfn.b)]
for meth in meths:
val = meth(x, *arg)
npt.assert_(val.dtype == np.float_)
def check_ppf_dtype(distfn, arg):
q0 = np.asarray([0.25, 0.5, 0.75])
q_cast = [q0.astype(tp) for tp in (np.float16, np.float32, np.float64)]
for q in q_cast:
for meth in [distfn.ppf, distfn.isf]:
val = meth(q, *arg)
npt.assert_(val.dtype == np.float_)
def check_cmplx_deriv(distfn, arg):
# Distributions allow complex arguments.
def deriv(f, x, *arg):
x = np.asarray(x)
h = 1e-10
return (f(x + h*1j, *arg)/h).imag
x0 = distfn.ppf([0.25, 0.51, 0.75], *arg)
x_cast = [x0.astype(tp) for tp in
(np.int_, np.float16, np.float32, np.float64)]
for x in x_cast:
# casting may have clipped the values, exclude those
distfn._argcheck(*arg)
x = x[(distfn.a < x) & (x < distfn.b)]
pdf, cdf, sf = distfn.pdf(x, *arg), distfn.cdf(x, *arg), distfn.sf(x, *arg)
assert_allclose(deriv(distfn.cdf, x, *arg), pdf, rtol=1e-5)
assert_allclose(deriv(distfn.logcdf, x, *arg), pdf/cdf, rtol=1e-5)
assert_allclose(deriv(distfn.sf, x, *arg), -pdf, rtol=1e-5)
assert_allclose(deriv(distfn.logsf, x, *arg), -pdf/sf, rtol=1e-5)
assert_allclose(deriv(distfn.logpdf, x, *arg),
deriv(distfn.pdf, x, *arg) / distfn.pdf(x, *arg),
rtol=1e-5)
def check_pickling(distfn, args):
# check that a distribution instance pickles and unpickles
# pay special attention to the random_state property
# save the random_state (restore later)
rndm = distfn.random_state
distfn.random_state = 1234
distfn.rvs(*args, size=8)
s = pickle.dumps(distfn)
r0 = distfn.rvs(*args, size=8)
unpickled = pickle.loads(s)
r1 = unpickled.rvs(*args, size=8)
npt.assert_equal(r0, r1)
# also smoke test some methods
medians = [distfn.ppf(0.5, *args), unpickled.ppf(0.5, *args)]
npt.assert_equal(medians[0], medians[1])
npt.assert_equal(distfn.cdf(medians[0], *args),
unpickled.cdf(medians[1], *args))
# restore the random_state
distfn.random_state = rndm
def check_freezing(distfn, args):
# regression test for gh-11089: freezing a distribution fails
# if loc and/or scale are specified
if isinstance(distfn, stats.rv_continuous):
locscale = {'loc': 1, 'scale': 2}
else:
locscale = {'loc': 1}
rv = distfn(*args, **locscale)
assert rv.a == distfn(*args).a
assert rv.b == distfn(*args).b
def check_rvs_broadcast(distfunc, distname, allargs, shape, shape_only, otype):
np.random.seed(123)
with suppress_warnings() as sup:
# frechet_l and frechet_r are deprecated, so all their
# methods generate DeprecationWarnings.
sup.filter(category=DeprecationWarning, message=".*frechet_")
sample = distfunc.rvs(*allargs)
assert_equal(sample.shape, shape, "%s: rvs failed to broadcast" % distname)
if not shape_only:
rvs = np.vectorize(lambda *allargs: distfunc.rvs(*allargs), otypes=otype)
np.random.seed(123)
expected = rvs(*allargs)
assert_allclose(sample, expected, rtol=1e-15)

View file

@ -0,0 +1,108 @@
NIST/ITL StRD
Dataset Name: AtmWtAg (AtmWtAg.dat)
File Format: ASCII
Certified Values (lines 41 to 47)
Data (lines 61 to 108)
Procedure: Analysis of Variance
Reference: Powell, L.J., Murphy, T.J. and Gramlich, J.W. (1982).
"The Absolute Isotopic Abundance & Atomic Weight
of a Reference Sample of Silver".
NBS Journal of Research, 87, pp. 9-19.
Data: 1 Factor
2 Treatments
24 Replicates/Cell
48 Observations
7 Constant Leading Digits
Average Level of Difficulty
Observed Data
Model: 3 Parameters (mu, tau_1, tau_2)
y_{ij} = mu + tau_i + epsilon_{ij}
Certified Values:
Source of Sums of Mean
Variation df Squares Squares F Statistic
Between Instrument 1 3.63834187500000E-09 3.63834187500000E-09 1.59467335677930E+01
Within Instrument 46 1.04951729166667E-08 2.28155932971014E-10
Certified R-Squared 2.57426544538321E-01
Certified Residual
Standard Deviation 1.51048314446410E-05
Data: Instrument AgWt
1 107.8681568
1 107.8681465
1 107.8681572
1 107.8681785
1 107.8681446
1 107.8681903
1 107.8681526
1 107.8681494
1 107.8681616
1 107.8681587
1 107.8681519
1 107.8681486
1 107.8681419
1 107.8681569
1 107.8681508
1 107.8681672
1 107.8681385
1 107.8681518
1 107.8681662
1 107.8681424
1 107.8681360
1 107.8681333
1 107.8681610
1 107.8681477
2 107.8681079
2 107.8681344
2 107.8681513
2 107.8681197
2 107.8681604
2 107.8681385
2 107.8681642
2 107.8681365
2 107.8681151
2 107.8681082
2 107.8681517
2 107.8681448
2 107.8681198
2 107.8681482
2 107.8681334
2 107.8681609
2 107.8681101
2 107.8681512
2 107.8681469
2 107.8681360
2 107.8681254
2 107.8681261
2 107.8681450
2 107.8681368

View file

@ -0,0 +1,85 @@
NIST/ITL StRD
Dataset Name: SiRstv (SiRstv.dat)
File Format: ASCII
Certified Values (lines 41 to 47)
Data (lines 61 to 85)
Procedure: Analysis of Variance
Reference: Ehrstein, James and Croarkin, M. Carroll.
Unpublished NIST dataset.
Data: 1 Factor
5 Treatments
5 Replicates/Cell
25 Observations
3 Constant Leading Digits
Lower Level of Difficulty
Observed Data
Model: 6 Parameters (mu,tau_1, ... , tau_5)
y_{ij} = mu + tau_i + epsilon_{ij}
Certified Values:
Source of Sums of Mean
Variation df Squares Squares F Statistic
Between Instrument 4 5.11462616000000E-02 1.27865654000000E-02 1.18046237440255E+00
Within Instrument 20 2.16636560000000E-01 1.08318280000000E-02
Certified R-Squared 1.90999039051129E-01
Certified Residual
Standard Deviation 1.04076068334656E-01
Data: Instrument Resistance
1 196.3052
1 196.1240
1 196.1890
1 196.2569
1 196.3403
2 196.3042
2 196.3825
2 196.1669
2 196.3257
2 196.0422
3 196.1303
3 196.2005
3 196.2889
3 196.0343
3 196.1811
4 196.2795
4 196.1748
4 196.1494
4 196.1485
4 195.9885
5 196.2119
5 196.1051
5 196.1850
5 196.0052
5 196.2090

View file

@ -0,0 +1,249 @@
NIST/ITL StRD
Dataset Name: SmLs01 (SmLs01.dat)
File Format: ASCII
Certified Values (lines 41 to 47)
Data (lines 61 to 249)
Procedure: Analysis of Variance
Reference: Simon, Stephen D. and Lesage, James P. (1989).
"Assessing the Accuracy of ANOVA Calculations in
Statistical Software".
Computational Statistics & Data Analysis, 8, pp. 325-332.
Data: 1 Factor
9 Treatments
21 Replicates/Cell
189 Observations
1 Constant Leading Digit
Lower Level of Difficulty
Generated Data
Model: 10 Parameters (mu,tau_1, ... , tau_9)
y_{ij} = mu + tau_i + epsilon_{ij}
Certified Values:
Source of Sums of Mean
Variation df Squares Squares F Statistic
Between Treatment 8 1.68000000000000E+00 2.10000000000000E-01 2.10000000000000E+01
Within Treatment 180 1.80000000000000E+00 1.00000000000000E-02
Certified R-Squared 4.82758620689655E-01
Certified Residual
Standard Deviation 1.00000000000000E-01
Data: Treatment Response
1 1.4
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
2 1.3
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
3 1.5
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
4 1.3
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
5 1.5
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
6 1.3
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
7 1.5
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
8 1.3
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
9 1.5
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,249 @@
NIST/ITL StRD
Dataset Name: SmLs04 (SmLs04.dat)
File Format: ASCII
Certified Values (lines 41 to 47)
Data (lines 61 to 249)
Procedure: Analysis of Variance
Reference: Simon, Stephen D. and Lesage, James P. (1989).
"Assessing the Accuracy of ANOVA Calculations in
Statistical Software".
Computational Statistics & Data Analysis, 8, pp. 325-332.
Data: 1 Factor
9 Treatments
21 Replicates/Cell
189 Observations
7 Constant Leading Digits
Average Level of Difficulty
Generated Data
Model: 10 Parameters (mu,tau_1, ... , tau_9)
y_{ij} = mu + tau_i + epsilon_{ij}
Certified Values:
Source of Sums of Mean
Variation df Squares Squares F Statistic
Between Treatment 8 1.68000000000000E+00 2.10000000000000E-01 2.10000000000000E+01
Within Treatment 180 1.80000000000000E+00 1.00000000000000E-02
Certified R-Squared 4.82758620689655E-01
Certified Residual
Standard Deviation 1.00000000000000E-01
Data: Treatment Response
1 1000000.4
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
2 1000000.3
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
3 1000000.5
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
4 1000000.3
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
5 1000000.5
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
6 1000000.3
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
7 1000000.5
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
8 1000000.3
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
9 1000000.5
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,249 @@
NIST/ITL StRD
Dataset Name: SmLs07 (SmLs07.dat)
File Format: ASCII
Certified Values (lines 41 to 47)
Data (lines 61 to 249)
Procedure: Analysis of Variance
Reference: Simon, Stephen D. and Lesage, James P. (1989).
"Assessing the Accuracy of ANOVA Calculations in
Statistical Software".
Computational Statistics & Data Analysis, 8, pp. 325-332.
Data: 1 Factor
9 Treatments
21 Replicates/Cell
189 Observations
13 Constant Leading Digits
Higher Level of Difficulty
Generated Data
Model: 10 Parameters (mu,tau_1, ... , tau_9)
y_{ij} = mu + tau_i + epsilon_{ij}
Certified Values:
Source of Sums of Mean
Variation df Squares Squares F Statistic
Between Treatment 8 1.68000000000000E+00 2.10000000000000E-01 2.10000000000000E+01
Within Treatment 180 1.80000000000000E+00 1.00000000000000E-02
Certified R-Squared 4.82758620689655E-01
Certified Residual
Standard Deviation 1.00000000000000E-01
Data: Treatment Response
1 1000000000000.4
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
2 1000000000000.3
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
3 1000000000000.5
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
4 1000000000000.3
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
5 1000000000000.5
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
6 1000000000000.3
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
7 1000000000000.5
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
8 1000000000000.3
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
9 1000000000000.5
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,97 @@
NIST/ITL StRD
Dataset Name: Norris (Norris.dat)
File Format: ASCII
Certified Values (lines 31 to 46)
Data (lines 61 to 96)
Procedure: Linear Least Squares Regression
Reference: Norris, J., NIST.
Calibration of Ozone Monitors.
Data: 1 Response Variable (y)
1 Predictor Variable (x)
36 Observations
Lower Level of Difficulty
Observed Data
Model: Linear Class
2 Parameters (B0,B1)
y = B0 + B1*x + e
Certified Regression Statistics
Standard Deviation
Parameter Estimate of Estimate
B0 -0.262323073774029 0.232818234301152
B1 1.00211681802045 0.429796848199937E-03
Residual
Standard Deviation 0.884796396144373
R-Squared 0.999993745883712
Certified Analysis of Variance Table
Source of Degrees of Sums of Mean
Variation Freedom Squares Squares F Statistic
Regression 1 4255954.13232369 4255954.13232369 5436385.54079785
Residual 34 26.6173985294224 0.782864662630069
Data: y x
0.1 0.2
338.8 337.4
118.1 118.2
888.0 884.6
9.2 10.1
228.1 226.5
668.5 666.3
998.5 996.3
449.1 448.6
778.9 777.0
559.2 558.2
0.3 0.4
0.1 0.6
778.1 775.5
668.8 666.9
339.3 338.0
448.9 447.5
10.8 11.6
557.7 556.0
228.3 228.1
998.0 995.8
888.8 887.6
119.6 120.2
0.3 0.3
0.6 0.3
557.6 556.8
339.3 339.1
888.0 887.2
998.5 999.0
778.9 779.0
10.2 11.1
117.6 118.3
228.9 229.2
668.4 669.1
449.2 448.9
0.2 0.5

View file

@ -0,0 +1,488 @@
import numpy as np
from numpy.testing import assert_allclose
from pytest import raises as assert_raises
from scipy.stats import (binned_statistic, binned_statistic_2d,
binned_statistic_dd)
from scipy._lib._util import check_random_state
from .common_tests import check_named_results
class TestBinnedStatistic(object):
@classmethod
def setup_class(cls):
rng = check_random_state(9865)
cls.x = rng.uniform(size=100)
cls.y = rng.uniform(size=100)
cls.v = rng.uniform(size=100)
cls.X = rng.uniform(size=(100, 3))
cls.w = rng.uniform(size=100)
cls.u = rng.uniform(size=100) + 1e6
def test_1d_count(self):
x = self.x
v = self.v
count1, edges1, bc = binned_statistic(x, v, 'count', bins=10)
count2, edges2 = np.histogram(x, bins=10)
assert_allclose(count1, count2)
assert_allclose(edges1, edges2)
def test_gh5927(self):
# smoke test for gh5927 - binned_statistic was using `is` for string
# comparison
x = self.x
v = self.v
statistics = [u'mean', u'median', u'count', u'sum']
for statistic in statistics:
binned_statistic(x, v, statistic, bins=10)
def test_big_number_std(self):
# tests for numerical stability of std calculation
# see issue gh-10126 for more
x = self.x
u = self.u
stat1, edges1, bc = binned_statistic(x, u, 'std', bins=10)
stat2, edges2, bc = binned_statistic(x, u, np.std, bins=10)
assert_allclose(stat1, stat2)
def test_non_finite_inputs_and_int_bins(self):
# if either `values` or `sample` contain np.inf or np.nan throw
# see issue gh-9010 for more
x = self.x
u = self.u
orig = u[0]
u[0] = np.inf
assert_raises(ValueError, binned_statistic, u, x, 'std', bins=10)
# need to test for non-python specific ints, e.g. np.int8, np.int64
assert_raises(ValueError, binned_statistic, u, x, 'std',
bins=np.int64(10))
u[0] = np.nan
assert_raises(ValueError, binned_statistic, u, x, 'count', bins=10)
# replace original value, u belongs the class
u[0] = orig
def test_1d_result_attributes(self):
x = self.x
v = self.v
res = binned_statistic(x, v, 'count', bins=10)
attributes = ('statistic', 'bin_edges', 'binnumber')
check_named_results(res, attributes)
def test_1d_sum(self):
x = self.x
v = self.v
sum1, edges1, bc = binned_statistic(x, v, 'sum', bins=10)
sum2, edges2 = np.histogram(x, bins=10, weights=v)
assert_allclose(sum1, sum2)
assert_allclose(edges1, edges2)
def test_1d_mean(self):
x = self.x
v = self.v
stat1, edges1, bc = binned_statistic(x, v, 'mean', bins=10)
stat2, edges2, bc = binned_statistic(x, v, np.mean, bins=10)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_1d_std(self):
x = self.x
v = self.v
stat1, edges1, bc = binned_statistic(x, v, 'std', bins=10)
stat2, edges2, bc = binned_statistic(x, v, np.std, bins=10)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_1d_min(self):
x = self.x
v = self.v
stat1, edges1, bc = binned_statistic(x, v, 'min', bins=10)
stat2, edges2, bc = binned_statistic(x, v, np.min, bins=10)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_1d_max(self):
x = self.x
v = self.v
stat1, edges1, bc = binned_statistic(x, v, 'max', bins=10)
stat2, edges2, bc = binned_statistic(x, v, np.max, bins=10)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_1d_median(self):
x = self.x
v = self.v
stat1, edges1, bc = binned_statistic(x, v, 'median', bins=10)
stat2, edges2, bc = binned_statistic(x, v, np.median, bins=10)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_1d_bincode(self):
x = self.x[:20]
v = self.v[:20]
count1, edges1, bc = binned_statistic(x, v, 'count', bins=3)
bc2 = np.array([3, 2, 1, 3, 2, 3, 3, 3, 3, 1, 1, 3, 3, 1, 2, 3, 1,
1, 2, 1])
bcount = [(bc == i).sum() for i in np.unique(bc)]
assert_allclose(bc, bc2)
assert_allclose(bcount, count1)
def test_1d_range_keyword(self):
# Regression test for gh-3063, range can be (min, max) or [(min, max)]
np.random.seed(9865)
x = np.arange(30)
data = np.random.random(30)
mean, bins, _ = binned_statistic(x[:15], data[:15])
mean_range, bins_range, _ = binned_statistic(x, data, range=[(0, 14)])
mean_range2, bins_range2, _ = binned_statistic(x, data, range=(0, 14))
assert_allclose(mean, mean_range)
assert_allclose(bins, bins_range)
assert_allclose(mean, mean_range2)
assert_allclose(bins, bins_range2)
def test_1d_multi_values(self):
x = self.x
v = self.v
w = self.w
stat1v, edges1v, bc1v = binned_statistic(x, v, 'mean', bins=10)
stat1w, edges1w, bc1w = binned_statistic(x, w, 'mean', bins=10)
stat2, edges2, bc2 = binned_statistic(x, [v, w], 'mean', bins=10)
assert_allclose(stat2[0], stat1v)
assert_allclose(stat2[1], stat1w)
assert_allclose(edges1v, edges2)
assert_allclose(bc1v, bc2)
def test_2d_count(self):
x = self.x
y = self.y
v = self.v
count1, binx1, biny1, bc = binned_statistic_2d(
x, y, v, 'count', bins=5)
count2, binx2, biny2 = np.histogram2d(x, y, bins=5)
assert_allclose(count1, count2)
assert_allclose(binx1, binx2)
assert_allclose(biny1, biny2)
def test_2d_result_attributes(self):
x = self.x
y = self.y
v = self.v
res = binned_statistic_2d(x, y, v, 'count', bins=5)
attributes = ('statistic', 'x_edge', 'y_edge', 'binnumber')
check_named_results(res, attributes)
def test_2d_sum(self):
x = self.x
y = self.y
v = self.v
sum1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'sum', bins=5)
sum2, binx2, biny2 = np.histogram2d(x, y, bins=5, weights=v)
assert_allclose(sum1, sum2)
assert_allclose(binx1, binx2)
assert_allclose(biny1, biny2)
def test_2d_mean(self):
x = self.x
y = self.y
v = self.v
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'mean', bins=5)
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)
assert_allclose(stat1, stat2)
assert_allclose(binx1, binx2)
assert_allclose(biny1, biny2)
def test_2d_mean_unicode(self):
x = self.x
y = self.y
v = self.v
stat1, binx1, biny1, bc = binned_statistic_2d(
x, y, v, 'mean', bins=5)
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)
assert_allclose(stat1, stat2)
assert_allclose(binx1, binx2)
assert_allclose(biny1, biny2)
def test_2d_std(self):
x = self.x
y = self.y
v = self.v
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'std', bins=5)
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.std, bins=5)
assert_allclose(stat1, stat2)
assert_allclose(binx1, binx2)
assert_allclose(biny1, biny2)
def test_2d_min(self):
x = self.x
y = self.y
v = self.v
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'min', bins=5)
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.min, bins=5)
assert_allclose(stat1, stat2)
assert_allclose(binx1, binx2)
assert_allclose(biny1, biny2)
def test_2d_max(self):
x = self.x
y = self.y
v = self.v
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'max', bins=5)
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.max, bins=5)
assert_allclose(stat1, stat2)
assert_allclose(binx1, binx2)
assert_allclose(biny1, biny2)
def test_2d_median(self):
x = self.x
y = self.y
v = self.v
stat1, binx1, biny1, bc = binned_statistic_2d(
x, y, v, 'median', bins=5)
stat2, binx2, biny2, bc = binned_statistic_2d(
x, y, v, np.median, bins=5)
assert_allclose(stat1, stat2)
assert_allclose(binx1, binx2)
assert_allclose(biny1, biny2)
def test_2d_bincode(self):
x = self.x[:20]
y = self.y[:20]
v = self.v[:20]
count1, binx1, biny1, bc = binned_statistic_2d(
x, y, v, 'count', bins=3)
bc2 = np.array([17, 11, 6, 16, 11, 17, 18, 17, 17, 7, 6, 18, 16,
6, 11, 16, 6, 6, 11, 8])
bcount = [(bc == i).sum() for i in np.unique(bc)]
assert_allclose(bc, bc2)
count1adj = count1[count1.nonzero()]
assert_allclose(bcount, count1adj)
def test_2d_multi_values(self):
x = self.x
y = self.y
v = self.v
w = self.w
stat1v, binx1v, biny1v, bc1v = binned_statistic_2d(
x, y, v, 'mean', bins=8)
stat1w, binx1w, biny1w, bc1w = binned_statistic_2d(
x, y, w, 'mean', bins=8)
stat2, binx2, biny2, bc2 = binned_statistic_2d(
x, y, [v, w], 'mean', bins=8)
assert_allclose(stat2[0], stat1v)
assert_allclose(stat2[1], stat1w)
assert_allclose(binx1v, binx2)
assert_allclose(biny1w, biny2)
assert_allclose(bc1v, bc2)
def test_2d_binnumbers_unraveled(self):
x = self.x
y = self.y
v = self.v
stat, edgesx, bcx = binned_statistic(x, v, 'mean', bins=20)
stat, edgesy, bcy = binned_statistic(y, v, 'mean', bins=10)
stat2, edgesx2, edgesy2, bc2 = binned_statistic_2d(
x, y, v, 'mean', bins=(20, 10), expand_binnumbers=True)
bcx3 = np.searchsorted(edgesx, x, side='right')
bcy3 = np.searchsorted(edgesy, y, side='right')
# `numpy.searchsorted` is non-inclusive on right-edge, compensate
bcx3[x == x.max()] -= 1
bcy3[y == y.max()] -= 1
assert_allclose(bcx, bc2[0])
assert_allclose(bcy, bc2[1])
assert_allclose(bcx3, bc2[0])
assert_allclose(bcy3, bc2[1])
def test_dd_count(self):
X = self.X
v = self.v
count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
count2, edges2 = np.histogramdd(X, bins=3)
assert_allclose(count1, count2)
assert_allclose(edges1, edges2)
def test_dd_result_attributes(self):
X = self.X
v = self.v
res = binned_statistic_dd(X, v, 'count', bins=3)
attributes = ('statistic', 'bin_edges', 'binnumber')
check_named_results(res, attributes)
def test_dd_sum(self):
X = self.X
v = self.v
sum1, edges1, bc = binned_statistic_dd(X, v, 'sum', bins=3)
sum2, edges2 = np.histogramdd(X, bins=3, weights=v)
assert_allclose(sum1, sum2)
assert_allclose(edges1, edges2)
def test_dd_mean(self):
X = self.X
v = self.v
stat1, edges1, bc = binned_statistic_dd(X, v, 'mean', bins=3)
stat2, edges2, bc = binned_statistic_dd(X, v, np.mean, bins=3)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_dd_std(self):
X = self.X
v = self.v
stat1, edges1, bc = binned_statistic_dd(X, v, 'std', bins=3)
stat2, edges2, bc = binned_statistic_dd(X, v, np.std, bins=3)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_dd_min(self):
X = self.X
v = self.v
stat1, edges1, bc = binned_statistic_dd(X, v, 'min', bins=3)
stat2, edges2, bc = binned_statistic_dd(X, v, np.min, bins=3)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_dd_max(self):
X = self.X
v = self.v
stat1, edges1, bc = binned_statistic_dd(X, v, 'max', bins=3)
stat2, edges2, bc = binned_statistic_dd(X, v, np.max, bins=3)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_dd_median(self):
X = self.X
v = self.v
stat1, edges1, bc = binned_statistic_dd(X, v, 'median', bins=3)
stat2, edges2, bc = binned_statistic_dd(X, v, np.median, bins=3)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_dd_bincode(self):
X = self.X[:20]
v = self.v[:20]
count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
bc2 = np.array([63, 33, 86, 83, 88, 67, 57, 33, 42, 41, 82, 83, 92,
32, 36, 91, 43, 87, 81, 81])
bcount = [(bc == i).sum() for i in np.unique(bc)]
assert_allclose(bc, bc2)
count1adj = count1[count1.nonzero()]
assert_allclose(bcount, count1adj)
def test_dd_multi_values(self):
X = self.X
v = self.v
w = self.w
stat1v, edges1v, bc1v = binned_statistic_dd(X, v, np.std, bins=8)
stat1w, edges1w, bc1w = binned_statistic_dd(X, w, np.std, bins=8)
stat2, edges2, bc2 = binned_statistic_dd(X, [v, w], np.std, bins=8)
assert_allclose(stat2[0], stat1v)
assert_allclose(stat2[1], stat1w)
assert_allclose(edges1v, edges2)
assert_allclose(edges1w, edges2)
assert_allclose(bc1v, bc2)
def test_dd_binnumbers_unraveled(self):
X = self.X
v = self.v
stat, edgesx, bcx = binned_statistic(X[:, 0], v, 'mean', bins=15)
stat, edgesy, bcy = binned_statistic(X[:, 1], v, 'mean', bins=20)
stat, edgesz, bcz = binned_statistic(X[:, 2], v, 'mean', bins=10)
stat2, edges2, bc2 = binned_statistic_dd(
X, v, 'mean', bins=(15, 20, 10), expand_binnumbers=True)
assert_allclose(bcx, bc2[0])
assert_allclose(bcy, bc2[1])
assert_allclose(bcz, bc2[2])
def test_dd_binned_statistic_result(self):
# NOTE: tests the reuse of bin_edges from previous call
x = np.random.random((10000, 3))
v = np.random.random((10000))
bins = np.linspace(0, 1, 10)
bins = (bins, bins, bins)
result = binned_statistic_dd(x, v, 'mean', bins=bins)
stat = result.statistic
result = binned_statistic_dd(x, v, 'mean',
binned_statistic_result=result)
stat2 = result.statistic
assert_allclose(stat, stat2)
def test_dd_zero_dedges(self):
x = np.random.random((10000, 3))
v = np.random.random((10000))
bins = np.linspace(0, 1, 10)
bins = np.append(bins, 1)
bins = (bins, bins, bins)
with assert_raises(ValueError, match='difference is numerically 0'):
binned_statistic_dd(x, v, 'mean', bins=bins)

View file

@ -0,0 +1,198 @@
import numpy as np
from numpy.testing import (assert_equal, assert_array_equal,
assert_array_almost_equal, assert_approx_equal, assert_allclose)
from pytest import raises as assert_raises
from scipy.special import xlogy
from scipy.stats.contingency import margins, expected_freq, chi2_contingency
def test_margins():
a = np.array([1])
m = margins(a)
assert_equal(len(m), 1)
m0 = m[0]
assert_array_equal(m0, np.array([1]))
a = np.array([[1]])
m0, m1 = margins(a)
expected0 = np.array([[1]])
expected1 = np.array([[1]])
assert_array_equal(m0, expected0)
assert_array_equal(m1, expected1)
a = np.arange(12).reshape(2, 6)
m0, m1 = margins(a)
expected0 = np.array([[15], [51]])
expected1 = np.array([[6, 8, 10, 12, 14, 16]])
assert_array_equal(m0, expected0)
assert_array_equal(m1, expected1)
a = np.arange(24).reshape(2, 3, 4)
m0, m1, m2 = margins(a)
expected0 = np.array([[[66]], [[210]]])
expected1 = np.array([[[60], [92], [124]]])
expected2 = np.array([[[60, 66, 72, 78]]])
assert_array_equal(m0, expected0)
assert_array_equal(m1, expected1)
assert_array_equal(m2, expected2)
def test_expected_freq():
assert_array_equal(expected_freq([1]), np.array([1.0]))
observed = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]])
e = expected_freq(observed)
assert_array_equal(e, np.ones_like(observed))
observed = np.array([[10, 10, 20], [20, 20, 20]])
e = expected_freq(observed)
correct = np.array([[12., 12., 16.], [18., 18., 24.]])
assert_array_almost_equal(e, correct)
def test_chi2_contingency_trivial():
# Some very simple tests for chi2_contingency.
# A trivial case
obs = np.array([[1, 2], [1, 2]])
chi2, p, dof, expected = chi2_contingency(obs, correction=False)
assert_equal(chi2, 0.0)
assert_equal(p, 1.0)
assert_equal(dof, 1)
assert_array_equal(obs, expected)
# A *really* trivial case: 1-D data.
obs = np.array([1, 2, 3])
chi2, p, dof, expected = chi2_contingency(obs, correction=False)
assert_equal(chi2, 0.0)
assert_equal(p, 1.0)
assert_equal(dof, 0)
assert_array_equal(obs, expected)
def test_chi2_contingency_R():
# Some test cases that were computed independently, using R.
# Rcode = \
# """
# # Data vector.
# data <- c(
# 12, 34, 23, 4, 47, 11,
# 35, 31, 11, 34, 10, 18,
# 12, 32, 9, 18, 13, 19,
# 12, 12, 14, 9, 33, 25
# )
#
# # Create factor tags:r=rows, c=columns, t=tiers
# r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
# c <- factor(gl(3, 1, 2*3*4, labels=c("c1", "c2", "c3")))
# t <- factor(gl(2, 3, 2*3*4, labels=c("t1", "t2")))
#
# # 3-way Chi squared test of independence
# s = summary(xtabs(data~r+c+t))
# print(s)
# """
# Routput = \
# """
# Call: xtabs(formula = data ~ r + c + t)
# Number of cases in table: 478
# Number of factors: 3
# Test for independence of all factors:
# Chisq = 102.17, df = 17, p-value = 3.514e-14
# """
obs = np.array(
[[[12, 34, 23],
[35, 31, 11],
[12, 32, 9],
[12, 12, 14]],
[[4, 47, 11],
[34, 10, 18],
[18, 13, 19],
[9, 33, 25]]])
chi2, p, dof, expected = chi2_contingency(obs)
assert_approx_equal(chi2, 102.17, significant=5)
assert_approx_equal(p, 3.514e-14, significant=4)
assert_equal(dof, 17)
# Rcode = \
# """
# # Data vector.
# data <- c(
# #
# 12, 17,
# 11, 16,
# #
# 11, 12,
# 15, 16,
# #
# 23, 15,
# 30, 22,
# #
# 14, 17,
# 15, 16
# )
#
# # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
# r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2")))
# c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2")))
# d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2")))
# t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2")))
#
# # 4-way Chi squared test of independence
# s = summary(xtabs(data~r+c+d+t))
# print(s)
# """
# Routput = \
# """
# Call: xtabs(formula = data ~ r + c + d + t)
# Number of cases in table: 262
# Number of factors: 4
# Test for independence of all factors:
# Chisq = 8.758, df = 11, p-value = 0.6442
# """
obs = np.array(
[[[[12, 17],
[11, 16]],
[[11, 12],
[15, 16]]],
[[[23, 15],
[30, 22]],
[[14, 17],
[15, 16]]]])
chi2, p, dof, expected = chi2_contingency(obs)
assert_approx_equal(chi2, 8.758, significant=4)
assert_approx_equal(p, 0.6442, significant=4)
assert_equal(dof, 11)
def test_chi2_contingency_g():
c = np.array([[15, 60], [15, 90]])
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=False)
assert_allclose(g, 2*xlogy(c, c/e).sum())
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=True)
c_corr = c + np.array([[-0.5, 0.5], [0.5, -0.5]])
assert_allclose(g, 2*xlogy(c_corr, c_corr/e).sum())
c = np.array([[10, 12, 10], [12, 10, 10]])
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood')
assert_allclose(g, 2*xlogy(c, c/e).sum())
def test_chi2_contingency_bad_args():
# Test that "bad" inputs raise a ValueError.
# Negative value in the array of observed frequencies.
obs = np.array([[-1, 10], [1, 2]])
assert_raises(ValueError, chi2_contingency, obs)
# The zeros in this will result in zeros in the array
# of expected frequencies.
obs = np.array([[0, 1], [0, 1]])
assert_raises(ValueError, chi2_contingency, obs)
# A degenerate case: `observed` has size 0.
obs = np.empty((0, 8))
assert_raises(ValueError, chi2_contingency, obs)

View file

@ -0,0 +1,664 @@
import numpy as np
import numpy.testing as npt
import pytest
from pytest import raises as assert_raises
from scipy.integrate import IntegrationWarning
from scipy import stats
from scipy.special import betainc
from. common_tests import (check_normalization, check_moment, check_mean_expect,
check_var_expect, check_skew_expect,
check_kurt_expect, check_entropy,
check_private_entropy, check_entropy_vect_scale,
check_edge_support, check_named_args,
check_random_state_property,
check_meth_dtype, check_ppf_dtype, check_cmplx_deriv,
check_pickling, check_rvs_broadcast, check_freezing)
from scipy.stats._distr_params import distcont
"""
Test all continuous distributions.
Parameters were chosen for those distributions that pass the
Kolmogorov-Smirnov test. This provides safe parameters for each
distributions so that we can perform further testing of class methods.
These tests currently check only/mostly for serious errors and exceptions,
not for numerically exact results.
"""
# Note that you need to add new distributions you want tested
# to _distr_params
DECIMAL = 5 # specify the precision of the tests # increased from 0 to 5
# Last three of these fail all around. Need to be checked
distcont_extra = [
['betaprime', (100, 86)],
['fatiguelife', (5,)],
['invweibull', (0.58847112119264788,)],
# burr: sample mean test fails still for c<1
['burr', (0.94839838075366045, 4.3820284068855795)],
# genextreme: sample mean test, sf-logsf test fail
['genextreme', (3.3184017469423535,)],
]
distslow = ['kstwo', 'ksone', 'kappa4', 'gausshyper', 'recipinvgauss',
'genexpon', 'vonmises', 'vonmises_line', 'cosine', 'invweibull',
'powerlognorm', 'johnsonsu', 'kstwobign']
# distslow are sorted by speed (very slow to slow)
# skip check_fit_args (test is slow)
skip_fit_test = ['exponpow', 'exponweib', 'gausshyper', 'genexpon',
'halfgennorm', 'gompertz', 'johnsonsb', 'johnsonsu',
'kappa4', 'ksone', 'kstwo', 'kstwobign', 'mielke', 'ncf', 'nct',
'powerlognorm', 'powernorm', 'recipinvgauss', 'trapz',
'vonmises', 'vonmises_line',
'levy_stable', 'rv_histogram_instance']
# skip check_fit_args_fix (test is slow)
skip_fit_fix_test = ['burr', 'exponpow', 'exponweib',
'gausshyper', 'genexpon', 'halfgennorm',
'gompertz', 'johnsonsb', 'johnsonsu', 'kappa4',
'ksone', 'kstwo', 'kstwobign', 'levy_stable', 'mielke', 'ncf',
'ncx2', 'powerlognorm', 'powernorm', 'rdist',
'recipinvgauss', 'trapz', 'vonmises', 'vonmises_line']
# These distributions fail the complex derivative test below.
# Here 'fail' mean produce wrong results and/or raise exceptions, depending
# on the implementation details of corresponding special functions.
# cf https://github.com/scipy/scipy/pull/4979 for a discussion.
fails_cmplx = set(['beta', 'betaprime', 'chi', 'chi2', 'dgamma', 'dweibull',
'erlang', 'f', 'gamma', 'gausshyper', 'gengamma',
'geninvgauss', 'gennorm', 'genpareto',
'halfgennorm', 'invgamma',
'ksone', 'kstwo', 'kstwobign', 'levy_l', 'loggamma', 'logistic',
'loguniform', 'maxwell', 'nakagami',
'ncf', 'nct', 'ncx2', 'norminvgauss', 'pearson3', 'rdist',
'reciprocal', 'rice', 'skewnorm', 't', 'tukeylambda',
'vonmises', 'vonmises_line', 'rv_histogram_instance'])
_h = np.histogram([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6,
6, 6, 6, 7, 7, 7, 8, 8, 9], bins=8)
histogram_test_instance = stats.rv_histogram(_h)
def cases_test_cont_basic():
for distname, arg in distcont[:] + [(histogram_test_instance, tuple())]:
if distname == 'levy_stable':
continue
elif distname in distslow:
yield pytest.param(distname, arg, marks=pytest.mark.slow)
else:
yield distname, arg
@pytest.mark.parametrize('distname,arg', cases_test_cont_basic())
def test_cont_basic(distname, arg):
# this test skips slow distributions
if distname == 'truncnorm':
pytest.xfail(reason=distname)
try:
distfn = getattr(stats, distname)
except TypeError:
distfn = distname
distname = 'rv_histogram_instance'
np.random.seed(765456)
sn = 500
with npt.suppress_warnings() as sup:
# frechet_l and frechet_r are deprecated, so all their
# methods generate DeprecationWarnings.
sup.filter(category=DeprecationWarning, message=".*frechet_")
rvs = distfn.rvs(size=sn, *arg)
sm = rvs.mean()
sv = rvs.var()
m, v = distfn.stats(*arg)
check_sample_meanvar_(distfn, arg, m, v, sm, sv, sn, distname + 'sample mean test')
check_cdf_ppf(distfn, arg, distname)
check_sf_isf(distfn, arg, distname)
check_pdf(distfn, arg, distname)
check_pdf_logpdf(distfn, arg, distname)
check_pdf_logpdf_at_endpoints(distfn, arg, distname)
check_cdf_logcdf(distfn, arg, distname)
check_sf_logsf(distfn, arg, distname)
check_ppf_broadcast(distfn, arg, distname)
alpha = 0.01
if distname == 'rv_histogram_instance':
check_distribution_rvs(distfn.cdf, arg, alpha, rvs)
elif distname != 'geninvgauss':
# skip kstest for geninvgauss since cdf is too slow; see test for
# rv generation in TestGenInvGauss in test_distributions.py
check_distribution_rvs(distname, arg, alpha, rvs)
locscale_defaults = (0, 1)
meths = [distfn.pdf, distfn.logpdf, distfn.cdf, distfn.logcdf,
distfn.logsf]
# make sure arguments are within support
spec_x = {'frechet_l': -0.5, 'weibull_max': -0.5, 'levy_l': -0.5,
'pareto': 1.5, 'tukeylambda': 0.3,
'rv_histogram_instance': 5.0}
x = spec_x.get(distname, 0.5)
if distname == 'invweibull':
arg = (1,)
elif distname == 'ksone':
arg = (3,)
check_named_args(distfn, x, arg, locscale_defaults, meths)
check_random_state_property(distfn, arg)
check_pickling(distfn, arg)
check_freezing(distfn, arg)
# Entropy
if distname not in ['kstwobign', 'kstwo']:
check_entropy(distfn, arg, distname)
if distfn.numargs == 0:
check_vecentropy(distfn, arg)
if (distfn.__class__._entropy != stats.rv_continuous._entropy
and distname != 'vonmises'):
check_private_entropy(distfn, arg, stats.rv_continuous)
with npt.suppress_warnings() as sup:
sup.filter(IntegrationWarning, "The occurrence of roundoff error")
sup.filter(IntegrationWarning, "Extremely bad integrand")
sup.filter(RuntimeWarning, "invalid value")
check_entropy_vect_scale(distfn, arg)
check_retrieving_support(distfn, arg)
check_edge_support(distfn, arg)
check_meth_dtype(distfn, arg, meths)
check_ppf_dtype(distfn, arg)
if distname not in fails_cmplx:
check_cmplx_deriv(distfn, arg)
if distname != 'truncnorm':
check_ppf_private(distfn, arg, distname)
if distname not in skip_fit_test:
check_fit_args(distfn, arg, rvs[0:200])
if distname not in skip_fit_fix_test:
check_fit_args_fix(distfn, arg, rvs[0:200])
@pytest.mark.parametrize('distname,arg', cases_test_cont_basic())
def test_rvs_scalar(distname, arg):
# rvs should return a scalar when given scalar arguments (gh-12428)
try:
distfn = getattr(stats, distname)
except TypeError:
distfn = distname
distname = 'rv_histogram_instance'
with npt.suppress_warnings() as sup:
sup.filter(category=DeprecationWarning, message=".*frechet_")
rvs = distfn.rvs(*arg)
assert np.isscalar(distfn.rvs(*arg))
assert np.isscalar(distfn.rvs(*arg, size=()))
assert np.isscalar(distfn.rvs(*arg, size=None))
def test_levy_stable_random_state_property():
# levy_stable only implements rvs(), so it is skipped in the
# main loop in test_cont_basic(). Here we apply just the test
# check_random_state_property to levy_stable.
check_random_state_property(stats.levy_stable, (0.5, 0.1))
def cases_test_moments():
fail_normalization = set(['vonmises'])
fail_higher = set(['vonmises', 'ncf'])
for distname, arg in distcont[:] + [(histogram_test_instance, tuple())]:
if distname == 'levy_stable':
continue
cond1 = distname not in fail_normalization
cond2 = distname not in fail_higher
yield distname, arg, cond1, cond2, False
if not cond1 or not cond2:
# Run the distributions that have issues twice, once skipping the
# not_ok parts, once with the not_ok parts but marked as knownfail
yield pytest.param(distname, arg, True, True, True,
marks=pytest.mark.xfail)
@pytest.mark.slow
@pytest.mark.parametrize('distname,arg,normalization_ok,higher_ok,is_xfailing',
cases_test_moments())
def test_moments(distname, arg, normalization_ok, higher_ok, is_xfailing):
try:
distfn = getattr(stats, distname)
except TypeError:
distfn = distname
distname = 'rv_histogram_instance'
with npt.suppress_warnings() as sup:
sup.filter(IntegrationWarning,
"The integral is probably divergent, or slowly convergent.")
sup.filter(category=DeprecationWarning, message=".*frechet_")
if is_xfailing:
sup.filter(IntegrationWarning)
m, v, s, k = distfn.stats(*arg, moments='mvsk')
if normalization_ok:
check_normalization(distfn, arg, distname)
if higher_ok:
check_mean_expect(distfn, arg, m, distname)
check_skew_expect(distfn, arg, m, v, s, distname)
check_var_expect(distfn, arg, m, v, distname)
check_kurt_expect(distfn, arg, m, v, k, distname)
check_loc_scale(distfn, arg, m, v, distname)
check_moment(distfn, arg, m, v, distname)
@pytest.mark.parametrize('dist,shape_args', distcont)
def test_rvs_broadcast(dist, shape_args):
if dist in ['gausshyper', 'genexpon']:
pytest.skip("too slow")
# If shape_only is True, it means the _rvs method of the
# distribution uses more than one random number to generate a random
# variate. That means the result of using rvs with broadcasting or
# with a nontrivial size will not necessarily be the same as using the
# numpy.vectorize'd version of rvs(), so we can only compare the shapes
# of the results, not the values.
# Whether or not a distribution is in the following list is an
# implementation detail of the distribution, not a requirement. If
# the implementation the rvs() method of a distribution changes, this
# test might also have to be changed.
shape_only = dist in ['argus', 'betaprime', 'dgamma', 'dweibull',
'exponnorm', 'geninvgauss', 'levy_stable', 'nct',
'norminvgauss', 'rice', 'skewnorm', 'semicircular']
distfunc = getattr(stats, dist)
loc = np.zeros(2)
scale = np.ones((3, 1))
nargs = distfunc.numargs
allargs = []
bshape = [3, 2]
# Generate shape parameter arguments...
for k in range(nargs):
shp = (k + 4,) + (1,)*(k + 2)
allargs.append(shape_args[k]*np.ones(shp))
bshape.insert(0, k + 4)
allargs.extend([loc, scale])
# bshape holds the expected shape when loc, scale, and the shape
# parameters are all broadcast together.
check_rvs_broadcast(distfunc, dist, allargs, bshape, shape_only, 'd')
def test_rvs_gh2069_regression():
# Regression tests for gh-2069. In scipy 0.17 and earlier,
# these tests would fail.
#
# A typical example of the broken behavior:
# >>> norm.rvs(loc=np.zeros(5), scale=np.ones(5))
# array([-2.49613705, -2.49613705, -2.49613705, -2.49613705, -2.49613705])
np.random.seed(123)
vals = stats.norm.rvs(loc=np.zeros(5), scale=1)
d = np.diff(vals)
npt.assert_(np.all(d != 0), "All the values are equal, but they shouldn't be!")
vals = stats.norm.rvs(loc=0, scale=np.ones(5))
d = np.diff(vals)
npt.assert_(np.all(d != 0), "All the values are equal, but they shouldn't be!")
vals = stats.norm.rvs(loc=np.zeros(5), scale=np.ones(5))
d = np.diff(vals)
npt.assert_(np.all(d != 0), "All the values are equal, but they shouldn't be!")
vals = stats.norm.rvs(loc=np.array([[0], [0]]), scale=np.ones(5))
d = np.diff(vals.ravel())
npt.assert_(np.all(d != 0), "All the values are equal, but they shouldn't be!")
assert_raises(ValueError, stats.norm.rvs, [[0, 0], [0, 0]],
[[1, 1], [1, 1]], 1)
assert_raises(ValueError, stats.gamma.rvs, [2, 3, 4, 5], 0, 1, (2, 2))
assert_raises(ValueError, stats.gamma.rvs, [1, 1, 1, 1], [0, 0, 0, 0],
[[1], [2]], (4,))
def test_nomodify_gh9900_regression():
# Regression test for gh-9990
# Prior to gh-9990, calls to stats.truncnorm._cdf() use what ever was
# set inside the stats.truncnorm instance during stats.truncnorm.cdf().
# This could cause issues wth multi-threaded code.
# Since then, the calls to cdf() are not permitted to modify the global
# stats.truncnorm instance.
tn = stats.truncnorm
# Use the right-half truncated normal
# Check that the cdf and _cdf return the same result.
npt.assert_almost_equal(tn.cdf(1, 0, np.inf), 0.6826894921370859)
npt.assert_almost_equal(tn._cdf(1, 0, np.inf), 0.6826894921370859)
# Now use the left-half truncated normal
npt.assert_almost_equal(tn.cdf(-1, -np.inf, 0), 0.31731050786291415)
npt.assert_almost_equal(tn._cdf(-1, -np.inf, 0), 0.31731050786291415)
# Check that the right-half truncated normal _cdf hasn't changed
npt.assert_almost_equal(tn._cdf(1, 0, np.inf), 0.6826894921370859) # NOT 1.6826894921370859
npt.assert_almost_equal(tn.cdf(1, 0, np.inf), 0.6826894921370859)
# Check that the left-half truncated normal _cdf hasn't changed
npt.assert_almost_equal(tn._cdf(-1, -np.inf, 0), 0.31731050786291415) # Not -0.6826894921370859
npt.assert_almost_equal(tn.cdf(1, -np.inf, 0), 1) # Not 1.6826894921370859
npt.assert_almost_equal(tn.cdf(-1, -np.inf, 0), 0.31731050786291415) # Not -0.6826894921370859
def test_broadcast_gh9990_regression():
# Regression test for gh-9990
# The x-value 7 only lies within the support of 4 of the supplied
# distributions. Prior to 9990, one array passed to
# stats.reciprocal._cdf would have 4 elements, but an array
# previously stored by stats.reciprocal_argcheck() would have 6, leading
# to a broadcast error.
a = np.array([1, 2, 3, 4, 5, 6])
b = np.array([8, 16, 1, 32, 1, 48])
ans = [stats.reciprocal.cdf(7, _a, _b) for _a, _b in zip(a,b)]
npt.assert_array_almost_equal(stats.reciprocal.cdf(7, a, b), ans)
ans = [stats.reciprocal.cdf(1, _a, _b) for _a, _b in zip(a,b)]
npt.assert_array_almost_equal(stats.reciprocal.cdf(1, a, b), ans)
ans = [stats.reciprocal.cdf(_a, _a, _b) for _a, _b in zip(a,b)]
npt.assert_array_almost_equal(stats.reciprocal.cdf(a, a, b), ans)
ans = [stats.reciprocal.cdf(_b, _a, _b) for _a, _b in zip(a,b)]
npt.assert_array_almost_equal(stats.reciprocal.cdf(b, a, b), ans)
def test_broadcast_gh7933_regression():
# Check broadcast works
stats.truncnorm.logpdf(
np.array([3.0, 2.0, 1.0]),
a=(1.5 - np.array([6.0, 5.0, 4.0])) / 3.0,
b=np.inf,
loc=np.array([6.0, 5.0, 4.0]),
scale=3.0
)
def test_gh2002_regression():
# Add a check that broadcast works in situations where only some
# x-values are compatible with some of the shape arguments.
x = np.r_[-2:2:101j]
a = np.r_[-np.ones(50), np.ones(51)]
expected = [stats.truncnorm.pdf(_x, _a, np.inf) for _x, _a in zip(x, a)]
ans = stats.truncnorm.pdf(x, a, np.inf)
npt.assert_array_almost_equal(ans, expected)
def test_gh1320_regression():
# Check that the first example from gh-1320 now works.
c = 2.62
stats.genextreme.ppf(0.5, np.array([[c], [c + 0.5]]))
# The other examples in gh-1320 appear to have stopped working
# some time ago.
# ans = stats.genextreme.moment(2, np.array([c, c + 0.5]))
# expected = np.array([25.50105963, 115.11191437])
# stats.genextreme.moment(5, np.array([[c], [c + 0.5]]))
# stats.genextreme.moment(5, np.array([c, c + 0.5]))
def check_sample_meanvar_(distfn, arg, m, v, sm, sv, sn, msg):
# this did not work, skipped silently by nose
if np.isfinite(m):
check_sample_mean(sm, sv, sn, m)
if np.isfinite(v):
check_sample_var(sv, sn, v)
def check_sample_mean(sm, v, n, popmean):
# from stats.stats.ttest_1samp(a, popmean):
# Calculates the t-obtained for the independent samples T-test on ONE group
# of scores a, given a population mean.
#
# Returns: t-value, two-tailed prob
df = n-1
svar = ((n-1)*v) / float(df) # looks redundant
t = (sm-popmean) / np.sqrt(svar*(1.0/n))
prob = betainc(0.5*df, 0.5, df/(df + t*t))
# return t,prob
npt.assert_(prob > 0.01, 'mean fail, t,prob = %f, %f, m, sm=%f,%f' %
(t, prob, popmean, sm))
def check_sample_var(sv, n, popvar):
# two-sided chisquare test for sample variance equal to
# hypothesized variance
df = n-1
chi2 = (n-1)*popvar/float(popvar)
pval = stats.distributions.chi2.sf(chi2, df) * 2
npt.assert_(pval > 0.01, 'var fail, t, pval = %f, %f, v, sv=%f, %f' %
(chi2, pval, popvar, sv))
def check_cdf_ppf(distfn, arg, msg):
values = [0.001, 0.5, 0.999]
npt.assert_almost_equal(distfn.cdf(distfn.ppf(values, *arg), *arg),
values, decimal=DECIMAL, err_msg=msg +
' - cdf-ppf roundtrip')
def check_sf_isf(distfn, arg, msg):
npt.assert_almost_equal(distfn.sf(distfn.isf([0.1, 0.5, 0.9], *arg), *arg),
[0.1, 0.5, 0.9], decimal=DECIMAL, err_msg=msg +
' - sf-isf roundtrip')
npt.assert_almost_equal(distfn.cdf([0.1, 0.9], *arg),
1.0 - distfn.sf([0.1, 0.9], *arg),
decimal=DECIMAL, err_msg=msg +
' - cdf-sf relationship')
def check_pdf(distfn, arg, msg):
# compares pdf at median with numerical derivative of cdf
median = distfn.ppf(0.5, *arg)
eps = 1e-6
pdfv = distfn.pdf(median, *arg)
if (pdfv < 1e-4) or (pdfv > 1e4):
# avoid checking a case where pdf is close to zero or
# huge (singularity)
median = median + 0.1
pdfv = distfn.pdf(median, *arg)
cdfdiff = (distfn.cdf(median + eps, *arg) -
distfn.cdf(median - eps, *arg))/eps/2.0
# replace with better diff and better test (more points),
# actually, this works pretty well
msg += ' - cdf-pdf relationship'
npt.assert_almost_equal(pdfv, cdfdiff, decimal=DECIMAL, err_msg=msg)
def check_pdf_logpdf(distfn, args, msg):
# compares pdf at several points with the log of the pdf
points = np.array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])
vals = distfn.ppf(points, *args)
vals = vals[np.isfinite(vals)]
pdf = distfn.pdf(vals, *args)
logpdf = distfn.logpdf(vals, *args)
pdf = pdf[(pdf != 0) & np.isfinite(pdf)]
logpdf = logpdf[np.isfinite(logpdf)]
msg += " - logpdf-log(pdf) relationship"
npt.assert_almost_equal(np.log(pdf), logpdf, decimal=7, err_msg=msg)
def check_pdf_logpdf_at_endpoints(distfn, args, msg):
# compares pdf with the log of the pdf at the (finite) end points
points = np.array([0, 1])
vals = distfn.ppf(points, *args)
vals = vals[np.isfinite(vals)]
with npt.suppress_warnings() as sup:
# Several distributions incur divide by zero or encounter invalid values when computing
# the pdf or logpdf at the endpoints.
suppress_messsages = [
"divide by zero encountered in true_divide", # multiple distributions
"divide by zero encountered in log", # multiple distributions
"divide by zero encountered in power", # gengamma
"invalid value encountered in add", # genextreme
"invalid value encountered in subtract", # gengamma
"invalid value encountered in multiply" # recipinvgauss
]
for msg in suppress_messsages:
sup.filter(category=RuntimeWarning, message=msg)
pdf = distfn.pdf(vals, *args)
logpdf = distfn.logpdf(vals, *args)
pdf = pdf[(pdf != 0) & np.isfinite(pdf)]
logpdf = logpdf[np.isfinite(logpdf)]
msg += " - logpdf-log(pdf) relationship"
npt.assert_almost_equal(np.log(pdf), logpdf, decimal=7, err_msg=msg)
def check_sf_logsf(distfn, args, msg):
# compares sf at several points with the log of the sf
points = np.array([0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0])
vals = distfn.ppf(points, *args)
vals = vals[np.isfinite(vals)]
sf = distfn.sf(vals, *args)
logsf = distfn.logsf(vals, *args)
sf = sf[sf != 0]
logsf = logsf[np.isfinite(logsf)]
msg += " - logsf-log(sf) relationship"
npt.assert_almost_equal(np.log(sf), logsf, decimal=7, err_msg=msg)
def check_cdf_logcdf(distfn, args, msg):
# compares cdf at several points with the log of the cdf
points = np.array([0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0])
vals = distfn.ppf(points, *args)
vals = vals[np.isfinite(vals)]
cdf = distfn.cdf(vals, *args)
logcdf = distfn.logcdf(vals, *args)
cdf = cdf[cdf != 0]
logcdf = logcdf[np.isfinite(logcdf)]
msg += " - logcdf-log(cdf) relationship"
npt.assert_almost_equal(np.log(cdf), logcdf, decimal=7, err_msg=msg)
def check_ppf_broadcast(distfn, arg, msg):
# compares ppf for multiple argsets.
num_repeats = 5
args = [] * num_repeats
if arg:
args = [np.array([_] * num_repeats) for _ in arg]
median = distfn.ppf(0.5, *arg)
medians = distfn.ppf(0.5, *args)
msg += " - ppf multiple"
npt.assert_almost_equal(medians, [median] * num_repeats, decimal=7, err_msg=msg)
def check_distribution_rvs(dist, args, alpha, rvs):
# dist is either a cdf function or name of a distribution in scipy.stats.
# args are the args for scipy.stats.dist(*args)
# alpha is a significance level, ~0.01
# rvs is array_like of random variables
# test from scipy.stats.tests
# this version reuses existing random variables
D, pval = stats.kstest(rvs, dist, args=args, N=1000)
if (pval < alpha):
# The rvs passed in failed the K-S test, which _could_ happen
# but is unlikely if alpha is small enough.
# Repeat the the test with a new sample of rvs.
# Generate 1000 rvs, perform a K-S test that the new sample of rvs
# are distributed according to the distribution.
D, pval = stats.kstest(dist, dist, args=args, N=1000)
npt.assert_(pval > alpha, "D = " + str(D) + "; pval = " + str(pval) +
"; alpha = " + str(alpha) + "\nargs = " + str(args))
def check_vecentropy(distfn, args):
npt.assert_equal(distfn.vecentropy(*args), distfn._entropy(*args))
def check_loc_scale(distfn, arg, m, v, msg):
loc, scale = 10.0, 10.0
mt, vt = distfn.stats(loc=loc, scale=scale, *arg)
npt.assert_allclose(m*scale + loc, mt)
npt.assert_allclose(v*scale*scale, vt)
def check_ppf_private(distfn, arg, msg):
# fails by design for truncnorm self.nb not defined
ppfs = distfn._ppf(np.array([0.1, 0.5, 0.9]), *arg)
npt.assert_(not np.any(np.isnan(ppfs)), msg + 'ppf private is nan')
def check_retrieving_support(distfn, args):
loc, scale = 1, 2
supp = distfn.support(*args)
supp_loc_scale = distfn.support(*args, loc=loc, scale=scale)
npt.assert_almost_equal(np.array(supp)*scale + loc,
np.array(supp_loc_scale))
def check_fit_args(distfn, arg, rvs):
with np.errstate(all='ignore'), npt.suppress_warnings() as sup:
sup.filter(category=DeprecationWarning, message=".*frechet_")
sup.filter(category=RuntimeWarning,
message="The shape parameter of the erlang")
sup.filter(category=RuntimeWarning,
message="floating point number truncated")
vals = distfn.fit(rvs)
vals2 = distfn.fit(rvs, optimizer='powell')
# Only check the length of the return
# FIXME: should check the actual results to see if we are 'close'
# to what was created --- but what is 'close' enough
npt.assert_(len(vals) == 2+len(arg))
npt.assert_(len(vals2) == 2+len(arg))
def check_fit_args_fix(distfn, arg, rvs):
with np.errstate(all='ignore'), npt.suppress_warnings() as sup:
sup.filter(category=DeprecationWarning, message=".*frechet_")
sup.filter(category=RuntimeWarning,
message="The shape parameter of the erlang")
vals = distfn.fit(rvs, floc=0)
vals2 = distfn.fit(rvs, fscale=1)
npt.assert_(len(vals) == 2+len(arg))
npt.assert_(vals[-2] == 0)
npt.assert_(vals2[-1] == 1)
npt.assert_(len(vals2) == 2+len(arg))
if len(arg) > 0:
vals3 = distfn.fit(rvs, f0=arg[0])
npt.assert_(len(vals3) == 2+len(arg))
npt.assert_(vals3[0] == arg[0])
if len(arg) > 1:
vals4 = distfn.fit(rvs, f1=arg[1])
npt.assert_(len(vals4) == 2+len(arg))
npt.assert_(vals4[1] == arg[1])
if len(arg) > 2:
vals5 = distfn.fit(rvs, f2=arg[2])
npt.assert_(len(vals5) == 2+len(arg))
npt.assert_(vals5[2] == arg[2])
@pytest.mark.parametrize('method', ['pdf', 'logpdf', 'cdf', 'logcdf',
'sf', 'logsf', 'ppf', 'isf'])
@pytest.mark.parametrize('distname, args', distcont)
def test_methods_with_lists(method, distname, args):
# Test that the continuous distributions can accept Python lists
# as arguments.
with npt.suppress_warnings() as sup:
sup.filter(category=DeprecationWarning, message=".*frechet_")
dist = getattr(stats, distname)
f = getattr(dist, method)
if distname == 'invweibull' and method.startswith('log'):
x = [1.5, 2]
else:
x = [0.1, 0.2]
shape2 = [[a]*2 for a in args]
loc = [0, 0.1]
scale = [1, 1.01]
result = f(x, *shape2, loc=loc, scale=scale)
npt.assert_allclose(result,
[f(*v) for v in zip(x, *shape2, loc, scale)],
rtol=1e-15, atol=1e-15)

View file

@ -0,0 +1,273 @@
import numpy.testing as npt
import numpy as np
import pytest
from scipy import stats
from .common_tests import (check_normalization, check_moment, check_mean_expect,
check_var_expect, check_skew_expect,
check_kurt_expect, check_entropy,
check_private_entropy, check_edge_support,
check_named_args, check_random_state_property,
check_pickling, check_rvs_broadcast, check_freezing)
from scipy.stats._distr_params import distdiscrete
vals = ([1, 2, 3, 4], [0.1, 0.2, 0.3, 0.4])
distdiscrete += [[stats.rv_discrete(values=vals), ()]]
def cases_test_discrete_basic():
seen = set()
for distname, arg in distdiscrete:
yield distname, arg, distname not in seen
seen.add(distname)
@pytest.mark.parametrize('distname,arg,first_case', cases_test_discrete_basic())
def test_discrete_basic(distname, arg, first_case):
try:
distfn = getattr(stats, distname)
except TypeError:
distfn = distname
distname = 'sample distribution'
np.random.seed(9765456)
rvs = distfn.rvs(size=2000, *arg)
supp = np.unique(rvs)
m, v = distfn.stats(*arg)
check_cdf_ppf(distfn, arg, supp, distname + ' cdf_ppf')
check_pmf_cdf(distfn, arg, distname)
check_oth(distfn, arg, supp, distname + ' oth')
check_edge_support(distfn, arg)
alpha = 0.01
check_discrete_chisquare(distfn, arg, rvs, alpha,
distname + ' chisquare')
if first_case:
locscale_defaults = (0,)
meths = [distfn.pmf, distfn.logpmf, distfn.cdf, distfn.logcdf,
distfn.logsf]
# make sure arguments are within support
spec_k = {'randint': 11, 'hypergeom': 4, 'bernoulli': 0, }
k = spec_k.get(distname, 1)
check_named_args(distfn, k, arg, locscale_defaults, meths)
if distname != 'sample distribution':
check_scale_docstring(distfn)
check_random_state_property(distfn, arg)
check_pickling(distfn, arg)
check_freezing(distfn, arg)
# Entropy
check_entropy(distfn, arg, distname)
if distfn.__class__._entropy != stats.rv_discrete._entropy:
check_private_entropy(distfn, arg, stats.rv_discrete)
@pytest.mark.parametrize('distname,arg', distdiscrete)
def test_moments(distname, arg):
try:
distfn = getattr(stats, distname)
except TypeError:
distfn = distname
distname = 'sample distribution'
m, v, s, k = distfn.stats(*arg, moments='mvsk')
check_normalization(distfn, arg, distname)
# compare `stats` and `moment` methods
check_moment(distfn, arg, m, v, distname)
check_mean_expect(distfn, arg, m, distname)
check_var_expect(distfn, arg, m, v, distname)
check_skew_expect(distfn, arg, m, v, s, distname)
if distname not in ['zipf', 'yulesimon']:
check_kurt_expect(distfn, arg, m, v, k, distname)
# frozen distr moments
check_moment_frozen(distfn, arg, m, 1)
check_moment_frozen(distfn, arg, v+m*m, 2)
@pytest.mark.parametrize('dist,shape_args', distdiscrete)
def test_rvs_broadcast(dist, shape_args):
# If shape_only is True, it means the _rvs method of the
# distribution uses more than one random number to generate a random
# variate. That means the result of using rvs with broadcasting or
# with a nontrivial size will not necessarily be the same as using the
# numpy.vectorize'd version of rvs(), so we can only compare the shapes
# of the results, not the values.
# Whether or not a distribution is in the following list is an
# implementation detail of the distribution, not a requirement. If
# the implementation the rvs() method of a distribution changes, this
# test might also have to be changed.
shape_only = dist in ['betabinom', 'skellam', 'yulesimon', 'dlaplace']
try:
distfunc = getattr(stats, dist)
except TypeError:
distfunc = dist
dist = 'rv_discrete(values=(%r, %r))' % (dist.xk, dist.pk)
loc = np.zeros(2)
nargs = distfunc.numargs
allargs = []
bshape = []
# Generate shape parameter arguments...
for k in range(nargs):
shp = (k + 3,) + (1,)*(k + 1)
param_val = shape_args[k]
allargs.append(np.full(shp, param_val))
bshape.insert(0, shp[0])
allargs.append(loc)
bshape.append(loc.size)
# bshape holds the expected shape when loc, scale, and the shape
# parameters are all broadcast together.
check_rvs_broadcast(distfunc, dist, allargs, bshape, shape_only, [np.int_])
@pytest.mark.parametrize('dist,args', distdiscrete)
def test_ppf_with_loc(dist, args):
try:
distfn = getattr(stats, dist)
except TypeError:
distfn = dist
#check with a negative, no and positive relocation.
np.random.seed(1942349)
re_locs = [np.random.randint(-10, -1), 0, np.random.randint(1, 10)]
_a, _b = distfn.support(*args)
for loc in re_locs:
npt.assert_array_equal(
[_a-1+loc, _b+loc],
[distfn.ppf(0.0, *args, loc=loc), distfn.ppf(1.0, *args, loc=loc)]
)
def check_cdf_ppf(distfn, arg, supp, msg):
# cdf is a step function, and ppf(q) = min{k : cdf(k) >= q, k integer}
npt.assert_array_equal(distfn.ppf(distfn.cdf(supp, *arg), *arg),
supp, msg + '-roundtrip')
npt.assert_array_equal(distfn.ppf(distfn.cdf(supp, *arg) - 1e-8, *arg),
supp, msg + '-roundtrip')
if not hasattr(distfn, 'xk'):
_a, _b = distfn.support(*arg)
supp1 = supp[supp < _b]
npt.assert_array_equal(distfn.ppf(distfn.cdf(supp1, *arg) + 1e-8, *arg),
supp1 + distfn.inc, msg + ' ppf-cdf-next')
# -1e-8 could cause an error if pmf < 1e-8
def check_pmf_cdf(distfn, arg, distname):
if hasattr(distfn, 'xk'):
index = distfn.xk
else:
startind = int(distfn.ppf(0.01, *arg) - 1)
index = list(range(startind, startind + 10))
cdfs = distfn.cdf(index, *arg)
pmfs_cum = distfn.pmf(index, *arg).cumsum()
atol, rtol = 1e-10, 1e-10
if distname == 'skellam': # ncx2 accuracy
atol, rtol = 1e-5, 1e-5
npt.assert_allclose(cdfs - cdfs[0], pmfs_cum - pmfs_cum[0],
atol=atol, rtol=rtol)
def check_moment_frozen(distfn, arg, m, k):
npt.assert_allclose(distfn(*arg).moment(k), m,
atol=1e-10, rtol=1e-10)
def check_oth(distfn, arg, supp, msg):
# checking other methods of distfn
npt.assert_allclose(distfn.sf(supp, *arg), 1. - distfn.cdf(supp, *arg),
atol=1e-10, rtol=1e-10)
q = np.linspace(0.01, 0.99, 20)
npt.assert_allclose(distfn.isf(q, *arg), distfn.ppf(1. - q, *arg),
atol=1e-10, rtol=1e-10)
median_sf = distfn.isf(0.5, *arg)
npt.assert_(distfn.sf(median_sf - 1, *arg) > 0.5)
npt.assert_(distfn.cdf(median_sf + 1, *arg) > 0.5)
def check_discrete_chisquare(distfn, arg, rvs, alpha, msg):
"""Perform chisquare test for random sample of a discrete distribution
Parameters
----------
distname : string
name of distribution function
arg : sequence
parameters of distribution
alpha : float
significance level, threshold for p-value
Returns
-------
result : bool
0 if test passes, 1 if test fails
"""
wsupp = 0.05
# construct intervals with minimum mass `wsupp`.
# intervals are left-half-open as in a cdf difference
_a, _b = distfn.support(*arg)
lo = int(max(_a, -1000))
high = int(min(_b, 1000)) + 1
distsupport = range(lo, high)
last = 0
distsupp = [lo]
distmass = []
for ii in distsupport:
current = distfn.cdf(ii, *arg)
if current - last >= wsupp - 1e-14:
distsupp.append(ii)
distmass.append(current - last)
last = current
if current > (1 - wsupp):
break
if distsupp[-1] < _b:
distsupp.append(_b)
distmass.append(1 - last)
distsupp = np.array(distsupp)
distmass = np.array(distmass)
# convert intervals to right-half-open as required by histogram
histsupp = distsupp + 1e-8
histsupp[0] = _a
# find sample frequencies and perform chisquare test
freq, hsupp = np.histogram(rvs, histsupp)
chis, pval = stats.chisquare(np.array(freq), len(rvs)*distmass)
npt.assert_(pval > alpha,
'chisquare - test for %s at arg = %s with pval = %s' %
(msg, str(arg), str(pval)))
def check_scale_docstring(distfn):
if distfn.__doc__ is not None:
# Docstrings can be stripped if interpreter is run with -OO
npt.assert_('scale' not in distfn.__doc__)
@pytest.mark.parametrize('method', ['pmf', 'logpmf', 'cdf', 'logcdf',
'sf', 'logsf', 'ppf', 'isf'])
@pytest.mark.parametrize('distname, args', distdiscrete)
def test_methods_with_lists(method, distname, args):
# Test that the discrete distributions can accept Python lists
# as arguments.
try:
dist = getattr(stats, distname)
except TypeError:
return
if method in ['ppf', 'isf']:
z = [0.1, 0.2]
else:
z = [0, 1]
p2 = [[p]*2 for p in args]
loc = [0, 1]
result = dist.pmf(z, *p2, loc=loc)
npt.assert_allclose(result,
[dist.pmf(*v) for v in zip(z, *p2, loc)],
rtol=1e-15, atol=1e-15)

View file

@ -0,0 +1,68 @@
from scipy.stats import betabinom, hypergeom, bernoulli, boltzmann
import numpy as np
from numpy.testing import assert_almost_equal, assert_equal, assert_allclose
def test_hypergeom_logpmf():
# symmetries test
# f(k,N,K,n) = f(n-k,N,N-K,n) = f(K-k,N,K,N-n) = f(k,N,n,K)
k = 5
N = 50
K = 10
n = 5
logpmf1 = hypergeom.logpmf(k, N, K, n)
logpmf2 = hypergeom.logpmf(n - k, N, N - K, n)
logpmf3 = hypergeom.logpmf(K - k, N, K, N - n)
logpmf4 = hypergeom.logpmf(k, N, n, K)
assert_almost_equal(logpmf1, logpmf2, decimal=12)
assert_almost_equal(logpmf1, logpmf3, decimal=12)
assert_almost_equal(logpmf1, logpmf4, decimal=12)
# test related distribution
# Bernoulli distribution if n = 1
k = 1
N = 10
K = 7
n = 1
hypergeom_logpmf = hypergeom.logpmf(k, N, K, n)
bernoulli_logpmf = bernoulli.logpmf(k, K/N)
assert_almost_equal(hypergeom_logpmf, bernoulli_logpmf, decimal=12)
def test_boltzmann_upper_bound():
k = np.arange(-3, 5)
N = 1
p = boltzmann.pmf(k, 0.123, N)
expected = k == 0
assert_equal(p, expected)
lam = np.log(2)
N = 3
p = boltzmann.pmf(k, lam, N)
expected = [0, 0, 0, 4/7, 2/7, 1/7, 0, 0]
assert_allclose(p, expected, rtol=1e-13)
c = boltzmann.cdf(k, lam, N)
expected = [0, 0, 0, 4/7, 6/7, 1, 1, 1]
assert_allclose(c, expected, rtol=1e-13)
def test_betabinom_a_and_b_unity():
# test limiting case that betabinom(n, 1, 1) is a discrete uniform
# distribution from 0 to n
n = 20
k = np.arange(n + 1)
p = betabinom(n, 1, 1).pmf(k)
expected = np.repeat(1 / (n + 1), n + 1)
assert_almost_equal(p, expected)
def test_betabinom_bernoulli():
# test limiting case that betabinom(1, a, b) = bernoulli(a / (a + b))
a = 2.3
b = 0.63
k = np.arange(2)
p = betabinom(1, a, b).pmf(k)
expected = bernoulli(a / (a + b)).pmf(k)
assert_almost_equal(p, expected)

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,120 @@
import os
import numpy as np
from numpy.testing import assert_allclose, suppress_warnings
import pytest
from scipy import stats
from .test_continuous_basic import distcont
# this is not a proper statistical test for convergence, but only
# verifies that the estimate and true values don't differ by too much
fit_sizes = [1000, 5000] # sample sizes to try
thresh_percent = 0.25 # percent of true parameters for fail cut-off
thresh_min = 0.75 # minimum difference estimate - true to fail test
failing_fits = [
'burr',
'chi2',
'gausshyper',
'genexpon',
'gengamma',
'kappa4',
'ksone',
'kstwo',
'mielke',
'ncf',
'ncx2',
'pearson3',
'powerlognorm',
'truncexpon',
'tukeylambda',
'vonmises',
'wrapcauchy',
'levy_stable',
'trapz'
]
# Don't run the fit test on these:
skip_fit = [
'erlang', # Subclass of gamma, generates a warning.
]
def cases_test_cont_fit():
# this tests the closeness of the estimated parameters to the true
# parameters with fit method of continuous distributions
# Note: is slow, some distributions don't converge with sample size <= 10000
for distname, arg in distcont:
if distname not in skip_fit:
yield distname, arg
@pytest.mark.slow
@pytest.mark.parametrize('distname,arg', cases_test_cont_fit())
def test_cont_fit(distname, arg):
if distname in failing_fits:
# Skip failing fits unless overridden
try:
xfail = not int(os.environ['SCIPY_XFAIL'])
except Exception:
xfail = True
if xfail:
msg = "Fitting %s doesn't work reliably yet" % distname
msg += " [Set environment variable SCIPY_XFAIL=1 to run this test nevertheless.]"
pytest.xfail(msg)
distfn = getattr(stats, distname)
truearg = np.hstack([arg, [0.0, 1.0]])
diffthreshold = np.max(np.vstack([truearg*thresh_percent,
np.full(distfn.numargs+2, thresh_min)]),
0)
for fit_size in fit_sizes:
# Note that if a fit succeeds, the other fit_sizes are skipped
np.random.seed(1234)
with np.errstate(all='ignore'), suppress_warnings() as sup:
sup.filter(category=DeprecationWarning, message=".*frechet_")
rvs = distfn.rvs(size=fit_size, *arg)
est = distfn.fit(rvs) # start with default values
diff = est - truearg
# threshold for location
diffthreshold[-2] = np.max([np.abs(rvs.mean())*thresh_percent,thresh_min])
if np.any(np.isnan(est)):
raise AssertionError('nan returned in fit')
else:
if np.all(np.abs(diff) <= diffthreshold):
break
else:
txt = 'parameter: %s\n' % str(truearg)
txt += 'estimated: %s\n' % str(est)
txt += 'diff : %s\n' % str(diff)
raise AssertionError('fit not very good in %s\n' % distfn.name + txt)
def _check_loc_scale_mle_fit(name, data, desired, atol=None):
d = getattr(stats, name)
actual = d.fit(data)[-2:]
assert_allclose(actual, desired, atol=atol,
err_msg='poor mle fit of (loc, scale) in %s' % name)
def test_non_default_loc_scale_mle_fit():
data = np.array([1.01, 1.78, 1.78, 1.78, 1.88, 1.88, 1.88, 2.00])
_check_loc_scale_mle_fit('uniform', data, [1.01, 0.99], 1e-3)
_check_loc_scale_mle_fit('expon', data, [1.01, 0.73875], 1e-3)
def test_expon_fit():
"""gh-6167"""
data = [0, 0, 0, 0, 2, 2, 2, 2]
phat = stats.expon.fit(data, floc=0)
assert_allclose(phat, [0, 1.0], atol=1e-3)

View file

@ -0,0 +1,470 @@
from scipy import stats
import numpy as np
from numpy.testing import (assert_almost_equal, assert_,
assert_array_almost_equal, assert_array_almost_equal_nulp, assert_allclose)
import pytest
from pytest import raises as assert_raises
def test_kde_1d():
#some basic tests comparing to normal distribution
np.random.seed(8765678)
n_basesample = 500
xn = np.random.randn(n_basesample)
xnmean = xn.mean()
xnstd = xn.std(ddof=1)
# get kde for original sample
gkde = stats.gaussian_kde(xn)
# evaluate the density function for the kde for some points
xs = np.linspace(-7,7,501)
kdepdf = gkde.evaluate(xs)
normpdf = stats.norm.pdf(xs, loc=xnmean, scale=xnstd)
intervall = xs[1] - xs[0]
assert_(np.sum((kdepdf - normpdf)**2)*intervall < 0.01)
prob1 = gkde.integrate_box_1d(xnmean, np.inf)
prob2 = gkde.integrate_box_1d(-np.inf, xnmean)
assert_almost_equal(prob1, 0.5, decimal=1)
assert_almost_equal(prob2, 0.5, decimal=1)
assert_almost_equal(gkde.integrate_box(xnmean, np.inf), prob1, decimal=13)
assert_almost_equal(gkde.integrate_box(-np.inf, xnmean), prob2, decimal=13)
assert_almost_equal(gkde.integrate_kde(gkde),
(kdepdf**2).sum()*intervall, decimal=2)
assert_almost_equal(gkde.integrate_gaussian(xnmean, xnstd**2),
(kdepdf*normpdf).sum()*intervall, decimal=2)
def test_kde_1d_weighted():
#some basic tests comparing to normal distribution
np.random.seed(8765678)
n_basesample = 500
xn = np.random.randn(n_basesample)
wn = np.random.rand(n_basesample)
xnmean = np.average(xn, weights=wn)
xnstd = np.sqrt(np.average((xn-xnmean)**2, weights=wn))
# get kde for original sample
gkde = stats.gaussian_kde(xn, weights=wn)
# evaluate the density function for the kde for some points
xs = np.linspace(-7,7,501)
kdepdf = gkde.evaluate(xs)
normpdf = stats.norm.pdf(xs, loc=xnmean, scale=xnstd)
intervall = xs[1] - xs[0]
assert_(np.sum((kdepdf - normpdf)**2)*intervall < 0.01)
prob1 = gkde.integrate_box_1d(xnmean, np.inf)
prob2 = gkde.integrate_box_1d(-np.inf, xnmean)
assert_almost_equal(prob1, 0.5, decimal=1)
assert_almost_equal(prob2, 0.5, decimal=1)
assert_almost_equal(gkde.integrate_box(xnmean, np.inf), prob1, decimal=13)
assert_almost_equal(gkde.integrate_box(-np.inf, xnmean), prob2, decimal=13)
assert_almost_equal(gkde.integrate_kde(gkde),
(kdepdf**2).sum()*intervall, decimal=2)
assert_almost_equal(gkde.integrate_gaussian(xnmean, xnstd**2),
(kdepdf*normpdf).sum()*intervall, decimal=2)
@pytest.mark.slow
def test_kde_2d():
#some basic tests comparing to normal distribution
np.random.seed(8765678)
n_basesample = 500
mean = np.array([1.0, 3.0])
covariance = np.array([[1.0, 2.0], [2.0, 6.0]])
# Need transpose (shape (2, 500)) for kde
xn = np.random.multivariate_normal(mean, covariance, size=n_basesample).T
# get kde for original sample
gkde = stats.gaussian_kde(xn)
# evaluate the density function for the kde for some points
x, y = np.mgrid[-7:7:500j, -7:7:500j]
grid_coords = np.vstack([x.ravel(), y.ravel()])
kdepdf = gkde.evaluate(grid_coords)
kdepdf = kdepdf.reshape(500, 500)
normpdf = stats.multivariate_normal.pdf(np.dstack([x, y]), mean=mean, cov=covariance)
intervall = y.ravel()[1] - y.ravel()[0]
assert_(np.sum((kdepdf - normpdf)**2) * (intervall**2) < 0.01)
small = -1e100
large = 1e100
prob1 = gkde.integrate_box([small, mean[1]], [large, large])
prob2 = gkde.integrate_box([small, small], [large, mean[1]])
assert_almost_equal(prob1, 0.5, decimal=1)
assert_almost_equal(prob2, 0.5, decimal=1)
assert_almost_equal(gkde.integrate_kde(gkde),
(kdepdf**2).sum()*(intervall**2), decimal=2)
assert_almost_equal(gkde.integrate_gaussian(mean, covariance),
(kdepdf*normpdf).sum()*(intervall**2), decimal=2)
@pytest.mark.slow
def test_kde_2d_weighted():
#some basic tests comparing to normal distribution
np.random.seed(8765678)
n_basesample = 500
mean = np.array([1.0, 3.0])
covariance = np.array([[1.0, 2.0], [2.0, 6.0]])
# Need transpose (shape (2, 500)) for kde
xn = np.random.multivariate_normal(mean, covariance, size=n_basesample).T
wn = np.random.rand(n_basesample)
# get kde for original sample
gkde = stats.gaussian_kde(xn, weights=wn)
# evaluate the density function for the kde for some points
x, y = np.mgrid[-7:7:500j, -7:7:500j]
grid_coords = np.vstack([x.ravel(), y.ravel()])
kdepdf = gkde.evaluate(grid_coords)
kdepdf = kdepdf.reshape(500, 500)
normpdf = stats.multivariate_normal.pdf(np.dstack([x, y]), mean=mean, cov=covariance)
intervall = y.ravel()[1] - y.ravel()[0]
assert_(np.sum((kdepdf - normpdf)**2) * (intervall**2) < 0.01)
small = -1e100
large = 1e100
prob1 = gkde.integrate_box([small, mean[1]], [large, large])
prob2 = gkde.integrate_box([small, small], [large, mean[1]])
assert_almost_equal(prob1, 0.5, decimal=1)
assert_almost_equal(prob2, 0.5, decimal=1)
assert_almost_equal(gkde.integrate_kde(gkde),
(kdepdf**2).sum()*(intervall**2), decimal=2)
assert_almost_equal(gkde.integrate_gaussian(mean, covariance),
(kdepdf*normpdf).sum()*(intervall**2), decimal=2)
def test_kde_bandwidth_method():
def scotts_factor(kde_obj):
"""Same as default, just check that it works."""
return np.power(kde_obj.n, -1./(kde_obj.d+4))
np.random.seed(8765678)
n_basesample = 50
xn = np.random.randn(n_basesample)
# Default
gkde = stats.gaussian_kde(xn)
# Supply a callable
gkde2 = stats.gaussian_kde(xn, bw_method=scotts_factor)
# Supply a scalar
gkde3 = stats.gaussian_kde(xn, bw_method=gkde.factor)
xs = np.linspace(-7,7,51)
kdepdf = gkde.evaluate(xs)
kdepdf2 = gkde2.evaluate(xs)
assert_almost_equal(kdepdf, kdepdf2)
kdepdf3 = gkde3.evaluate(xs)
assert_almost_equal(kdepdf, kdepdf3)
assert_raises(ValueError, stats.gaussian_kde, xn, bw_method='wrongstring')
def test_kde_bandwidth_method_weighted():
def scotts_factor(kde_obj):
"""Same as default, just check that it works."""
return np.power(kde_obj.neff, -1./(kde_obj.d+4))
np.random.seed(8765678)
n_basesample = 50
xn = np.random.randn(n_basesample)
# Default
gkde = stats.gaussian_kde(xn)
# Supply a callable
gkde2 = stats.gaussian_kde(xn, bw_method=scotts_factor)
# Supply a scalar
gkde3 = stats.gaussian_kde(xn, bw_method=gkde.factor)
xs = np.linspace(-7,7,51)
kdepdf = gkde.evaluate(xs)
kdepdf2 = gkde2.evaluate(xs)
assert_almost_equal(kdepdf, kdepdf2)
kdepdf3 = gkde3.evaluate(xs)
assert_almost_equal(kdepdf, kdepdf3)
assert_raises(ValueError, stats.gaussian_kde, xn, bw_method='wrongstring')
# Subclasses that should stay working (extracted from various sources).
# Unfortunately the earlier design of gaussian_kde made it necessary for users
# to create these kinds of subclasses, or call _compute_covariance() directly.
class _kde_subclass1(stats.gaussian_kde):
def __init__(self, dataset):
self.dataset = np.atleast_2d(dataset)
self.d, self.n = self.dataset.shape
self.covariance_factor = self.scotts_factor
self._compute_covariance()
class _kde_subclass2(stats.gaussian_kde):
def __init__(self, dataset):
self.covariance_factor = self.scotts_factor
super(_kde_subclass2, self).__init__(dataset)
class _kde_subclass3(stats.gaussian_kde):
def __init__(self, dataset, covariance):
self.covariance = covariance
stats.gaussian_kde.__init__(self, dataset)
def _compute_covariance(self):
self.inv_cov = np.linalg.inv(self.covariance)
self._norm_factor = np.sqrt(np.linalg.det(2 * np.pi * self.covariance))
class _kde_subclass4(stats.gaussian_kde):
def covariance_factor(self):
return 0.5 * self.silverman_factor()
def test_gaussian_kde_subclassing():
x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
xs = np.linspace(-10, 10, num=50)
# gaussian_kde itself
kde = stats.gaussian_kde(x1)
ys = kde(xs)
# subclass 1
kde1 = _kde_subclass1(x1)
y1 = kde1(xs)
assert_array_almost_equal_nulp(ys, y1, nulp=10)
# subclass 2
kde2 = _kde_subclass2(x1)
y2 = kde2(xs)
assert_array_almost_equal_nulp(ys, y2, nulp=10)
# subclass 3
kde3 = _kde_subclass3(x1, kde.covariance)
y3 = kde3(xs)
assert_array_almost_equal_nulp(ys, y3, nulp=10)
# subclass 4
kde4 = _kde_subclass4(x1)
y4 = kde4(x1)
y_expected = [0.06292987, 0.06346938, 0.05860291, 0.08657652, 0.07904017]
assert_array_almost_equal(y_expected, y4, decimal=6)
# Not a subclass, but check for use of _compute_covariance()
kde5 = kde
kde5.covariance_factor = lambda: kde.factor
kde5._compute_covariance()
y5 = kde5(xs)
assert_array_almost_equal_nulp(ys, y5, nulp=10)
def test_gaussian_kde_covariance_caching():
x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
xs = np.linspace(-10, 10, num=5)
# These expected values are from scipy 0.10, before some changes to
# gaussian_kde. They were not compared with any external reference.
y_expected = [0.02463386, 0.04689208, 0.05395444, 0.05337754, 0.01664475]
# Set the bandwidth, then reset it to the default.
kde = stats.gaussian_kde(x1)
kde.set_bandwidth(bw_method=0.5)
kde.set_bandwidth(bw_method='scott')
y2 = kde(xs)
assert_array_almost_equal(y_expected, y2, decimal=7)
def test_gaussian_kde_monkeypatch():
"""Ugly, but people may rely on this. See scipy pull request 123,
specifically the linked ML thread "Width of the Gaussian in stats.kde".
If it is necessary to break this later on, that is to be discussed on ML.
"""
x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
xs = np.linspace(-10, 10, num=50)
# The old monkeypatched version to get at Silverman's Rule.
kde = stats.gaussian_kde(x1)
kde.covariance_factor = kde.silverman_factor
kde._compute_covariance()
y1 = kde(xs)
# The new saner version.
kde2 = stats.gaussian_kde(x1, bw_method='silverman')
y2 = kde2(xs)
assert_array_almost_equal_nulp(y1, y2, nulp=10)
def test_kde_integer_input():
"""Regression test for #1181."""
x1 = np.arange(5)
kde = stats.gaussian_kde(x1)
y_expected = [0.13480721, 0.18222869, 0.19514935, 0.18222869, 0.13480721]
assert_array_almost_equal(kde(x1), y_expected, decimal=6)
_ftypes = ['float32', 'float64', 'float96', 'float128', 'int32', 'int64']
@pytest.mark.parametrize("bw_type", _ftypes + ["scott", "silverman"])
@pytest.mark.parametrize("weights_type", _ftypes)
@pytest.mark.parametrize("dataset_type", _ftypes)
@pytest.mark.parametrize("point_type", _ftypes)
def test_kde_output_dtype(point_type, dataset_type, weights_type, bw_type):
# Check whether the datatypes are available
point_type = getattr(np, point_type, None)
dataset_type = getattr(np, weights_type, None)
weights_type = getattr(np, weights_type, None)
if bw_type in ["scott", "silverman"]:
bw = bw_type
else:
bw_type = getattr(np, bw_type, None)
bw = bw_type(3) if bw_type else None
if any(dt is None for dt in [point_type, dataset_type, weights_type, bw]):
pytest.skip()
weights = np.arange(5, dtype=weights_type)
dataset = np.arange(5, dtype=dataset_type)
k = stats.kde.gaussian_kde(dataset, bw_method=bw, weights=weights)
points = np.arange(5, dtype=point_type)
result = k(points)
# weights are always cast to float64
assert result.dtype == np.result_type(dataset, points, np.float64(weights),
k.factor)
def test_pdf_logpdf():
np.random.seed(1)
n_basesample = 50
xn = np.random.randn(n_basesample)
# Default
gkde = stats.gaussian_kde(xn)
xs = np.linspace(-15, 12, 25)
pdf = gkde.evaluate(xs)
pdf2 = gkde.pdf(xs)
assert_almost_equal(pdf, pdf2, decimal=12)
logpdf = np.log(pdf)
logpdf2 = gkde.logpdf(xs)
assert_almost_equal(logpdf, logpdf2, decimal=12)
# There are more points than data
gkde = stats.gaussian_kde(xs)
pdf = np.log(gkde.evaluate(xn))
pdf2 = gkde.logpdf(xn)
assert_almost_equal(pdf, pdf2, decimal=12)
def test_pdf_logpdf_weighted():
np.random.seed(1)
n_basesample = 50
xn = np.random.randn(n_basesample)
wn = np.random.rand(n_basesample)
# Default
gkde = stats.gaussian_kde(xn, weights=wn)
xs = np.linspace(-15, 12, 25)
pdf = gkde.evaluate(xs)
pdf2 = gkde.pdf(xs)
assert_almost_equal(pdf, pdf2, decimal=12)
logpdf = np.log(pdf)
logpdf2 = gkde.logpdf(xs)
assert_almost_equal(logpdf, logpdf2, decimal=12)
# There are more points than data
gkde = stats.gaussian_kde(xs, weights=np.random.rand(len(xs)))
pdf = np.log(gkde.evaluate(xn))
pdf2 = gkde.logpdf(xn)
assert_almost_equal(pdf, pdf2, decimal=12)
def test_weights_intact():
# regression test for gh-9709: weights are not modified
np.random.seed(12345)
vals = np.random.lognormal(size=100)
weights = np.random.choice([1.0, 10.0, 100], size=vals.size)
orig_weights = weights.copy()
stats.gaussian_kde(np.log10(vals), weights=weights)
assert_allclose(weights, orig_weights, atol=1e-14, rtol=1e-14)
def test_weights_integer():
# integer weights are OK, cf gh-9709 (comment)
np.random.seed(12345)
values = [0.2, 13.5, 21.0, 75.0, 99.0]
weights = [1, 2, 4, 8, 16] # a list of integers
pdf_i = stats.gaussian_kde(values, weights=weights)
pdf_f = stats.gaussian_kde(values, weights=np.float64(weights))
xn = [0.3, 11, 88]
assert_allclose(pdf_i.evaluate(xn),
pdf_f.evaluate(xn), atol=1e-14, rtol=1e-14)
def test_seed():
# Test the seed option of the resample method
def test_seed_sub(gkde_trail):
n_sample = 200
# The results should be different without using seed
samp1 = gkde_trail.resample(n_sample)
samp2 = gkde_trail.resample(n_sample)
assert_raises(
AssertionError, assert_allclose, samp1, samp2, atol=1e-13
)
# Use integer seed
seed = 831
samp1 = gkde_trail.resample(n_sample, seed=seed)
samp2 = gkde_trail.resample(n_sample, seed=seed)
assert_allclose(samp1, samp2, atol=1e-13)
# Use RandomState
rstate1 = np.random.RandomState(seed=138)
samp1 = gkde_trail.resample(n_sample, seed=rstate1)
rstate2 = np.random.RandomState(seed=138)
samp2 = gkde_trail.resample(n_sample, seed=rstate2)
assert_allclose(samp1, samp2, atol=1e-13)
# check that np.random.Generator can be used (numpy >= 1.17)
if hasattr(np.random, 'default_rng'):
# obtain a np.random.Generator object
rng = np.random.default_rng(1234)
gkde_trail.resample(n_sample, seed=rng)
np.random.seed(8765678)
n_basesample = 500
wn = np.random.rand(n_basesample)
# Test 1D case
xn_1d = np.random.randn(n_basesample)
gkde_1d = stats.gaussian_kde(xn_1d)
test_seed_sub(gkde_1d)
gkde_1d_weighted = stats.gaussian_kde(xn_1d, weights=wn)
test_seed_sub(gkde_1d_weighted)
# Test 2D case
mean = np.array([1.0, 3.0])
covariance = np.array([[1.0, 2.0], [2.0, 6.0]])
xn_2d = np.random.multivariate_normal(mean, covariance, size=n_basesample).T
gkde_2d = stats.gaussian_kde(xn_2d)
test_seed_sub(gkde_2d)
gkde_2d_weighted = stats.gaussian_kde(xn_2d, weights=wn)
test_seed_sub(gkde_2d_weighted)

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,134 @@
import numpy as np
import numpy.ma as ma
import scipy.stats.mstats as ms
from numpy.testing import (assert_equal, assert_almost_equal, assert_,
assert_allclose)
def test_compare_medians_ms():
x = np.arange(7)
y = x + 10
assert_almost_equal(ms.compare_medians_ms(x, y), 0)
y2 = np.linspace(0, 1, num=10)
assert_almost_equal(ms.compare_medians_ms(x, y2), 0.017116406778)
def test_hdmedian():
# 1-D array
x = ma.arange(11)
assert_allclose(ms.hdmedian(x), 5, rtol=1e-14)
x.mask = ma.make_mask(x)
x.mask[:7] = False
assert_allclose(ms.hdmedian(x), 3, rtol=1e-14)
# Check that `var` keyword returns a value. TODO: check whether returned
# value is actually correct.
assert_(ms.hdmedian(x, var=True).size == 2)
# 2-D array
x2 = ma.arange(22).reshape((11, 2))
assert_allclose(ms.hdmedian(x2, axis=0), [10, 11])
x2.mask = ma.make_mask(x2)
x2.mask[:7, :] = False
assert_allclose(ms.hdmedian(x2, axis=0), [6, 7])
def test_rsh():
np.random.seed(132345)
x = np.random.randn(100)
res = ms.rsh(x)
# Just a sanity check that the code runs and output shape is correct.
# TODO: check that implementation is correct.
assert_(res.shape == x.shape)
# Check points keyword
res = ms.rsh(x, points=[0, 1.])
assert_(res.size == 2)
def test_mjci():
# Tests the Marits-Jarrett estimator
data = ma.array([77, 87, 88,114,151,210,219,246,253,262,
296,299,306,376,428,515,666,1310,2611])
assert_almost_equal(ms.mjci(data),[55.76819,45.84028,198.87875],5)
def test_trimmed_mean_ci():
# Tests the confidence intervals of the trimmed mean.
data = ma.array([545,555,558,572,575,576,578,580,
594,605,635,651,653,661,666])
assert_almost_equal(ms.trimmed_mean(data,0.2), 596.2, 1)
assert_equal(np.round(ms.trimmed_mean_ci(data,(0.2,0.2)),1),
[561.8, 630.6])
def test_idealfourths():
# Tests ideal-fourths
test = np.arange(100)
assert_almost_equal(np.asarray(ms.idealfourths(test)),
[24.416667,74.583333],6)
test_2D = test.repeat(3).reshape(-1,3)
assert_almost_equal(ms.idealfourths(test_2D, axis=0),
[[24.416667,24.416667,24.416667],
[74.583333,74.583333,74.583333]],6)
assert_almost_equal(ms.idealfourths(test_2D, axis=1),
test.repeat(2).reshape(-1,2))
test = [0, 0]
_result = ms.idealfourths(test)
assert_(np.isnan(_result).all())
class TestQuantiles(object):
data = [0.706560797,0.727229578,0.990399276,0.927065621,0.158953014,
0.887764025,0.239407086,0.349638551,0.972791145,0.149789972,
0.936947700,0.132359948,0.046041972,0.641675031,0.945530547,
0.224218684,0.771450991,0.820257774,0.336458052,0.589113496,
0.509736129,0.696838829,0.491323573,0.622767425,0.775189248,
0.641461450,0.118455200,0.773029450,0.319280007,0.752229111,
0.047841438,0.466295911,0.583850781,0.840581845,0.550086491,
0.466470062,0.504765074,0.226855960,0.362641207,0.891620942,
0.127898691,0.490094097,0.044882048,0.041441695,0.317976349,
0.504135618,0.567353033,0.434617473,0.636243375,0.231803616,
0.230154113,0.160011327,0.819464108,0.854706985,0.438809221,
0.487427267,0.786907310,0.408367937,0.405534192,0.250444460,
0.995309248,0.144389588,0.739947527,0.953543606,0.680051621,
0.388382017,0.863530727,0.006514031,0.118007779,0.924024803,
0.384236354,0.893687694,0.626534881,0.473051932,0.750134705,
0.241843555,0.432947602,0.689538104,0.136934797,0.150206859,
0.474335206,0.907775349,0.525869295,0.189184225,0.854284286,
0.831089744,0.251637345,0.587038213,0.254475554,0.237781276,
0.827928620,0.480283781,0.594514455,0.213641488,0.024194386,
0.536668589,0.699497811,0.892804071,0.093835427,0.731107772]
def test_hdquantiles(self):
data = self.data
assert_almost_equal(ms.hdquantiles(data,[0., 1.]),
[0.006514031, 0.995309248])
hdq = ms.hdquantiles(data,[0.25, 0.5, 0.75])
assert_almost_equal(hdq, [0.253210762, 0.512847491, 0.762232442,])
hdq = ms.hdquantiles_sd(data,[0.25, 0.5, 0.75])
assert_almost_equal(hdq, [0.03786954, 0.03805389, 0.03800152,], 4)
data = np.array(data).reshape(10,10)
hdq = ms.hdquantiles(data,[0.25,0.5,0.75],axis=0)
assert_almost_equal(hdq[:,0], ms.hdquantiles(data[:,0],[0.25,0.5,0.75]))
assert_almost_equal(hdq[:,-1], ms.hdquantiles(data[:,-1],[0.25,0.5,0.75]))
hdq = ms.hdquantiles(data,[0.25,0.5,0.75],axis=0,var=True)
assert_almost_equal(hdq[...,0],
ms.hdquantiles(data[:,0],[0.25,0.5,0.75],var=True))
assert_almost_equal(hdq[...,-1],
ms.hdquantiles(data[:,-1],[0.25,0.5,0.75], var=True))
def test_hdquantiles_sd(self):
# Only test that code runs, implementation not checked for correctness
res = ms.hdquantiles_sd(self.data)
assert_(res.size == 3)
def test_mquantiles_cimj(self):
# Only test that code runs, implementation not checked for correctness
ci_lower, ci_upper = ms.mquantiles_cimj(self.data)
assert_(ci_lower.size == ci_upper.size == 3)

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,240 @@
import numpy as np
from numpy.testing import assert_equal, assert_array_equal
from scipy.stats import rankdata, tiecorrect
import pytest
class TestTieCorrect(object):
def test_empty(self):
"""An empty array requires no correction, should return 1.0."""
ranks = np.array([], dtype=np.float64)
c = tiecorrect(ranks)
assert_equal(c, 1.0)
def test_one(self):
"""A single element requires no correction, should return 1.0."""
ranks = np.array([1.0], dtype=np.float64)
c = tiecorrect(ranks)
assert_equal(c, 1.0)
def test_no_correction(self):
"""Arrays with no ties require no correction."""
ranks = np.arange(2.0)
c = tiecorrect(ranks)
assert_equal(c, 1.0)
ranks = np.arange(3.0)
c = tiecorrect(ranks)
assert_equal(c, 1.0)
def test_basic(self):
"""Check a few basic examples of the tie correction factor."""
# One tie of two elements
ranks = np.array([1.0, 2.5, 2.5])
c = tiecorrect(ranks)
T = 2.0
N = ranks.size
expected = 1.0 - (T**3 - T) / (N**3 - N)
assert_equal(c, expected)
# One tie of two elements (same as above, but tie is not at the end)
ranks = np.array([1.5, 1.5, 3.0])
c = tiecorrect(ranks)
T = 2.0
N = ranks.size
expected = 1.0 - (T**3 - T) / (N**3 - N)
assert_equal(c, expected)
# One tie of three elements
ranks = np.array([1.0, 3.0, 3.0, 3.0])
c = tiecorrect(ranks)
T = 3.0
N = ranks.size
expected = 1.0 - (T**3 - T) / (N**3 - N)
assert_equal(c, expected)
# Two ties, lengths 2 and 3.
ranks = np.array([1.5, 1.5, 4.0, 4.0, 4.0])
c = tiecorrect(ranks)
T1 = 2.0
T2 = 3.0
N = ranks.size
expected = 1.0 - ((T1**3 - T1) + (T2**3 - T2)) / (N**3 - N)
assert_equal(c, expected)
def test_overflow(self):
ntie, k = 2000, 5
a = np.repeat(np.arange(k), ntie)
n = a.size # ntie * k
out = tiecorrect(rankdata(a))
assert_equal(out, 1.0 - k * (ntie**3 - ntie) / float(n**3 - n))
class TestRankData(object):
def test_empty(self):
"""stats.rankdata([]) should return an empty array."""
a = np.array([], dtype=int)
r = rankdata(a)
assert_array_equal(r, np.array([], dtype=np.float64))
r = rankdata([])
assert_array_equal(r, np.array([], dtype=np.float64))
def test_one(self):
"""Check stats.rankdata with an array of length 1."""
data = [100]
a = np.array(data, dtype=int)
r = rankdata(a)
assert_array_equal(r, np.array([1.0], dtype=np.float64))
r = rankdata(data)
assert_array_equal(r, np.array([1.0], dtype=np.float64))
def test_basic(self):
"""Basic tests of stats.rankdata."""
data = [100, 10, 50]
expected = np.array([3.0, 1.0, 2.0], dtype=np.float64)
a = np.array(data, dtype=int)
r = rankdata(a)
assert_array_equal(r, expected)
r = rankdata(data)
assert_array_equal(r, expected)
data = [40, 10, 30, 10, 50]
expected = np.array([4.0, 1.5, 3.0, 1.5, 5.0], dtype=np.float64)
a = np.array(data, dtype=int)
r = rankdata(a)
assert_array_equal(r, expected)
r = rankdata(data)
assert_array_equal(r, expected)
data = [20, 20, 20, 10, 10, 10]
expected = np.array([5.0, 5.0, 5.0, 2.0, 2.0, 2.0], dtype=np.float64)
a = np.array(data, dtype=int)
r = rankdata(a)
assert_array_equal(r, expected)
r = rankdata(data)
assert_array_equal(r, expected)
# The docstring states explicitly that the argument is flattened.
a2d = a.reshape(2, 3)
r = rankdata(a2d)
assert_array_equal(r, expected)
def test_rankdata_object_string(self):
min_rank = lambda a: [1 + sum(i < j for i in a) for j in a]
max_rank = lambda a: [sum(i <= j for i in a) for j in a]
ordinal_rank = lambda a: min_rank([(x, i) for i, x in enumerate(a)])
def average_rank(a):
return [(i + j) / 2.0 for i, j in zip(min_rank(a), max_rank(a))]
def dense_rank(a):
b = np.unique(a)
return [1 + sum(i < j for i in b) for j in a]
rankf = dict(min=min_rank, max=max_rank, ordinal=ordinal_rank,
average=average_rank, dense=dense_rank)
def check_ranks(a):
for method in 'min', 'max', 'dense', 'ordinal', 'average':
out = rankdata(a, method=method)
assert_array_equal(out, rankf[method](a))
val = ['foo', 'bar', 'qux', 'xyz', 'abc', 'efg', 'ace', 'qwe', 'qaz']
check_ranks(np.random.choice(val, 200))
check_ranks(np.random.choice(val, 200).astype('object'))
val = np.array([0, 1, 2, 2.718, 3, 3.141], dtype='object')
check_ranks(np.random.choice(val, 200).astype('object'))
def test_large_int(self):
data = np.array([2**60, 2**60+1], dtype=np.uint64)
r = rankdata(data)
assert_array_equal(r, [1.0, 2.0])
data = np.array([2**60, 2**60+1], dtype=np.int64)
r = rankdata(data)
assert_array_equal(r, [1.0, 2.0])
data = np.array([2**60, -2**60+1], dtype=np.int64)
r = rankdata(data)
assert_array_equal(r, [2.0, 1.0])
def test_big_tie(self):
for n in [10000, 100000, 1000000]:
data = np.ones(n, dtype=int)
r = rankdata(data)
expected_rank = 0.5 * (n + 1)
assert_array_equal(r, expected_rank * data,
"test failed with n=%d" % n)
def test_axis(self):
data = [[0, 2, 1],
[4, 2, 2]]
expected0 = [[1., 1.5, 1.],
[2., 1.5, 2.]]
r0 = rankdata(data, axis=0)
assert_array_equal(r0, expected0)
expected1 = [[1., 3., 2.],
[3., 1.5, 1.5]]
r1 = rankdata(data, axis=1)
assert_array_equal(r1, expected1)
methods = ["average", "min", "max", "dense", "ordinal"]
dtypes = [np.float64] + [np.int_]*4
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("method, dtype", zip(methods, dtypes))
def test_size_0_axis(self, axis, method, dtype):
shape = (3, 0)
data = np.zeros(shape)
r = rankdata(data, method=method, axis=axis)
assert_equal(r.shape, shape)
assert_equal(r.dtype, dtype)
_cases = (
# values, method, expected
([], 'average', []),
([], 'min', []),
([], 'max', []),
([], 'dense', []),
([], 'ordinal', []),
#
([100], 'average', [1.0]),
([100], 'min', [1.0]),
([100], 'max', [1.0]),
([100], 'dense', [1.0]),
([100], 'ordinal', [1.0]),
#
([100, 100, 100], 'average', [2.0, 2.0, 2.0]),
([100, 100, 100], 'min', [1.0, 1.0, 1.0]),
([100, 100, 100], 'max', [3.0, 3.0, 3.0]),
([100, 100, 100], 'dense', [1.0, 1.0, 1.0]),
([100, 100, 100], 'ordinal', [1.0, 2.0, 3.0]),
#
([100, 300, 200], 'average', [1.0, 3.0, 2.0]),
([100, 300, 200], 'min', [1.0, 3.0, 2.0]),
([100, 300, 200], 'max', [1.0, 3.0, 2.0]),
([100, 300, 200], 'dense', [1.0, 3.0, 2.0]),
([100, 300, 200], 'ordinal', [1.0, 3.0, 2.0]),
#
([100, 200, 300, 200], 'average', [1.0, 2.5, 4.0, 2.5]),
([100, 200, 300, 200], 'min', [1.0, 2.0, 4.0, 2.0]),
([100, 200, 300, 200], 'max', [1.0, 3.0, 4.0, 3.0]),
([100, 200, 300, 200], 'dense', [1.0, 2.0, 3.0, 2.0]),
([100, 200, 300, 200], 'ordinal', [1.0, 2.0, 4.0, 3.0]),
#
([100, 200, 300, 200, 100], 'average', [1.5, 3.5, 5.0, 3.5, 1.5]),
([100, 200, 300, 200, 100], 'min', [1.0, 3.0, 5.0, 3.0, 1.0]),
([100, 200, 300, 200, 100], 'max', [2.0, 4.0, 5.0, 4.0, 2.0]),
([100, 200, 300, 200, 100], 'dense', [1.0, 2.0, 3.0, 2.0, 1.0]),
([100, 200, 300, 200, 100], 'ordinal', [1.0, 3.0, 5.0, 4.0, 2.0]),
#
([10] * 30, 'ordinal', np.arange(1.0, 31.0)),
)
def test_cases():
for values, method, expected in _cases:
r = rankdata(values, method=method)
assert_array_equal(r, expected)

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,86 @@
import numpy as np
from numpy.testing import assert_allclose, assert_equal
from scipy.stats._tukeylambda_stats import (tukeylambda_variance,
tukeylambda_kurtosis)
def test_tukeylambda_stats_known_exact():
"""Compare results with some known exact formulas."""
# Some exact values of the Tukey Lambda variance and kurtosis:
# lambda var kurtosis
# 0 pi**2/3 6/5 (logistic distribution)
# 0.5 4 - pi (5/3 - pi/2)/(pi/4 - 1)**2 - 3
# 1 1/3 -6/5 (uniform distribution on (-1,1))
# 2 1/12 -6/5 (uniform distribution on (-1/2, 1/2))
# lambda = 0
var = tukeylambda_variance(0)
assert_allclose(var, np.pi**2 / 3, atol=1e-12)
kurt = tukeylambda_kurtosis(0)
assert_allclose(kurt, 1.2, atol=1e-10)
# lambda = 0.5
var = tukeylambda_variance(0.5)
assert_allclose(var, 4 - np.pi, atol=1e-12)
kurt = tukeylambda_kurtosis(0.5)
desired = (5./3 - np.pi/2) / (np.pi/4 - 1)**2 - 3
assert_allclose(kurt, desired, atol=1e-10)
# lambda = 1
var = tukeylambda_variance(1)
assert_allclose(var, 1.0 / 3, atol=1e-12)
kurt = tukeylambda_kurtosis(1)
assert_allclose(kurt, -1.2, atol=1e-10)
# lambda = 2
var = tukeylambda_variance(2)
assert_allclose(var, 1.0 / 12, atol=1e-12)
kurt = tukeylambda_kurtosis(2)
assert_allclose(kurt, -1.2, atol=1e-10)
def test_tukeylambda_stats_mpmath():
"""Compare results with some values that were computed using mpmath."""
a10 = dict(atol=1e-10, rtol=0)
a12 = dict(atol=1e-12, rtol=0)
data = [
# lambda variance kurtosis
[-0.1, 4.78050217874253547, 3.78559520346454510],
[-0.0649, 4.16428023599895777, 2.52019675947435718],
[-0.05, 3.93672267890775277, 2.13129793057777277],
[-0.001, 3.30128380390964882, 1.21452460083542988],
[0.001, 3.27850775649572176, 1.18560634779287585],
[0.03125, 2.95927803254615800, 0.804487555161819980],
[0.05, 2.78281053405464501, 0.611604043886644327],
[0.0649, 2.65282386754100551, 0.476834119532774540],
[1.2, 0.242153920578588346, -1.23428047169049726],
[10.0, 0.00095237579757703597, 2.37810697355144933],
[20.0, 0.00012195121951131043, 7.37654321002709531],
]
for lam, var_expected, kurt_expected in data:
var = tukeylambda_variance(lam)
assert_allclose(var, var_expected, **a12)
kurt = tukeylambda_kurtosis(lam)
assert_allclose(kurt, kurt_expected, **a10)
# Test with vector arguments (most of the other tests are for single
# values).
lam, var_expected, kurt_expected = zip(*data)
var = tukeylambda_variance(lam)
assert_allclose(var, var_expected, **a12)
kurt = tukeylambda_kurtosis(lam)
assert_allclose(kurt, kurt_expected, **a10)
def test_tukeylambda_stats_invalid():
"""Test values of lambda outside the domains of the functions."""
lam = [-1.0, -0.5]
var = tukeylambda_variance(lam)
assert_equal(var, np.array([np.nan, np.inf]))
lam = [-1.0, -0.25]
kurt = tukeylambda_kurtosis(lam)
assert_equal(kurt, np.array([np.nan, np.inf]))