Uploaded test files

parent f584ad9d97
commit 2e81cb7d99

16627 changed files with 2065359 additions and 102444 deletions
0
venv/Lib/site-packages/sklearn/utils/tests/__init__.py
Normal file
Binary files not shown.
10
venv/Lib/site-packages/sklearn/utils/tests/conftest.py
Normal file
@@ -0,0 +1,10 @@
import pytest

import sklearn


@pytest.fixture
def print_changed_only_false():
    sklearn.set_config(print_changed_only=False)
    yield
    sklearn.set_config(print_changed_only=True)  # reset to default
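The fixture above temporarily switches the global print_changed_only flag off and restores the default afterwards. A minimal sketch of how a test would opt in, simply by naming the fixture as an argument (the test function and assertion below are illustrative, not part of this commit):

from sklearn.linear_model import LogisticRegression


def test_repr_shows_defaults(print_changed_only_false):
    # With print_changed_only=False, repr() lists default parameters too,
    # so unchanged ones like `penalty` become visible in the output.
    assert "penalty" in repr(LogisticRegression())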
266
venv/Lib/site-packages/sklearn/utils/tests/test_class_weight.py
Normal file
@@ -0,0 +1,266 @@
import numpy as np
import pytest

from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression

from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_almost_equal


def test_compute_class_weight():
    # Test (and demo) compute_class_weight.
    y = np.asarray([2, 2, 2, 3, 3, 4])
    classes = np.unique(y)

    cw = compute_class_weight("balanced", classes=classes, y=y)
    # total effect of samples is preserved
    class_counts = np.bincount(y)[2:]
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert cw[0] < cw[1] < cw[2]


def test_compute_class_weight_not_present():
    # Raise error when y does not contain all class labels
    classes = np.arange(4)
    y = np.asarray([0, 0, 0, 1, 1, 2])
    with pytest.raises(ValueError):
        compute_class_weight("balanced", classes=classes, y=y)
    # Fix exception in error message formatting when missing label is a string
    # https://github.com/scikit-learn/scikit-learn/issues/8312
    with pytest.raises(ValueError,
                       match="Class label label_not_present not present"):
        compute_class_weight({"label_not_present": 1.}, classes=classes, y=y)
    # Raise error when y has items not in classes
    classes = np.arange(2)
    with pytest.raises(ValueError):
        compute_class_weight("balanced", classes=classes, y=y)
    with pytest.raises(ValueError):
        compute_class_weight({0: 1., 1: 2.}, classes=classes, y=y)


def test_compute_class_weight_dict():
    classes = np.arange(3)
    class_weights = {0: 1.0, 1: 2.0, 2: 3.0}
    y = np.asarray([0, 0, 1, 2])
    cw = compute_class_weight(class_weights, classes=classes, y=y)

    # When the user specifies class weights, compute_class_weights should just
    # return them.
    assert_array_almost_equal(np.asarray([1.0, 2.0, 3.0]), cw)

    # When a class weight is specified that isn't in classes, a ValueError
    # should get raised
    msg = 'Class label 4 not present.'
    class_weights = {0: 1.0, 1: 2.0, 2: 3.0, 4: 1.5}
    with pytest.raises(ValueError, match=msg):
        compute_class_weight(class_weights, classes=classes, y=y)

    msg = 'Class label -1 not present.'
    class_weights = {-1: 5.0, 0: 1.0, 1: 2.0, 2: 3.0}
    with pytest.raises(ValueError, match=msg):
        compute_class_weight(class_weights, classes=classes, y=y)


def test_compute_class_weight_invariance():
    # Test that results with class_weight="balanced" are invariant wrt
    # class imbalance if the number of samples is identical.
    # The test uses a balanced two class dataset with 100 datapoints.
    # It creates three versions, one where class 1 is duplicated
    # resulting in 150 points of class 1 and 50 of class 0,
    # one where there are 50 points in class 1 and 150 in class 0,
    # and one where there are 100 points of each class (this one is balanced
    # again).
    # With balancing class weights, all three should give the same model.
    X, y = make_blobs(centers=2, random_state=0)
    # create dataset where class 1 is duplicated twice
    X_1 = np.vstack([X] + [X[y == 1]] * 2)
    y_1 = np.hstack([y] + [y[y == 1]] * 2)
    # create dataset where class 0 is duplicated twice
    X_0 = np.vstack([X] + [X[y == 0]] * 2)
    y_0 = np.hstack([y] + [y[y == 0]] * 2)
    # duplicate everything
    X_ = np.vstack([X] * 2)
    y_ = np.hstack([y] * 2)
    # results should be identical
    logreg1 = LogisticRegression(class_weight="balanced").fit(X_1, y_1)
    logreg0 = LogisticRegression(class_weight="balanced").fit(X_0, y_0)
    logreg = LogisticRegression(class_weight="balanced").fit(X_, y_)
    assert_array_almost_equal(logreg1.coef_, logreg0.coef_)
    assert_array_almost_equal(logreg.coef_, logreg0.coef_)


def test_compute_class_weight_balanced_negative():
    # Test compute_class_weight when labels are negative
    # Test with balanced class labels.
    classes = np.array([-2, -1, 0])
    y = np.asarray([-1, -1, 0, 0, -2, -2])

    cw = compute_class_weight("balanced", classes=classes, y=y)
    assert len(cw) == len(classes)
    assert_array_almost_equal(cw, np.array([1., 1., 1.]))

    # Test with unbalanced class labels.
    y = np.asarray([-1, 0, 0, -2, -2, -2])

    cw = compute_class_weight("balanced", classes=classes, y=y)
    assert len(cw) == len(classes)
    class_counts = np.bincount(y + 2)
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert_array_almost_equal(cw, [2. / 3, 2., 1.])


def test_compute_class_weight_balanced_unordered():
    # Test compute_class_weight when classes are unordered
    classes = np.array([1, 0, 3])
    y = np.asarray([1, 0, 0, 3, 3, 3])

    cw = compute_class_weight("balanced", classes=classes, y=y)
    class_counts = np.bincount(y)[classes]
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert_array_almost_equal(cw, [2., 1., 2. / 3])


def test_compute_class_weight_default():
    # Test for the case where no weight is given for a present class.
    # Current behaviour is to assign the unweighted classes a weight of 1.
    y = np.asarray([2, 2, 2, 3, 3, 4])
    classes = np.unique(y)
    classes_len = len(classes)

    # Test for non specified weights
    cw = compute_class_weight(None, classes=classes, y=y)
    assert len(cw) == classes_len
    assert_array_almost_equal(cw, np.ones(3))

    # Tests for partly specified weights
    cw = compute_class_weight({2: 1.5}, classes=classes, y=y)
    assert len(cw) == classes_len
    assert_array_almost_equal(cw, [1.5, 1., 1.])

    cw = compute_class_weight({2: 1.5, 4: 0.5}, classes=classes, y=y)
    assert len(cw) == classes_len
    assert_array_almost_equal(cw, [1.5, 1., 0.5])


def test_compute_sample_weight():
    # Test (and demo) compute_sample_weight.
    # Test with balanced classes
    y = np.asarray([1, 1, 1, 2, 2, 2])
    sample_weight = compute_sample_weight("balanced", y)
    assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])

    # Test with user-defined weights
    sample_weight = compute_sample_weight({1: 2, 2: 1}, y)
    assert_array_almost_equal(sample_weight, [2., 2., 2., 1., 1., 1.])

    # Test with column vector of balanced classes
    y = np.asarray([[1], [1], [1], [2], [2], [2]])
    sample_weight = compute_sample_weight("balanced", y)
    assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])

    # Test with unbalanced classes
    y = np.asarray([1, 1, 1, 2, 2, 2, 3])
    sample_weight = compute_sample_weight("balanced", y)
    expected_balanced = np.array([0.7777, 0.7777, 0.7777, 0.7777, 0.7777,
                                  0.7777, 2.3333])
    assert_array_almost_equal(sample_weight, expected_balanced, decimal=4)

    # Test with `None` weights
    sample_weight = compute_sample_weight(None, y)
    assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 1.])

    # Test with multi-output of balanced classes
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    sample_weight = compute_sample_weight("balanced", y)
    assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])

    # Test with multi-output with user-defined weights
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    sample_weight = compute_sample_weight([{1: 2, 2: 1}, {0: 1, 1: 2}], y)
    assert_array_almost_equal(sample_weight, [2., 2., 2., 2., 2., 2.])

    # Test with multi-output of unbalanced classes
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [3, -1]])
    sample_weight = compute_sample_weight("balanced", y)
    assert_array_almost_equal(sample_weight, expected_balanced ** 2, decimal=3)


def test_compute_sample_weight_with_subsample():
    # Test compute_sample_weight with subsamples specified.
    # Test with balanced classes and all samples present
    y = np.asarray([1, 1, 1, 2, 2, 2])
    sample_weight = compute_sample_weight("balanced", y, indices=range(6))
    assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])

    # Test with column vector of balanced classes and all samples present
    y = np.asarray([[1], [1], [1], [2], [2], [2]])
    sample_weight = compute_sample_weight("balanced", y, indices=range(6))
    assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])

    # Test with a subsample
    y = np.asarray([1, 1, 1, 2, 2, 2])
    sample_weight = compute_sample_weight("balanced", y, indices=range(4))
    assert_array_almost_equal(sample_weight, [2. / 3, 2. / 3,
                                              2. / 3, 2., 2., 2.])

    # Test with a bootstrap subsample
    y = np.asarray([1, 1, 1, 2, 2, 2])
    sample_weight = compute_sample_weight("balanced", y,
                                          indices=[0, 1, 1, 2, 2, 3])
    expected_balanced = np.asarray([0.6, 0.6, 0.6, 3., 3., 3.])
    assert_array_almost_equal(sample_weight, expected_balanced)

    # Test with a bootstrap subsample for multi-output
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    sample_weight = compute_sample_weight("balanced", y,
                                          indices=[0, 1, 1, 2, 2, 3])
    assert_array_almost_equal(sample_weight, expected_balanced ** 2)

    # Test with a missing class
    y = np.asarray([1, 1, 1, 2, 2, 2, 3])
    sample_weight = compute_sample_weight("balanced", y, indices=range(6))
    assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.])

    # Test with a missing class for multi-output
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]])
    sample_weight = compute_sample_weight("balanced", y, indices=range(6))
    assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.])


def test_compute_sample_weight_errors():
    # Test compute_sample_weight raises errors expected.
    # Invalid preset string
    y = np.asarray([1, 1, 1, 2, 2, 2])
    y_ = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])

    with pytest.raises(ValueError):
        compute_sample_weight("ni", y)
    with pytest.raises(ValueError):
        compute_sample_weight("ni", y, indices=range(4))
    with pytest.raises(ValueError):
        compute_sample_weight("ni", y_)
    with pytest.raises(ValueError):
        compute_sample_weight("ni", y_, indices=range(4))

    # Not "balanced" for subsample
    with pytest.raises(ValueError):
        compute_sample_weight({1: 2, 2: 1}, y, indices=range(4))

    # Not a list or preset for multi-output
    with pytest.raises(ValueError):
        compute_sample_weight({1: 2, 2: 1}, y_)

    # Incorrect length list for multi-output
    with pytest.raises(ValueError):
        compute_sample_weight([{1: 2, 2: 1}], y_)


def test_compute_sample_weight_more_than_32():
    # Non-regression smoke test for #12146
    y = np.arange(50)  # more than 32 distinct classes
    indices = np.arange(50)  # use subsampling
    weight = compute_sample_weight('balanced', y, indices=indices)
    assert_array_almost_equal(weight, np.ones(y.shape[0]))
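The "balanced" heuristic these tests pin down weights each class by n_samples / (n_classes * class_count), so rarer classes get proportionally larger weights and the weighted counts sum back to n_samples. A quick sketch recomputing it by hand, assuming the same toy y as above:

import numpy as np

from sklearn.utils.class_weight import compute_class_weight

y = np.asarray([2, 2, 2, 3, 3, 4])
classes = np.unique(y)

# n_samples / (n_classes * per-class count): counts [3, 2, 1] -> [2/3, 1, 2]
counts = np.bincount(y)[classes]
manual = y.shape[0] / (len(classes) * counts)

assert np.allclose(manual,
                   compute_class_weight("balanced", classes=classes, y=y))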
229
venv/Lib/site-packages/sklearn/utils/tests/test_cython_blas.py
Normal file
@@ -0,0 +1,229 @@
import pytest

import numpy as np

from sklearn.utils._testing import assert_allclose
from sklearn.utils._cython_blas import _dot_memview
from sklearn.utils._cython_blas import _asum_memview
from sklearn.utils._cython_blas import _axpy_memview
from sklearn.utils._cython_blas import _nrm2_memview
from sklearn.utils._cython_blas import _copy_memview
from sklearn.utils._cython_blas import _scal_memview
from sklearn.utils._cython_blas import _rotg_memview
from sklearn.utils._cython_blas import _rot_memview
from sklearn.utils._cython_blas import _gemv_memview
from sklearn.utils._cython_blas import _ger_memview
from sklearn.utils._cython_blas import _gemm_memview
from sklearn.utils._cython_blas import RowMajor, ColMajor
from sklearn.utils._cython_blas import Trans, NoTrans


def _numpy_to_cython(dtype):
    cython = pytest.importorskip("cython")
    if dtype == np.float32:
        return cython.float
    elif dtype == np.float64:
        return cython.double


RTOL = {np.float32: 1e-6, np.float64: 1e-12}
ORDER = {RowMajor: 'C', ColMajor: 'F'}


def _no_op(x):
    return x


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_dot(dtype):
    dot = _dot_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(10).astype(dtype, copy=False)

    expected = x.dot(y)
    actual = dot(x, y)

    assert_allclose(actual, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_asum(dtype):
    asum = _asum_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)

    expected = np.abs(x).sum()
    actual = asum(x)

    assert_allclose(actual, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_axpy(dtype):
    axpy = _axpy_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(10).astype(dtype, copy=False)
    alpha = 2.5

    expected = alpha * x + y
    axpy(alpha, x, y)

    assert_allclose(y, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_nrm2(dtype):
    nrm2 = _nrm2_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)

    expected = np.linalg.norm(x)
    actual = nrm2(x)

    assert_allclose(actual, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_copy(dtype):
    copy = _copy_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = np.empty_like(x)

    expected = x.copy()
    copy(x, y)

    assert_allclose(y, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_scal(dtype):
    scal = _scal_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    alpha = 2.5

    expected = alpha * x
    scal(alpha, x)

    assert_allclose(x, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_rotg(dtype):
    rotg = _rotg_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    a = dtype(rng.randn())
    b = dtype(rng.randn())
    c, s = 0.0, 0.0

    def expected_rotg(a, b):
        roe = a if abs(a) > abs(b) else b
        if a == 0 and b == 0:
            c, s, r, z = (1, 0, 0, 0)
        else:
            r = np.sqrt(a**2 + b**2) * (1 if roe >= 0 else -1)
            c, s = a/r, b/r
            z = s if roe == a else (1 if c == 0 else 1 / c)
        return r, z, c, s

    expected = expected_rotg(a, b)
    actual = rotg(a, b, c, s)

    assert_allclose(actual, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_rot(dtype):
    rot = _rot_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(10).astype(dtype, copy=False)
    c = dtype(rng.randn())
    s = dtype(rng.randn())

    expected_x = c * x + s * y
    expected_y = c * y - s * x

    rot(x, y, c, s)

    assert_allclose(x, expected_x)
    assert_allclose(y, expected_y)


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("opA, transA",
                         [(_no_op, NoTrans), (np.transpose, Trans)],
                         ids=["NoTrans", "Trans"])
@pytest.mark.parametrize("order", [RowMajor, ColMajor],
                         ids=["RowMajor", "ColMajor"])
def test_gemv(dtype, opA, transA, order):
    gemv = _gemv_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    A = np.asarray(opA(rng.random_sample((20, 10)).astype(dtype, copy=False)),
                   order=ORDER[order])
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(20).astype(dtype, copy=False)
    alpha, beta = 2.5, -0.5

    expected = alpha * opA(A).dot(x) + beta * y
    gemv(transA, alpha, A, x, beta, y)

    assert_allclose(y, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("order", [RowMajor, ColMajor],
                         ids=["RowMajor", "ColMajor"])
def test_ger(dtype, order):
    ger = _ger_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(20).astype(dtype, copy=False)
    A = np.asarray(rng.random_sample((10, 20)).astype(dtype, copy=False),
                   order=ORDER[order])
    alpha = 2.5

    expected = alpha * np.outer(x, y) + A
    ger(alpha, x, y, A)

    assert_allclose(A, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("opB, transB",
                         [(_no_op, NoTrans), (np.transpose, Trans)],
                         ids=["NoTrans", "Trans"])
@pytest.mark.parametrize("opA, transA",
                         [(_no_op, NoTrans), (np.transpose, Trans)],
                         ids=["NoTrans", "Trans"])
@pytest.mark.parametrize("order", [RowMajor, ColMajor],
                         ids=["RowMajor", "ColMajor"])
def test_gemm(dtype, opA, transA, opB, transB, order):
    gemm = _gemm_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    A = np.asarray(opA(rng.random_sample((30, 10)).astype(dtype, copy=False)),
                   order=ORDER[order])
    B = np.asarray(opB(rng.random_sample((10, 20)).astype(dtype, copy=False)),
                   order=ORDER[order])
    C = np.asarray(rng.random_sample((30, 20)).astype(dtype, copy=False),
                   order=ORDER[order])
    alpha, beta = 2.5, -0.5

    expected = alpha * opA(A).dot(opB(B)) + beta * C
    gemm(transA, transB, alpha, A, B, beta, C)

    assert_allclose(C, expected, rtol=RTOL[dtype])
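Each test above checks an in-place private Cython BLAS wrapper against its NumPy reference. As a reading aid, a small sketch of the reference semantics being asserted (pure NumPy, so it does not touch the private _cython_blas module; the array shapes are arbitrary):

import numpy as np

rng = np.random.RandomState(0)
x, y = rng.random_sample(10), rng.random_sample(10)
A = rng.random_sample((3, 4))
B = rng.random_sample((4, 2))
C = rng.random_sample((3, 2))
alpha, beta = 2.5, -0.5

y_axpy = alpha * x + y                 # axpy: the wrapper overwrites y with this
x_scal = alpha * x                     # scal: overwrites x
C_gemm = alpha * A.dot(B) + beta * C   # gemm: overwrites C (op() may transpose)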
@@ -0,0 +1,128 @@
import pytest
import types
import numpy as np
import warnings

from sklearn.dummy import DummyClassifier
from sklearn.utils import all_estimators
from sklearn.utils.estimator_checks import choose_check_classifiers_labels
from sklearn.utils.estimator_checks import NotAnArray
from sklearn.utils.estimator_checks import enforce_estimator_tags_y
from sklearn.utils.estimator_checks import is_public_parameter
from sklearn.utils.estimator_checks import pairwise_estimator_convert_X
from sklearn.utils.estimator_checks import set_checking_parameters
from sklearn.utils.optimize import newton_cg
from sklearn.utils.random import random_choice_csc
from sklearn.utils import safe_indexing


# This file tests the utils that are deprecated


# TODO: remove in 0.24
def test_choose_check_classifiers_labels_deprecated():
    with pytest.warns(FutureWarning,
                      match="removed in version 0.24"):
        choose_check_classifiers_labels(None, None, None)


# TODO: remove in 0.24
def test_enforce_estimator_tags_y():
    with pytest.warns(FutureWarning,
                      match="removed in version 0.24"):
        enforce_estimator_tags_y(DummyClassifier(), np.array([0, 1]))


# TODO: remove in 0.24
def test_notanarray():
    with pytest.warns(FutureWarning,
                      match="removed in version 0.24"):
        NotAnArray([1, 2])


# TODO: remove in 0.24
def test_is_public_parameter():
    with pytest.warns(FutureWarning,
                      match="removed in version 0.24"):
        is_public_parameter('hello')


# TODO: remove in 0.24
def test_pairwise_estimator_convert_X():
    with pytest.warns(FutureWarning,
                      match="removed in version 0.24"):
        pairwise_estimator_convert_X([[1, 2]], DummyClassifier())


# TODO: remove in 0.24
def test_set_checking_parameters():
    with pytest.warns(FutureWarning,
                      match="removed in version 0.24"):
        set_checking_parameters(DummyClassifier())


# TODO: remove in 0.24
def test_newton_cg():
    rng = np.random.RandomState(0)
    A = rng.normal(size=(10, 10))
    x0 = np.ones(10)

    def func(x):
        Ax = A.dot(x)
        return .5 * (Ax).dot(Ax)

    def grad(x):
        return A.T.dot(A.dot(x))

    def grad_hess(x):
        return grad(x), lambda x: A.T.dot(A.dot(x))

    with pytest.warns(FutureWarning,
                      match="removed in version 0.24"):
        newton_cg(grad_hess, func, grad, x0)


# TODO: remove in 0.24
def test_random_choice_csc():
    with pytest.warns(FutureWarning,
                      match="removed in version 0.24"):
        random_choice_csc(10, [[2]])


# TODO: remove in 0.24
def test_safe_indexing():
    with pytest.warns(FutureWarning,
                      match="removed in version 0.24"):
        safe_indexing([1, 2], 0)


# TODO: remove in 0.24
def test_partial_dependence_no_shadowing():
    # Non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/15842
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        from sklearn.inspection.partial_dependence import partial_dependence as _  # noqa

        # Calling all_estimators() also triggers a recursive import of all
        # submodules, including deprecated ones.
        all_estimators()

    from sklearn.inspection import partial_dependence
    assert isinstance(partial_dependence, types.FunctionType)


# TODO: remove in 0.24
def test_dict_learning_no_shadowing():
    # Non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/15842
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        from sklearn.decomposition.dict_learning import dict_learning as _  # noqa

        # Calling all_estimators() also triggers a recursive import of all
        # submodules, including deprecated ones.
        all_estimators()

    from sklearn.decomposition import dict_learning
    assert isinstance(dict_learning, types.FunctionType)
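Every helper exercised above follows the same deprecation contract: it still works, but calling it emits a FutureWarning naming the removal version. A minimal sketch of that contract, assuming the deprecated safe_indexing wrapper shown in the imports (the list and index are arbitrary):

import pytest

from sklearn.utils import safe_indexing

with pytest.warns(FutureWarning, match="removed in version 0.24"):
    # The deprecated alias still returns the indexed element.
    value = safe_indexing([10, 20, 30], 1)
assert value == 20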
@@ -0,0 +1,59 @@
# Authors: Raghav RV <rvraghav93@gmail.com>
# License: BSD 3 clause


import pickle

from sklearn.utils.deprecation import _is_deprecated
from sklearn.utils.deprecation import deprecated
from sklearn.utils._testing import assert_warns_message


@deprecated('qwerty')
class MockClass1:
    pass


class MockClass2:
    @deprecated('mockclass2_method')
    def method(self):
        pass


class MockClass3:
    @deprecated()
    def __init__(self):
        pass


class MockClass4:
    pass


@deprecated()
def mock_function():
    return 10


def test_deprecated():
    assert_warns_message(FutureWarning, 'qwerty', MockClass1)
    assert_warns_message(FutureWarning, 'mockclass2_method',
                         MockClass2().method)
    assert_warns_message(FutureWarning, 'deprecated', MockClass3)
    val = assert_warns_message(FutureWarning, 'deprecated',
                               mock_function)
    assert val == 10


def test_is_deprecated():
    # Test if _is_deprecated helper identifies wrapping via deprecated
    # NOTE it works only for class methods and functions
    assert _is_deprecated(MockClass1.__init__)
    assert _is_deprecated(MockClass2().method)
    assert _is_deprecated(MockClass3.__init__)
    assert not _is_deprecated(MockClass4.__init__)
    assert _is_deprecated(mock_function)


def test_pickle():
    pickle.loads(pickle.dumps(mock_function))
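The decorator under test wraps a class or callable so that instantiating or calling it emits a FutureWarning carrying the given message, while return values are untouched. A standalone sketch of the behavior the tests assert (dusty_function and its message are made up for illustration):

import warnings

from sklearn.utils.deprecation import deprecated


@deprecated("use shiny_function instead")
def dusty_function():
    return 10


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert dusty_function() == 10  # behavior is unchanged
assert any(issubclass(w.category, FutureWarning) for w in caught)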
@@ -0,0 +1,640 @@
import unittest
import sys

import numpy as np
import scipy.sparse as sp
import joblib

from io import StringIO

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import deprecated
from sklearn.utils._testing import (assert_raises_regex,
                                    ignore_warnings,
                                    assert_warns, assert_raises,
                                    SkipTest)
from sklearn.utils.estimator_checks import check_estimator, _NotAnArray
from sklearn.utils.estimator_checks \
    import check_class_weight_balanced_linear_classifier
from sklearn.utils.estimator_checks import set_random_state
from sklearn.utils.estimator_checks import _set_checking_parameters
from sklearn.utils.estimator_checks import check_estimators_unfitted
from sklearn.utils.estimator_checks import check_fit_score_takes_y
from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
from sklearn.utils.estimator_checks import check_classifier_data_not_an_array
from sklearn.utils.estimator_checks import check_regressor_data_not_an_array
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.estimator_checks import check_outlier_corruption
from sklearn.utils.fixes import np_version, parse_version
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.mixture import GaussianMixture
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import NMF
from sklearn.linear_model import MultiTaskElasticNet, LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils.validation import check_array
from sklearn.utils import all_estimators


class CorrectNotFittedError(ValueError):
    """Exception class to raise if estimator is used before fitting.

    Like NotFittedError, it inherits from ValueError, but not from
    AttributeError. Used for testing only.
    """


class BaseBadClassifier(ClassifierMixin, BaseEstimator):
    def fit(self, X, y):
        return self

    def predict(self, X):
        return np.ones(X.shape[0])


class ChangesDict(BaseEstimator):
    def __init__(self, key=0):
        self.key = key

    def fit(self, X, y=None):
        X, y = self._validate_data(X, y)
        return self

    def predict(self, X):
        X = check_array(X)
        self.key = 1000
        return np.ones(X.shape[0])


class SetsWrongAttribute(BaseEstimator):
    def __init__(self, acceptable_key=0):
        self.acceptable_key = acceptable_key

    def fit(self, X, y=None):
        self.wrong_attribute = 0
        X, y = self._validate_data(X, y)
        return self


class ChangesWrongAttribute(BaseEstimator):
    def __init__(self, wrong_attribute=0):
        self.wrong_attribute = wrong_attribute

    def fit(self, X, y=None):
        self.wrong_attribute = 1
        X, y = self._validate_data(X, y)
        return self


class ChangesUnderscoreAttribute(BaseEstimator):
    def fit(self, X, y=None):
        self._good_attribute = 1
        X, y = self._validate_data(X, y)
        return self


class RaisesErrorInSetParams(BaseEstimator):
    def __init__(self, p=0):
        self.p = p

    def set_params(self, **kwargs):
        if 'p' in kwargs:
            p = kwargs.pop('p')
            if p < 0:
                raise ValueError("p can't be less than 0")
            self.p = p
        return super().set_params(**kwargs)

    def fit(self, X, y=None):
        X, y = self._validate_data(X, y)
        return self


class ModifiesValueInsteadOfRaisingError(BaseEstimator):
    def __init__(self, p=0):
        self.p = p

    def set_params(self, **kwargs):
        if 'p' in kwargs:
            p = kwargs.pop('p')
            if p < 0:
                p = 0
            self.p = p
        return super().set_params(**kwargs)

    def fit(self, X, y=None):
        X, y = self._validate_data(X, y)
        return self


class ModifiesAnotherValue(BaseEstimator):
    def __init__(self, a=0, b='method1'):
        self.a = a
        self.b = b

    def set_params(self, **kwargs):
        if 'a' in kwargs:
            a = kwargs.pop('a')
            self.a = a
            if a is None:
                kwargs.pop('b')
                self.b = 'method2'
        return super().set_params(**kwargs)

    def fit(self, X, y=None):
        X, y = self._validate_data(X, y)
        return self


class NoCheckinPredict(BaseBadClassifier):
    def fit(self, X, y):
        X, y = self._validate_data(X, y)
        return self


class NoSparseClassifier(BaseBadClassifier):
    def fit(self, X, y):
        X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'])
        if sp.issparse(X):
            raise ValueError("Nonsensical Error")
        return self

    def predict(self, X):
        X = check_array(X)
        return np.ones(X.shape[0])


class CorrectNotFittedErrorClassifier(BaseBadClassifier):
    def fit(self, X, y):
        X, y = self._validate_data(X, y)
        self.coef_ = np.ones(X.shape[1])
        return self

    def predict(self, X):
        check_is_fitted(self)
        X = check_array(X)
        return np.ones(X.shape[0])


class NoSampleWeightPandasSeriesType(BaseEstimator):
    def fit(self, X, y, sample_weight=None):
        # Convert data
        X, y = self._validate_data(
            X, y,
            accept_sparse=("csr", "csc"),
            multi_output=True,
            y_numeric=True)
        # Function is only called after we verify that pandas is installed
        from pandas import Series
        if isinstance(sample_weight, Series):
            raise ValueError("Estimator does not accept 'sample_weight'"
                             "of type pandas.Series")
        return self

    def predict(self, X):
        X = check_array(X)
        return np.ones(X.shape[0])


class BadBalancedWeightsClassifier(BaseBadClassifier):
    def __init__(self, class_weight=None):
        self.class_weight = class_weight

    def fit(self, X, y):
        from sklearn.preprocessing import LabelEncoder
        from sklearn.utils import compute_class_weight

        label_encoder = LabelEncoder().fit(y)
        classes = label_encoder.classes_
        class_weight = compute_class_weight(self.class_weight,
                                            classes=classes, y=y)

        # Intentionally modify the balanced class_weight
        # to simulate a bug and raise an exception
        if self.class_weight == "balanced":
            class_weight += 1.

        # Simply assigning coef_ to the class_weight
        self.coef_ = class_weight
        return self


class BadTransformerWithoutMixin(BaseEstimator):
    def fit(self, X, y=None):
        X = self._validate_data(X)
        return self

    def transform(self, X):
        X = check_array(X)
        return X


class NotInvariantPredict(BaseEstimator):
    def fit(self, X, y):
        # Convert data
        X, y = self._validate_data(
            X, y,
            accept_sparse=("csr", "csc"),
            multi_output=True,
            y_numeric=True)
        return self

    def predict(self, X):
        # return 1 if X has more than one element else return 0
        X = check_array(X)
        if X.shape[0] > 1:
            return np.ones(X.shape[0])
        return np.zeros(X.shape[0])


class LargeSparseNotSupportedClassifier(BaseEstimator):
    def fit(self, X, y):
        X, y = self._validate_data(
            X, y,
            accept_sparse=("csr", "csc", "coo"),
            accept_large_sparse=True,
            multi_output=True,
            y_numeric=True)
        if sp.issparse(X):
            if X.getformat() == "coo":
                if X.row.dtype == "int64" or X.col.dtype == "int64":
                    raise ValueError(
                        "Estimator doesn't support 64-bit indices")
            elif X.getformat() in ["csc", "csr"]:
                assert "int64" not in (X.indices.dtype, X.indptr.dtype),\
                    "Estimator doesn't support 64-bit indices"

        return self


class SparseTransformer(BaseEstimator):
    def fit(self, X, y=None):
        self.X_shape_ = self._validate_data(X).shape
        return self

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

    def transform(self, X):
        X = check_array(X)
        if X.shape[1] != self.X_shape_[1]:
            raise ValueError('Bad number of features')
        return sp.csr_matrix(X)


class EstimatorInconsistentForPandas(BaseEstimator):
    def fit(self, X, y):
        try:
            from pandas import DataFrame
            if isinstance(X, DataFrame):
                self.value_ = X.iloc[0, 0]
            else:
                X = check_array(X)
                self.value_ = X[1, 0]
            return self

        except ImportError:
            X = check_array(X)
            self.value_ = X[1, 0]
            return self

    def predict(self, X):
        X = check_array(X)
        return np.array([self.value_] * X.shape[0])


class UntaggedBinaryClassifier(SGDClassifier):
    # Toy classifier that only supports binary classification, will fail tests.
    def fit(self, X, y, coef_init=None, intercept_init=None,
            sample_weight=None):
        super().fit(X, y, coef_init, intercept_init, sample_weight)
        if len(self.classes_) > 2:
            raise ValueError('Only 2 classes are supported')
        return self

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        super().partial_fit(X=X, y=y, classes=classes,
                            sample_weight=sample_weight)
        if len(self.classes_) > 2:
            raise ValueError('Only 2 classes are supported')
        return self


class TaggedBinaryClassifier(UntaggedBinaryClassifier):
    # Toy classifier that only supports binary classification.
    def _more_tags(self):
        return {'binary_only': True}


class RequiresPositiveYRegressor(LinearRegression):

    def fit(self, X, y):
        X, y = self._validate_data(X, y, multi_output=True)
        if (y <= 0).any():
            raise ValueError('negative y values not supported!')
        return super().fit(X, y)

    def _more_tags(self):
        return {"requires_positive_y": True}


def test_not_an_array_array_function():
    if np_version < parse_version('1.17'):
        raise SkipTest("array_function protocol not supported in numpy <1.17")
    not_array = _NotAnArray(np.ones(10))
    msg = "Don't want to call array_function sum!"
    assert_raises_regex(TypeError, msg, np.sum, not_array)
    # always returns True
    assert np.may_share_memory(not_array, None)


def test_check_fit_score_takes_y_works_on_deprecated_fit():
    # Tests that check_fit_score_takes_y works on a class with
    # a deprecated fit method

    class TestEstimatorWithDeprecatedFitMethod(BaseEstimator):
        @deprecated("Deprecated for the purpose of testing "
                    "check_fit_score_takes_y")
        def fit(self, X, y):
            return self

    check_fit_score_takes_y("test", TestEstimatorWithDeprecatedFitMethod())


@ignore_warnings("Passing a class is depr", category=FutureWarning)  # 0.24
def test_check_estimator():
    # tests that the estimator actually fails on "bad" estimators.
    # not a complete test of all checks, which are very extensive.

    # check that we have a set_params and can clone
    msg = "it does not implement a 'get_params' method"
    assert_raises_regex(TypeError, msg, check_estimator, object)
    msg = "object has no attribute '_get_tags'"
    assert_raises_regex(AttributeError, msg, check_estimator, object())
    # check that values returned by get_params match set_params
    msg = "get_params result does not match what was passed to set_params"
    assert_raises_regex(AssertionError, msg, check_estimator,
                        ModifiesValueInsteadOfRaisingError())
    assert_warns(UserWarning, check_estimator, RaisesErrorInSetParams())
    assert_raises_regex(AssertionError, msg, check_estimator,
                        ModifiesAnotherValue())
    # check that we have a fit method
    msg = "object has no attribute 'fit'"
    assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator)
    assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator())
    # check that fit does input validation
    msg = "ValueError not raised"
    assert_raises_regex(AssertionError, msg, check_estimator,
                        BaseBadClassifier)
    assert_raises_regex(AssertionError, msg, check_estimator,
                        BaseBadClassifier())
    # check that sample_weights in fit accepts pandas.Series type
    try:
        from pandas import Series  # noqa
        msg = ("Estimator NoSampleWeightPandasSeriesType raises error if "
               "'sample_weight' parameter is of type pandas.Series")
        assert_raises_regex(
            ValueError, msg, check_estimator, NoSampleWeightPandasSeriesType)
    except ImportError:
        pass
    # check that predict does input validation (doesn't accept dicts in input)
    msg = "Estimator doesn't check for NaN and inf in predict"
    assert_raises_regex(AssertionError, msg, check_estimator, NoCheckinPredict)
    assert_raises_regex(AssertionError, msg, check_estimator,
                        NoCheckinPredict())
    # check that estimator state does not change
    # at transform/predict/predict_proba time
    msg = 'Estimator changes __dict__ during predict'
    assert_raises_regex(AssertionError, msg, check_estimator, ChangesDict)
    # check that `fit` only changes attributes that
    # are private (start with an _ or end with a _).
    msg = ('Estimator ChangesWrongAttribute should not change or mutate '
           'the parameter wrong_attribute from 0 to 1 during fit.')
    assert_raises_regex(AssertionError, msg,
                        check_estimator, ChangesWrongAttribute)
    check_estimator(ChangesUnderscoreAttribute)
    # check that `fit` doesn't add any public attribute
    msg = (r'Estimator adds public attribute\(s\) during the fit method.'
           ' Estimators are only allowed to add private attributes'
           ' either started with _ or ended'
           ' with _ but wrong_attribute added')
    assert_raises_regex(AssertionError, msg,
                        check_estimator, SetsWrongAttribute)
    # check for invariant method
    name = NotInvariantPredict.__name__
    method = 'predict'
    msg = ("{method} of {name} is not invariant when applied "
           "to a subset.").format(method=method, name=name)
    assert_raises_regex(AssertionError, msg,
                        check_estimator, NotInvariantPredict)
    # check for sparse matrix input handling
    name = NoSparseClassifier.__name__
    msg = "Estimator %s doesn't seem to fail gracefully on sparse data" % name
    # the check for sparse input handling prints to the stdout,
    # instead of raising an error, so as not to remove the original traceback.
    # that means we need to jump through some hoops to catch it.
    old_stdout = sys.stdout
    string_buffer = StringIO()
    sys.stdout = string_buffer
    try:
        check_estimator(NoSparseClassifier)
    except:
        pass
    finally:
        sys.stdout = old_stdout
    assert msg in string_buffer.getvalue()

    # Large indices test on bad estimator
    msg = ('Estimator LargeSparseNotSupportedClassifier doesn\'t seem to '
           r'support \S{3}_64 matrix, and is not failing gracefully.*')
    assert_raises_regex(AssertionError, msg, check_estimator,
                        LargeSparseNotSupportedClassifier)

    # does error on binary_only untagged estimator
    msg = 'Only 2 classes are supported'
    assert_raises_regex(ValueError, msg, check_estimator,
                        UntaggedBinaryClassifier)

    # non-regression test for estimators transforming to sparse data
    check_estimator(SparseTransformer())

    # doesn't error on actual estimator
    check_estimator(LogisticRegression)
    check_estimator(LogisticRegression(C=0.01))
    check_estimator(MultiTaskElasticNet)
    check_estimator(MultiTaskElasticNet())

    # doesn't error on binary_only tagged estimator
    check_estimator(TaggedBinaryClassifier)

    # Check regressor with requires_positive_y estimator tag
    msg = 'negative y values not supported!'
    assert_raises_regex(ValueError, msg, check_estimator,
                        RequiresPositiveYRegressor)


def test_check_outlier_corruption():
    # should raise AssertionError
    decision = np.array([0., 1., 1.5, 2.])
    assert_raises(AssertionError, check_outlier_corruption, 1, 2, decision)
    # should pass
    decision = np.array([0., 1., 1., 2.])
    check_outlier_corruption(1, 2, decision)


def test_check_estimator_transformer_no_mixin():
    # check that TransformerMixin is not required for transformer tests to run
    assert_raises_regex(AttributeError, '.*fit_transform.*',
                        check_estimator, BadTransformerWithoutMixin())


def test_check_estimator_clones():
    # check that check_estimator doesn't modify the estimator it receives
    from sklearn.datasets import load_iris
    iris = load_iris()

    for Estimator in [GaussianMixture, LinearRegression,
                      RandomForestClassifier, NMF, SGDClassifier,
                      MiniBatchKMeans]:
        with ignore_warnings(category=FutureWarning):
            # when 'est = SGDClassifier()'
            est = Estimator()
            _set_checking_parameters(est)
            set_random_state(est)
            # without fitting
            old_hash = joblib.hash(est)
            check_estimator(est)
        assert old_hash == joblib.hash(est)

        with ignore_warnings(category=FutureWarning):
            # when 'est = SGDClassifier()'
            est = Estimator()
            _set_checking_parameters(est)
            set_random_state(est)
            # with fitting
            est.fit(iris.data + 10, iris.target)
            old_hash = joblib.hash(est)
            check_estimator(est)
        assert old_hash == joblib.hash(est)


def test_check_estimators_unfitted():
    # check that a ValueError/AttributeError is raised when calling predict
    # on an unfitted estimator
    msg = "NotFittedError not raised by predict"
    assert_raises_regex(AssertionError, msg, check_estimators_unfitted,
                        "estimator", NoSparseClassifier())

    # check that CorrectNotFittedError inherits from either ValueError
    # or AttributeError
    check_estimators_unfitted("estimator", CorrectNotFittedErrorClassifier())


def test_check_no_attributes_set_in_init():
    class NonConformantEstimatorPrivateSet(BaseEstimator):
        def __init__(self):
            self.you_should_not_set_this_ = None

    class NonConformantEstimatorNoParamSet(BaseEstimator):
        def __init__(self, you_should_set_this_=None):
            pass

    assert_raises_regex(AssertionError,
                        "Estimator estimator_name should not set any"
                        " attribute apart from parameters during init."
                        r" Found attributes \['you_should_not_set_this_'\].",
                        check_no_attributes_set_in_init,
                        'estimator_name',
                        NonConformantEstimatorPrivateSet())
    assert_raises_regex(AssertionError,
                        "Estimator estimator_name should store all "
                        "parameters as an attribute during init. "
                        "Did not find attributes "
                        r"\['you_should_set_this_'\].",
                        check_no_attributes_set_in_init,
                        'estimator_name',
                        NonConformantEstimatorNoParamSet())


def test_check_estimator_pairwise():
    # check that check_estimator() works on estimator with _pairwise
    # kernel or metric

    # test precomputed kernel
    est = SVC(kernel='precomputed')
    check_estimator(est)

    # test precomputed metric
    est = KNeighborsRegressor(metric='precomputed')
    check_estimator(est)


def test_check_classifier_data_not_an_array():
    assert_raises_regex(AssertionError,
                        'Not equal to tolerance',
                        check_classifier_data_not_an_array,
                        'estimator_name',
                        EstimatorInconsistentForPandas())


def test_check_regressor_data_not_an_array():
    assert_raises_regex(AssertionError,
                        'Not equal to tolerance',
                        check_regressor_data_not_an_array,
                        'estimator_name',
                        EstimatorInconsistentForPandas())


@ignore_warnings("Passing a class is depr", category=FutureWarning)  # 0.24
def test_check_estimator_required_parameters_skip():
    # TODO: remove whole test in 0.24 since passing classes to
    # check_estimator() isn't supported anymore
    class MyEstimator(BaseEstimator):
        _required_parameters = ["special_parameter"]

        def __init__(self, special_parameter):
            self.special_parameter = special_parameter

    assert_raises_regex(SkipTest, r"Can't instantiate estimator MyEstimator "
                                  r"which requires parameters "
                                  r"\['special_parameter'\]",
                        check_estimator, MyEstimator)


def run_tests_without_pytest():
    """Runs the tests in this file without using pytest.
    """
    main_module = sys.modules['__main__']
    test_functions = [getattr(main_module, name) for name in dir(main_module)
                      if name.startswith('test_')]
    test_cases = [unittest.FunctionTestCase(fn) for fn in test_functions]
    suite = unittest.TestSuite()
    suite.addTests(test_cases)
    runner = unittest.TextTestRunner()
    runner.run(suite)


def test_check_class_weight_balanced_linear_classifier():
    # check that ill-computed balanced weights raises an exception
    assert_raises_regex(AssertionError,
                        "Classifier estimator_name is not computing"
                        " class_weight=balanced properly.",
                        check_class_weight_balanced_linear_classifier,
                        'estimator_name',
                        BadBalancedWeightsClassifier)


def test_all_estimators_all_public():
    # all_estimators should not fail when pytest is not installed and return
    # only public estimators
    estimators = all_estimators()
    for est in estimators:
        assert not est.__class__.__name__.startswith("_")


if __name__ == '__main__':
    # This module is run as a script to check that we have no dependency on
    # pytest for estimator checks.
    run_tests_without_pytest()
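One contract these checks enforce is easy to trip over: __init__ must store each parameter verbatim and set nothing else; anything computed belongs in fit and gets a trailing underscore. A hedged sketch of the passing and failing shapes (both classes are hypothetical, not part of this commit):

from sklearn.base import BaseEstimator
from sklearn.utils.estimator_checks import check_no_attributes_set_in_init


class CompliantEstimator(BaseEstimator):
    def __init__(self, alpha=1.0):
        self.alpha = alpha  # store the parameter as-is, nothing more


class SneakyEstimator(BaseEstimator):
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.scale_ = None  # fit-time attribute set in __init__: not allowed


check_no_attributes_set_in_init('CompliantEstimator', CompliantEstimator())
try:
    check_no_attributes_set_in_init('SneakyEstimator', SneakyEstimator())
except AssertionError as exc:
    print(exc)  # the message names the offending attribute, as tested above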
|
@ -0,0 +1,267 @@
|
|||
from contextlib import closing
|
||||
from io import StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
from sklearn import config_context
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.decomposition import TruncatedSVD
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.pipeline import FeatureUnion
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.ensemble import VotingClassifier
|
||||
from sklearn.feature_selection import SelectPercentile
|
||||
from sklearn.cluster import Birch
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.svm import LinearSVC
|
||||
from sklearn.svm import LinearSVR
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.multiclass import OneVsOneClassifier
|
||||
from sklearn.ensemble import StackingClassifier
|
||||
from sklearn.ensemble import StackingRegressor
|
||||
from sklearn.gaussian_process import GaussianProcessRegressor
|
||||
from sklearn.gaussian_process.kernels import RationalQuadratic
|
||||
from sklearn.utils._estimator_html_repr import _write_label_html
|
||||
from sklearn.utils._estimator_html_repr import _get_visual_block
|
||||
from sklearn.utils._estimator_html_repr import estimator_html_repr
|
||||
|
||||
|
||||
@pytest.mark.parametrize("checked", [True, False])
|
||||
def test_write_label_html(checked):
|
||||
# Test checking logic and labeling
|
||||
name = "LogisticRegression"
|
||||
tool_tip = "hello-world"
|
||||
|
||||
with closing(StringIO()) as out:
|
||||
_write_label_html(out, name, tool_tip, checked=checked)
|
||||
html_label = out.getvalue()
|
||||
assert 'LogisticRegression</label>' in html_label
|
||||
assert html_label.startswith('<div class="sk-label-container">')
|
||||
assert '<pre>hello-world</pre>' in html_label
|
||||
if checked:
|
||||
assert 'checked>' in html_label
|
||||
|
||||
|
||||
@pytest.mark.parametrize('est', ['passthrough', 'drop', None])
|
||||
def test_get_visual_block_single_str_none(est):
|
||||
# Test estimators that are represnted by strings
|
||||
est_html_info = _get_visual_block(est)
|
||||
assert est_html_info.kind == 'single'
|
||||
assert est_html_info.estimators == est
|
||||
assert est_html_info.names == str(est)
|
||||
assert est_html_info.name_details == str(est)
|
||||
|
||||
|
||||
def test_get_visual_block_single_estimator():
|
||||
est = LogisticRegression(C=10.0)
|
||||
est_html_info = _get_visual_block(est)
|
||||
assert est_html_info.kind == 'single'
|
||||
assert est_html_info.estimators == est
|
||||
assert est_html_info.names == est.__class__.__name__
|
||||
assert est_html_info.name_details == str(est)
|
||||
|
||||
|
||||
def test_get_visual_block_pipeline():
|
||||
pipe = Pipeline([
|
||||
('imputer', SimpleImputer()),
|
||||
('do_nothing', 'passthrough'),
|
||||
('do_nothing_more', None),
|
||||
('classifier', LogisticRegression())
|
||||
])
|
||||
est_html_info = _get_visual_block(pipe)
|
||||
assert est_html_info.kind == 'serial'
|
||||
assert est_html_info.estimators == tuple(step[1] for step in pipe.steps)
|
||||
assert est_html_info.names == ['imputer: SimpleImputer',
|
||||
'do_nothing: passthrough',
|
||||
'do_nothing_more: passthrough',
|
||||
'classifier: LogisticRegression']
|
||||
assert est_html_info.name_details == [str(est) for _, est in pipe.steps]
|
||||
|
||||
|
||||
def test_get_visual_block_feature_union():
|
||||
f_union = FeatureUnion([
|
||||
('pca', PCA()), ('svd', TruncatedSVD())
|
||||
])
|
||||
est_html_info = _get_visual_block(f_union)
|
||||
assert est_html_info.kind == 'parallel'
|
||||
assert est_html_info.names == ('pca', 'svd')
|
||||
assert est_html_info.estimators == tuple(
|
||||
trans[1] for trans in f_union.transformer_list)
|
||||
assert est_html_info.name_details == (None, None)
|
||||
|
||||
|
||||
def test_get_visual_block_voting():
|
||||
clf = VotingClassifier([
|
||||
('log_reg', LogisticRegression()),
|
||||
('mlp', MLPClassifier())
|
||||
])
|
||||
est_html_info = _get_visual_block(clf)
|
||||
assert est_html_info.kind == 'parallel'
|
||||
assert est_html_info.estimators == tuple(trans[1]
|
||||
for trans in clf.estimators)
|
||||
assert est_html_info.names == ('log_reg', 'mlp')
|
||||
assert est_html_info.name_details == (None, None)
|
||||
|
||||
|
||||
def test_get_visual_block_column_transformer():
|
||||
ct = ColumnTransformer([
|
||||
('pca', PCA(), ['num1', 'num2']),
|
||||
('svd', TruncatedSVD, [0, 3])
|
||||
])
|
||||
est_html_info = _get_visual_block(ct)
|
||||
assert est_html_info.kind == 'parallel'
|
||||
assert est_html_info.estimators == tuple(
|
||||
trans[1] for trans in ct.transformers)
|
||||
assert est_html_info.names == ('pca', 'svd')
|
||||
assert est_html_info.name_details == (['num1', 'num2'], [0, 3])
|
||||
|
||||
|
||||
def test_estimator_html_repr_pipeline():
    num_trans = Pipeline(steps=[
        ('pass', 'passthrough'),
        ('imputer', SimpleImputer(strategy='median'))
    ])

    cat_trans = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant',
                                  missing_values='empty')),
        ('one-hot', OneHotEncoder(drop='first'))
    ])

    preprocess = ColumnTransformer([
        ('num', num_trans, ['a', 'b', 'c', 'd', 'e']),
        ('cat', cat_trans, [0, 1, 2, 3])
    ])

    feat_u = FeatureUnion([
        ('pca', PCA(n_components=1)),
        ('tsvd', Pipeline([('first', TruncatedSVD(n_components=3)),
                           ('select', SelectPercentile())]))
    ])

    clf = VotingClassifier([
        ('lr', LogisticRegression(solver='lbfgs', random_state=1)),
        ('mlp', MLPClassifier(alpha=0.001))
    ])

    pipe = Pipeline([
        ('preprocessor', preprocess), ('feat_u', feat_u), ('classifier', clf)
    ])
    html_output = estimator_html_repr(pipe)

    # top level estimators show estimator with changes
    assert str(pipe) in html_output
    for _, est in pipe.steps:
        assert (f"<div class=\"sk-toggleable__content\">"
                f"<pre>{str(est)}") in html_output

    # low level estimators do not show changes
    with config_context(print_changed_only=True):
        assert str(num_trans['pass']) in html_output
        assert 'passthrough</label>' in html_output
        assert str(num_trans['imputer']) in html_output

        for _, _, cols in preprocess.transformers:
            assert f"<pre>{cols}</pre>" in html_output

        # feature union
        for name, _ in feat_u.transformer_list:
            assert f"<label>{name}</label>" in html_output

        pca = feat_u.transformer_list[0][1]
        assert f"<pre>{str(pca)}</pre>" in html_output

        tsvd = feat_u.transformer_list[1][1]
        first = tsvd['first']
        select = tsvd['select']
        assert f"<pre>{str(first)}</pre>" in html_output
        assert f"<pre>{str(select)}</pre>" in html_output

        # voting classifier
        for name, est in clf.estimators:
            assert f"<label>{name}</label>" in html_output
            assert f"<pre>{str(est)}</pre>" in html_output


@pytest.mark.parametrize("final_estimator", [None, LinearSVC()])
|
||||
def test_stacking_classsifer(final_estimator):
|
||||
estimators = [('mlp', MLPClassifier(alpha=0.001)),
|
||||
('tree', DecisionTreeClassifier())]
|
||||
clf = StackingClassifier(
|
||||
estimators=estimators, final_estimator=final_estimator)
|
||||
|
||||
html_output = estimator_html_repr(clf)
|
||||
|
||||
assert str(clf) in html_output
|
||||
# If final_estimator's default changes from LogisticRegression
|
||||
# this should be updated
|
||||
if final_estimator is None:
|
||||
assert "LogisticRegression(" in html_output
|
||||
else:
|
||||
assert final_estimator.__class__.__name__ in html_output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("final_estimator", [None, LinearSVR()])
|
||||
def test_stacking_regressor(final_estimator):
|
||||
reg = StackingRegressor(
|
||||
estimators=[('svr', LinearSVR())], final_estimator=final_estimator)
|
||||
html_output = estimator_html_repr(reg)
|
||||
|
||||
assert str(reg.estimators[0][0]) in html_output
|
||||
assert "LinearSVR</label>" in html_output
|
||||
if final_estimator is None:
|
||||
assert "RidgeCV</label>" in html_output
|
||||
else:
|
||||
assert final_estimator.__class__.__name__ in html_output
|
||||
|
||||
|
||||
def test_birch_duck_typing_meta():
|
||||
# Test duck typing meta estimators with Birch
|
||||
birch = Birch(n_clusters=AgglomerativeClustering(n_clusters=3))
|
||||
html_output = estimator_html_repr(birch)
|
||||
|
||||
# inner estimators do not show changes
|
||||
with config_context(print_changed_only=True):
|
||||
assert f"<pre>{str(birch.n_clusters)}" in html_output
|
||||
assert "AgglomerativeClustering</label>" in html_output
|
||||
|
||||
# outer estimator contains all changes
|
||||
assert f"<pre>{str(birch)}" in html_output
|
||||
|
||||
|
||||
def test_ovo_classifier_duck_typing_meta():
|
||||
# Test duck typing metaestimators with OVO
|
||||
ovo = OneVsOneClassifier(LinearSVC(penalty='l1'))
|
||||
html_output = estimator_html_repr(ovo)
|
||||
|
||||
# inner estimators do not show changes
|
||||
with config_context(print_changed_only=True):
|
||||
assert f"<pre>{str(ovo.estimator)}" in html_output
|
||||
assert "LinearSVC</label>" in html_output
|
||||
|
||||
# outter estimator
|
||||
assert f"<pre>{str(ovo)}" in html_output
|
||||
|
||||
|
||||
def test_duck_typing_nested_estimator():
|
||||
# Test duck typing metaestimators with GP
|
||||
kernel = RationalQuadratic(length_scale=1.0, alpha=0.1)
|
||||
gp = GaussianProcessRegressor(kernel=kernel)
|
||||
html_output = estimator_html_repr(gp)
|
||||
|
||||
assert f"<pre>{str(kernel)}" in html_output
|
||||
assert f"<pre>{str(gp)}" in html_output
|
||||
|
||||
|
||||
@pytest.mark.parametrize('print_changed_only', [True, False])
|
||||
def test_one_estimator_print_change_only(print_changed_only):
|
||||
pca = PCA(n_components=10)
|
||||
|
||||
with config_context(print_changed_only=print_changed_only):
|
||||
pca_repr = str(pca)
|
||||
html_output = estimator_html_repr(pca)
|
||||
assert pca_repr in html_output
|
722
venv/Lib/site-packages/sklearn/utils/tests/test_extmath.py
Normal file
@ -0,0 +1,722 @@
# Authors: Olivier Grisel <olivier.grisel@ensta.org>
#          Mathieu Blondel <mathieu@mblondel.org>
#          Denis Engemann <denis-alexander.engemann@inria.fr>
#
# License: BSD 3 clause

import numpy as np
from scipy import sparse
from scipy import linalg
from scipy import stats
from scipy.special import expit

import pytest

from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_warns
from sklearn.utils._testing import assert_warns_message
from sklearn.utils._testing import skip_if_32bit

from sklearn.utils.extmath import density
from sklearn.utils.extmath import randomized_svd
from sklearn.utils.extmath import row_norms
from sklearn.utils.extmath import weighted_mode
from sklearn.utils.extmath import cartesian
from sklearn.utils.extmath import log_logistic
from sklearn.utils.extmath import svd_flip
from sklearn.utils.extmath import _incremental_mean_and_var
from sklearn.utils.extmath import _deterministic_vector_sign_flip
from sklearn.utils.extmath import softmax
from sklearn.utils.extmath import stable_cumsum
from sklearn.utils.extmath import safe_min
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.datasets import make_low_rank_matrix


def test_density():
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(10, 5))
    X[1, 2] = 0
    X[5, 3] = 0
    X_csr = sparse.csr_matrix(X)
    X_csc = sparse.csc_matrix(X)
    X_coo = sparse.coo_matrix(X)
    X_lil = sparse.lil_matrix(X)

    for X_ in (X_csr, X_csc, X_coo, X_lil):
        assert density(X_) == density(X)


def test_uniform_weights():
    # with uniform weights, results should be identical to stats.mode
    rng = np.random.RandomState(0)
    x = rng.randint(10, size=(10, 5))
    weights = np.ones(x.shape)

    for axis in (None, 0, 1):
        mode, score = stats.mode(x, axis)
        mode2, score2 = weighted_mode(x, weights, axis=axis)

        assert_array_equal(mode, mode2)
        assert_array_equal(score, score2)


def test_random_weights():
    # set this up so that each row should have a weighted mode of 6,
    # with a score that is easily reproduced
    mode_result = 6

    rng = np.random.RandomState(0)
    x = rng.randint(mode_result, size=(100, 10))
    w = rng.random_sample(x.shape)

    x[:, :5] = mode_result
    w[:, :5] += 1

    mode, score = weighted_mode(x, w, axis=1)

    assert_array_equal(mode, mode_result)
    assert_array_almost_equal(score.ravel(), w[:, :5].sum(1))


def check_randomized_svd_low_rank(dtype):
    # Check that extmath.randomized_svd is consistent with linalg.svd
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10
    decimal = 5 if dtype == np.float32 else 7
    dtype = np.dtype(dtype)

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.0,
                             random_state=0).astype(dtype, copy=False)
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    # Convert the singular values to the specific dtype
    U = U.astype(dtype, copy=False)
    s = s.astype(dtype, copy=False)
    V = V.astype(dtype, copy=False)

    for normalizer in ['auto', 'LU', 'QR']:  # 'none' would not be stable
        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = randomized_svd(
            X, k, power_iteration_normalizer=normalizer, random_state=0)

        # If the input dtype is float, then the output dtype is float of the
        # same bit size (f32 is not upcast to f64)
        # But if the input dtype is int, the output dtype is float64
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype == np.float64
            assert sa.dtype == np.float64
            assert Va.dtype == np.float64

        assert Ua.shape == (n_samples, k)
        assert sa.shape == (k,)
        assert Va.shape == (k, n_features)

        # ensure that the singular values of both methods are equal up to the
        # real rank of the matrix
        assert_almost_equal(s[:k], sa, decimal=decimal)

        # check the singular vectors too (while not checking the sign)
        assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va),
                            decimal=decimal)

        # check the sparse matrix representation
        X = sparse.csr_matrix(X)

        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = \
            randomized_svd(X, k, power_iteration_normalizer=normalizer,
                           random_state=0)
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype.kind == 'f'
            assert sa.dtype.kind == 'f'
            assert Va.dtype.kind == 'f'

        assert_almost_equal(s[:rank], sa[:rank], decimal=decimal)


@pytest.mark.parametrize('dtype',
                         (np.int32, np.int64, np.float32, np.float64))
def test_randomized_svd_low_rank_all_dtypes(dtype):
    check_randomized_svd_low_rank(dtype)


@pytest.mark.parametrize('dtype',
                         (np.float32, np.float64))
def test_row_norms(dtype):
    X = np.random.RandomState(42).randn(100, 100)
    if dtype is np.float32:
        precision = 4
    else:
        precision = 5

    X = X.astype(dtype, copy=False)
    sq_norm = (X ** 2).sum(axis=1)

    assert_array_almost_equal(sq_norm, row_norms(X, squared=True),
                              precision)
    assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision)

    for csr_index_dtype in [np.int32, np.int64]:
        Xcsr = sparse.csr_matrix(X, dtype=dtype)
        # csr_matrix will use int32 indices by default,
        # up-casting those to int64 when necessary
        if csr_index_dtype is np.int64:
            Xcsr.indptr = Xcsr.indptr.astype(csr_index_dtype, copy=False)
            Xcsr.indices = Xcsr.indices.astype(csr_index_dtype, copy=False)
        assert Xcsr.indices.dtype == csr_index_dtype
        assert Xcsr.indptr.dtype == csr_index_dtype
        assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True),
                                  precision)
        assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr),
                                  precision)


def test_randomized_svd_low_rank_with_noise():
    # Check that extmath.randomized_svd can handle noisy matrices
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X with structured approximate rank `rank` and an
    # important noisy component
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.1,
                             random_state=0)
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    for normalizer in ['auto', 'none', 'LU', 'QR']:
        # compute the singular values of X using the fast approximate
        # method without the iterated power method
        _, sa, _ = randomized_svd(X, k, n_iter=0,
                                  power_iteration_normalizer=normalizer,
                                  random_state=0)

        # the approximation does not tolerate the noise:
        assert np.abs(s[:k] - sa).max() > 0.01

        # compute the singular values of X using the fast approximate
        # method with iterated power method
        _, sap, _ = randomized_svd(X, k,
                                   power_iteration_normalizer=normalizer,
                                   random_state=0)

        # the iterated power method is helping getting rid of the noise:
        assert_almost_equal(s[:k], sap, decimal=3)
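
A hedged aside on what these assertions demonstrate: with a noisy spectral tail, a handful of power iterations (`n_iter`) is what pulls the randomized estimate back toward the exact singular values. A minimal reproduction:

import numpy as np
from scipy import linalg
from sklearn.datasets import make_low_rank_matrix
from sklearn.utils.extmath import randomized_svd

X = make_low_rank_matrix(100, 500, effective_rank=5, tail_strength=0.1,
                         random_state=0)
_, s_exact, _ = linalg.svd(X, full_matrices=False)
_, s_raw, _ = randomized_svd(X, 10, n_iter=0, random_state=0)
_, s_iter, _ = randomized_svd(X, 10, n_iter=4, random_state=0)
# The n_iter=0 estimate is visibly biased by the noisy tail; n_iter=4 is not.
print(np.abs(s_exact[:10] - s_raw).max(), np.abs(s_exact[:10] - s_iter).max())
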
def test_randomized_svd_infinite_rank():
    # Check that extmath.randomized_svd can handle noisy matrices
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # let us try again without 'low_rank component': just regularly but slowly
    # decreasing singular values: the rank of the data matrix is infinite
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=1.0,
                             random_state=0)
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)
    for normalizer in ['auto', 'none', 'LU', 'QR']:
        # compute the singular values of X using the fast approximate method
        # without the iterated power method
        _, sa, _ = randomized_svd(X, k, n_iter=0,
                                  power_iteration_normalizer=normalizer)

        # the approximation does not tolerate the noise:
        assert np.abs(s[:k] - sa).max() > 0.1

        # compute the singular values of X using the fast approximate method
        # with iterated power method
        _, sap, _ = randomized_svd(X, k, n_iter=5,
                                   power_iteration_normalizer=normalizer)

        # the iterated power method is still managing to get most of the
        # structure at the requested rank
        assert_almost_equal(s[:k], sap, decimal=3)


def test_randomized_svd_transpose_consistency():
    # Check that transposing the design matrix has limited impact
    n_samples = 100
    n_features = 500
    rank = 4
    k = 10

    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.5,
                             random_state=0)
    assert X.shape == (n_samples, n_features)

    U1, s1, V1 = randomized_svd(X, k, n_iter=3, transpose=False,
                                random_state=0)
    U2, s2, V2 = randomized_svd(X, k, n_iter=3, transpose=True,
                                random_state=0)
    U3, s3, V3 = randomized_svd(X, k, n_iter=3, transpose='auto',
                                random_state=0)
    U4, s4, V4 = linalg.svd(X, full_matrices=False)

    assert_almost_equal(s1, s4[:k], decimal=3)
    assert_almost_equal(s2, s4[:k], decimal=3)
    assert_almost_equal(s3, s4[:k], decimal=3)

    assert_almost_equal(np.dot(U1, V1), np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)
    assert_almost_equal(np.dot(U2, V2), np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)

    # in this case 'auto' is equivalent to transpose
    assert_almost_equal(s2, s3)


def test_randomized_svd_power_iteration_normalizer():
    # randomized_svd with power_iteration_normalizer='none' diverges for
    # large number of power iterations on this dataset
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(100, 500, effective_rank=50, random_state=rng)
    X += 3 * rng.randint(0, 2, size=X.shape)
    n_components = 50

    # Check that it diverges with many (non-normalized) power iterations
    U, s, V = randomized_svd(X, n_components, n_iter=2,
                             power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(V))
    error_2 = linalg.norm(A, ord='fro')
    U, s, V = randomized_svd(X, n_components, n_iter=20,
                             power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(V))
    error_20 = linalg.norm(A, ord='fro')
    assert np.abs(error_2 - error_20) > 100

    for normalizer in ['LU', 'QR', 'auto']:
        U, s, V = randomized_svd(X, n_components, n_iter=2,
                                 power_iteration_normalizer=normalizer,
                                 random_state=0)
        A = X - U.dot(np.diag(s).dot(V))
        error_2 = linalg.norm(A, ord='fro')

        for i in [5, 10, 50]:
            U, s, V = randomized_svd(X, n_components, n_iter=i,
                                     power_iteration_normalizer=normalizer,
                                     random_state=0)
            A = X - U.dot(np.diag(s).dot(V))
            error = linalg.norm(A, ord='fro')
            assert 15 > np.abs(error_2 - error)


def test_randomized_svd_sparse_warnings():
    # randomized_svd throws a warning for lil and dok matrix
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(50, 20, effective_rank=10, random_state=rng)
    n_components = 5
    for cls in (sparse.lil_matrix, sparse.dok_matrix):
        X = cls(X)
        assert_warns_message(
            sparse.SparseEfficiencyWarning,
            "Calculating SVD of a {} is expensive. "
            "csr_matrix is more efficient.".format(cls.__name__),
            randomized_svd, X, n_components, n_iter=1,
            power_iteration_normalizer='none')


def test_svd_flip():
    # Check that svd_flip works in both situations, and reconstructs input.
    rs = np.random.RandomState(1999)
    n_samples = 20
    n_features = 10
    X = rs.randn(n_samples, n_features)

    # Check matrix reconstruction
    U, S, V = linalg.svd(X, full_matrices=False)
    U1, V1 = svd_flip(U, V, u_based_decision=False)
    assert_almost_equal(np.dot(U1 * S, V1), X, decimal=6)

    # Check transposed matrix reconstruction
    XT = X.T
    U, S, V = linalg.svd(XT, full_matrices=False)
    U2, V2 = svd_flip(U, V, u_based_decision=True)
    assert_almost_equal(np.dot(U2 * S, V2), XT, decimal=6)

    # Check that different flip methods are equivalent under reconstruction
    U_flip1, V_flip1 = svd_flip(U, V, u_based_decision=True)
    assert_almost_equal(np.dot(U_flip1 * S, V_flip1), XT, decimal=6)
    U_flip2, V_flip2 = svd_flip(U, V, u_based_decision=False)
    assert_almost_equal(np.dot(U_flip2 * S, V_flip2), XT, decimal=6)
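
A sketch of the ambiguity `svd_flip` resolves (editor's addition, not library code): negating a column of U together with the matching row of V leaves the reconstruction unchanged, so the SVD is only defined up to signs; `svd_flip` picks one convention deterministically.

import numpy as np
from scipy import linalg
from sklearn.utils.extmath import svd_flip

rng = np.random.RandomState(0)
X = rng.randn(6, 4)
U, S, V = linalg.svd(X, full_matrices=False)
U2, V2 = U.copy(), V.copy()
U2[:, 0] *= -1
V2[0, :] *= -1
# Flipping a matched singular-vector pair leaves the product unchanged...
assert np.allclose((U2 * S) @ V2, X)
# ...and svd_flip maps both factorizations to the same canonical signs.
U_f, V_f = svd_flip(U.copy(), V.copy())
U2_f, V2_f = svd_flip(U2, V2)
assert np.allclose(U_f, U2_f) and np.allclose(V_f, V2_f)
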
def test_randomized_svd_sign_flip():
    a = np.array([[2.0, 0.0], [0.0, 1.0]])
    u1, s1, v1 = randomized_svd(a, 2, flip_sign=True, random_state=41)
    for seed in range(10):
        u2, s2, v2 = randomized_svd(a, 2, flip_sign=True, random_state=seed)
        assert_almost_equal(u1, u2)
        assert_almost_equal(v1, v2)
        assert_almost_equal(np.dot(u2 * s2, v2), a)
        assert_almost_equal(np.dot(u2.T, u2), np.eye(2))
        assert_almost_equal(np.dot(v2.T, v2), np.eye(2))


def test_randomized_svd_sign_flip_with_transpose():
    # Check if the randomized_svd sign flipping is always done based on u
    # irrespective of transpose.
    # See https://github.com/scikit-learn/scikit-learn/issues/5608
    # for more details.
    def max_loading_is_positive(u, v):
        """
        returns bool tuple indicating if the values maximising np.abs
        are positive across all rows for u and across all columns for v.
        """
        u_based = (np.abs(u).max(axis=0) == u.max(axis=0)).all()
        v_based = (np.abs(v).max(axis=1) == v.max(axis=1)).all()
        return u_based, v_based

    mat = np.arange(10 * 8).reshape(10, -1)

    # Without transpose
    u_flipped, _, v_flipped = randomized_svd(mat, 3, flip_sign=True)
    u_based, v_based = max_loading_is_positive(u_flipped, v_flipped)
    assert u_based
    assert not v_based

    # With transpose
    u_flipped_with_transpose, _, v_flipped_with_transpose = randomized_svd(
        mat, 3, flip_sign=True, transpose=True)
    u_based, v_based = max_loading_is_positive(
        u_flipped_with_transpose, v_flipped_with_transpose)
    assert u_based
    assert not v_based


def test_cartesian():
    # Check if cartesian product delivers the right results

    axes = (np.array([1, 2, 3]), np.array([4, 5]), np.array([6, 7]))

    true_out = np.array([[1, 4, 6],
                         [1, 4, 7],
                         [1, 5, 6],
                         [1, 5, 7],
                         [2, 4, 6],
                         [2, 4, 7],
                         [2, 5, 6],
                         [2, 5, 7],
                         [3, 4, 6],
                         [3, 4, 7],
                         [3, 5, 6],
                         [3, 5, 7]])

    out = cartesian(axes)
    assert_array_equal(true_out, out)

    # check single axis
    x = np.arange(3)
    assert_array_equal(x[:, np.newaxis], cartesian((x,)))


def test_logistic_sigmoid():
    # Check correctness and robustness of logistic sigmoid implementation
    def naive_log_logistic(x):
        return np.log(expit(x))

    x = np.linspace(-2, 2, 50)
    assert_array_almost_equal(log_logistic(x), naive_log_logistic(x))

    extreme_x = np.array([-100., 100.])
    assert_array_almost_equal(log_logistic(extreme_x), [-100, 0])
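
As a hedged illustration of why `log_logistic` exists rather than `np.log(expit(x))`: for large negative inputs `expit` underflows to 0 and the naive form returns `-inf`, while the stable form stays finite.

import numpy as np
from scipy.special import expit
from sklearn.utils.extmath import log_logistic

x = np.array([-1000.0])
with np.errstate(divide='ignore'):
    print(np.log(expit(x)))  # [-inf]: expit(-1000) underflows to 0
print(log_logistic(x))       # [-1000.]: log-sigmoid evaluated stably
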
def test_incremental_variance_update_formulas():
    # Test Youngs and Cramer incremental variance formulas.
    # Doggie data from https://www.mathsisfun.com/data/standard-deviation.html
    A = np.array([[600, 470, 170, 430, 300],
                  [600, 470, 170, 430, 300],
                  [600, 470, 170, 430, 300],
                  [600, 470, 170, 430, 300]]).T
    idx = 2
    X1 = A[:idx, :]
    X2 = A[idx:, :]

    old_means = X1.mean(axis=0)
    old_variances = X1.var(axis=0)
    old_sample_count = np.full(X1.shape[1], X1.shape[0], dtype=np.int32)
    final_means, final_variances, final_count = \
        _incremental_mean_and_var(X2, old_means, old_variances,
                                  old_sample_count)
    assert_almost_equal(final_means, A.mean(axis=0), 6)
    assert_almost_equal(final_variances, A.var(axis=0), 6)
    assert_almost_equal(final_count, A.shape[0])
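
For reference, a sketch (editor's addition, not library code) of the pairwise merge identity the Youngs and Cramer update is built on: two (mean, variance, count) summaries combine exactly through the sum of squared deviations M2.

import numpy as np

def merge_mean_var(mean1, var1, n1, mean2, var2, n2):
    # Chan / Youngs-Cramer style merge of two per-chunk summaries.
    n = n1 + n2
    delta = mean2 - mean1
    mean = mean1 + delta * n2 / n
    # M2 is the sum of squared deviations; variances merge through it.
    m2 = var1 * n1 + var2 * n2 + delta ** 2 * n1 * n2 / n
    return mean, m2 / n, n

A = np.arange(10.0)
m, v, n = merge_mean_var(A[:4].mean(), A[:4].var(), 4,
                         A[4:].mean(), A[4:].var(), 6)
assert np.isclose(m, A.mean()) and np.isclose(v, A.var()) and n == 10
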
def test_incremental_mean_and_variance_ignore_nan():
    old_means = np.array([535., 535., 535., 535.])
    old_variances = np.array([4225., 4225., 4225., 4225.])
    old_sample_count = np.array([2, 2, 2, 2], dtype=np.int32)

    X = np.array([[170, 170, 170, 170],
                  [430, 430, 430, 430],
                  [300, 300, 300, 300]])

    X_nan = np.array([[170, np.nan, 170, 170],
                      [np.nan, 170, 430, 430],
                      [430, 430, np.nan, 300],
                      [300, 300, 300, np.nan]])

    X_means, X_variances, X_count = _incremental_mean_and_var(
        X, old_means, old_variances, old_sample_count)
    X_nan_means, X_nan_variances, X_nan_count = _incremental_mean_and_var(
        X_nan, old_means, old_variances, old_sample_count)

    assert_allclose(X_nan_means, X_means)
    assert_allclose(X_nan_variances, X_variances)
    assert_allclose(X_nan_count, X_count)


@skip_if_32bit
def test_incremental_variance_numerical_stability():
    # Test Youngs and Cramer incremental variance formulas.

    def np_var(A):
        return A.var(axis=0)

    # Naive one pass variance computation - not numerically stable
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    def one_pass_var(X):
        n = X.shape[0]
        exp_x2 = (X ** 2).sum(axis=0) / n
        expx_2 = (X.sum(axis=0) / n) ** 2
        return exp_x2 - expx_2

    # Two-pass algorithm, stable.
    # We use it as a benchmark. It is not an online algorithm
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
    def two_pass_var(X):
        mean = X.mean(axis=0)
        Y = X.copy()
        return np.mean((Y - mean) ** 2, axis=0)

    # Naive online implementation
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm
    # This works only for chunks of size 1
    def naive_mean_variance_update(x, last_mean, last_variance,
                                   last_sample_count):
        updated_sample_count = (last_sample_count + 1)
        samples_ratio = last_sample_count / float(updated_sample_count)
        updated_mean = x / updated_sample_count + last_mean * samples_ratio
        updated_variance = last_variance * samples_ratio + \
            (x - last_mean) * (x - updated_mean) / updated_sample_count
        return updated_mean, updated_variance, updated_sample_count

    # We want to show a case when one_pass_var has error > 1e-3 while
    # _batch_mean_variance_update has less.
    tol = 200
    n_features = 2
    n_samples = 10000
    x1 = np.array(1e8, dtype=np.float64)
    x2 = np.log(1e-5, dtype=np.float64)
    A0 = np.full((n_samples // 2, n_features), x1, dtype=np.float64)
    A1 = np.full((n_samples // 2, n_features), x2, dtype=np.float64)
    A = np.vstack((A0, A1))

    # Naive one pass var: >tol (=1063)
    assert np.abs(np_var(A) - one_pass_var(A)).max() > tol

    # Starting point for online algorithms: after A0

    # Naive implementation: >tol (436)
    mean, var, n = A0[0, :], np.zeros(n_features), n_samples // 2
    for i in range(A1.shape[0]):
        mean, var, n = \
            naive_mean_variance_update(A1[i, :], mean, var, n)
    assert n == A.shape[0]
    # the mean is also slightly unstable
    assert np.abs(A.mean(axis=0) - mean).max() > 1e-6
    assert np.abs(np_var(A) - var).max() > tol

    # Robust implementation: <tol (177)
    mean, var = A0[0, :], np.zeros(n_features)
    n = np.full(n_features, n_samples // 2, dtype=np.int32)
    for i in range(A1.shape[0]):
        mean, var, n = \
            _incremental_mean_and_var(A1[i, :].reshape((1, A1.shape[1])),
                                      mean, var, n)
    assert_array_equal(n, A.shape[0])
    assert_array_almost_equal(A.mean(axis=0), mean)
    assert tol > np.abs(np_var(A) - var).max()
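
A compact illustration of the failure mode exercised above (editor's sketch): the one-pass identity Var(x) = E[x**2] - E[x]**2 subtracts two huge, nearly equal numbers when the data sit far from zero, so almost all precision is lost (the result can even go negative), while the shifted two-pass form stays accurate.

import numpy as np

rng = np.random.RandomState(0)
x = 1e8 + rng.randn(10000)                    # unit variance, huge offset
naive = (x ** 2).mean() - x.mean() ** 2       # catastrophic cancellation
stable = ((x - x.mean()) ** 2).mean()         # two-pass, well conditioned
print(naive, stable)                          # naive is garbage; stable ~1.0
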
def test_incremental_variance_ddof():
    # Test that the degrees of freedom parameter for calculations is correct.
    rng = np.random.RandomState(1999)
    X = rng.randn(50, 10)
    n_samples, n_features = X.shape
    for batch_size in [11, 20, 37]:
        steps = np.arange(0, X.shape[0], batch_size)
        if steps[-1] != X.shape[0]:
            steps = np.hstack([steps, n_samples])

        for i, j in zip(steps[:-1], steps[1:]):
            batch = X[i:j, :]
            if i == 0:
                incremental_means = batch.mean(axis=0)
                incremental_variances = batch.var(axis=0)
                # Assign this twice so that the test logic is consistent
                incremental_count = batch.shape[0]
                sample_count = np.full(batch.shape[1], batch.shape[0],
                                       dtype=np.int32)
            else:
                result = _incremental_mean_and_var(
                    batch, incremental_means, incremental_variances,
                    sample_count)
                (incremental_means, incremental_variances,
                 incremental_count) = result
                sample_count += batch.shape[0]

            calculated_means = np.mean(X[:j], axis=0)
            calculated_variances = np.var(X[:j], axis=0)
            assert_almost_equal(incremental_means, calculated_means, 6)
            assert_almost_equal(incremental_variances,
                                calculated_variances, 6)
            assert_array_equal(incremental_count, sample_count)


def test_vector_sign_flip():
    # Testing that sign flip is working & largest value has positive sign
    data = np.random.RandomState(36).randn(5, 5)
    max_abs_rows = np.argmax(np.abs(data), axis=1)
    data_flipped = _deterministic_vector_sign_flip(data)
    max_rows = np.argmax(data_flipped, axis=1)
    assert_array_equal(max_abs_rows, max_rows)
    signs = np.sign(data[range(data.shape[0]), max_abs_rows])
    assert_array_equal(data, data_flipped * signs[:, np.newaxis])


def test_softmax():
    rng = np.random.RandomState(0)
    X = rng.randn(3, 5)
    exp_X = np.exp(X)
    sum_exp_X = np.sum(exp_X, axis=1).reshape((-1, 1))
    assert_array_almost_equal(softmax(X), exp_X / sum_exp_X)


def test_stable_cumsum():
    assert_array_equal(stable_cumsum([1, 2, 3]), np.cumsum([1, 2, 3]))
    r = np.random.RandomState(0).rand(100000)
    assert_warns(RuntimeWarning, stable_cumsum, r, rtol=0, atol=0)

    # test axis parameter
    A = np.random.RandomState(36).randint(1000, size=(5, 5, 5))
    assert_array_equal(stable_cumsum(A, axis=0), np.cumsum(A, axis=0))
    assert_array_equal(stable_cumsum(A, axis=1), np.cumsum(A, axis=1))
    assert_array_equal(stable_cumsum(A, axis=2), np.cumsum(A, axis=2))


def test_safe_min():
    msg = ("safe_min is deprecated in version 0.22 and will be removed "
           "in version 0.24.")
    with pytest.warns(FutureWarning, match=msg):
        safe_min(np.ones(10))


@pytest.mark.parametrize("A_array_constr", [np.array, sparse.csr_matrix],
                         ids=["dense", "sparse"])
@pytest.mark.parametrize("B_array_constr", [np.array, sparse.csr_matrix],
                         ids=["dense", "sparse"])
def test_safe_sparse_dot_2d(A_array_constr, B_array_constr):
    rng = np.random.RandomState(0)

    A = rng.random_sample((30, 10))
    B = rng.random_sample((10, 20))
    expected = np.dot(A, B)

    A = A_array_constr(A)
    B = B_array_constr(B)
    actual = safe_sparse_dot(A, B, dense_output=True)

    assert_allclose(actual, expected)


def test_safe_sparse_dot_nd():
    rng = np.random.RandomState(0)

    # dense ND / sparse
    A = rng.random_sample((2, 3, 4, 5, 6))
    B = rng.random_sample((6, 7))
    expected = np.dot(A, B)
    B = sparse.csr_matrix(B)
    actual = safe_sparse_dot(A, B)
    assert_allclose(actual, expected)

    # sparse / dense ND
    A = rng.random_sample((2, 3))
    B = rng.random_sample((4, 5, 3, 6))
    expected = np.dot(A, B)
    A = sparse.csr_matrix(A)
    actual = safe_sparse_dot(A, B)
    assert_allclose(actual, expected)


@pytest.mark.parametrize("A_array_constr", [np.array, sparse.csr_matrix],
                         ids=["dense", "sparse"])
def test_safe_sparse_dot_2d_1d(A_array_constr):
    rng = np.random.RandomState(0)

    B = rng.random_sample((10))

    # 2D @ 1D
    A = rng.random_sample((30, 10))
    expected = np.dot(A, B)
    A = A_array_constr(A)
    actual = safe_sparse_dot(A, B)
    assert_allclose(actual, expected)

    # 1D @ 2D
    A = rng.random_sample((10, 30))
    expected = np.dot(B, A)
    A = A_array_constr(A)
    actual = safe_sparse_dot(B, A)
    assert_allclose(actual, expected)


@pytest.mark.parametrize("dense_output", [True, False])
def test_safe_sparse_dot_dense_output(dense_output):
    rng = np.random.RandomState(0)

    A = sparse.random(30, 10, density=0.1, random_state=rng)
    B = sparse.random(10, 20, density=0.1, random_state=rng)

    expected = A.dot(B)
    actual = safe_sparse_dot(A, B, dense_output=dense_output)

    assert sparse.issparse(actual) == (not dense_output)

    if dense_output:
        expected = expected.toarray()
    assert_allclose_dense_sparse(actual, expected)
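
A small usage sketch of the function under test (editor's addition): `safe_sparse_dot` dispatches to the right product for any mix of dense and sparse operands, and `dense_output=True` densifies a sparse result.

import numpy as np
from scipy import sparse
from sklearn.utils.extmath import safe_sparse_dot

A = sparse.random(5, 3, density=0.5, random_state=0)
B = np.ones((3, 2))
print(safe_sparse_dot(A, B))                     # dense, since B is dense
C = sparse.random(3, 2, density=0.5, random_state=1)
print(safe_sparse_dot(A, C, dense_output=True))  # densified sparse product
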
31
venv/Lib/site-packages/sklearn/utils/tests/test_fast_dict.py
Normal file
@ -0,0 +1,31 @@
""" Test fast_dict.
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
from sklearn.utils._fast_dict import IntFloatDict, argmin
|
||||
|
||||
|
||||
def test_int_float_dict():
|
||||
rng = np.random.RandomState(0)
|
||||
keys = np.unique(rng.randint(100, size=10).astype(np.intp))
|
||||
values = rng.rand(len(keys))
|
||||
|
||||
d = IntFloatDict(keys, values)
|
||||
for key, value in zip(keys, values):
|
||||
assert d[key] == value
|
||||
assert len(d) == len(keys)
|
||||
|
||||
d.append(120, 3.)
|
||||
assert d[120] == 3.0
|
||||
assert len(d) == len(keys) + 1
|
||||
for i in range(2000):
|
||||
d.append(i + 1000, 4.0)
|
||||
assert d[1100] == 4.0
|
||||
|
||||
|
||||
def test_int_float_dict_argmin():
|
||||
# Test the argmin implementation on the IntFloatDict
|
||||
keys = np.arange(100, dtype=np.intp)
|
||||
values = np.arange(100, dtype=np.float64)
|
||||
d = IntFloatDict(keys, values)
|
||||
assert argmin(d) == (0, 0)
|
91
venv/Lib/site-packages/sklearn/utils/tests/test_fixes.py
Normal file
@ -0,0 +1,91 @@
# Authors: Gael Varoquaux <gael.varoquaux@normalesup.org>
#          Justin Vincent
#          Lars Buitinck
# License: BSD 3 clause

import math

import numpy as np
import pytest
import scipy.stats

from sklearn.utils._testing import assert_array_equal

from sklearn.utils.fixes import _joblib_parallel_args
from sklearn.utils.fixes import _object_dtype_isnan
from sklearn.utils.fixes import loguniform
from sklearn.utils.fixes import MaskedArray


@pytest.mark.parametrize('joblib_version', ('0.11', '0.12.0'))
def test_joblib_parallel_args(monkeypatch, joblib_version):
    import joblib
    monkeypatch.setattr(joblib, '__version__', joblib_version)

    if joblib_version == '0.12.0':
        # arguments are simply passed through
        assert _joblib_parallel_args(prefer='threads') == {'prefer': 'threads'}
        assert _joblib_parallel_args(prefer='processes', require=None) == {
            'prefer': 'processes', 'require': None}
        assert _joblib_parallel_args(non_existing=1) == {'non_existing': 1}
    elif joblib_version == '0.11':
        # arguments are mapped to the corresponding backend
        assert _joblib_parallel_args(prefer='threads') == {
            'backend': 'threading'}
        assert _joblib_parallel_args(prefer='processes') == {
            'backend': 'multiprocessing'}
        with pytest.raises(ValueError):
            _joblib_parallel_args(prefer='invalid')
        assert _joblib_parallel_args(
            prefer='processes', require='sharedmem') == {
                'backend': 'threading'}
        with pytest.raises(ValueError):
            _joblib_parallel_args(require='invalid')
        with pytest.raises(NotImplementedError):
            _joblib_parallel_args(verbose=True)
    else:
        raise ValueError


@pytest.mark.parametrize("dtype, val", ([object, 1],
                                        [object, "a"],
                                        [float, 1]))
def test_object_dtype_isnan(dtype, val):
    X = np.array([[val, np.nan],
                  [np.nan, val]], dtype=dtype)

    expected_mask = np.array([[False, True],
                              [True, False]])

    mask = _object_dtype_isnan(X)

    assert_array_equal(mask, expected_mask)


@pytest.mark.parametrize("low,high,base",
                         [(-1, 0, 10), (0, 2, np.exp(1)), (-1, 1, 2)])
def test_loguniform(low, high, base):
    rv = loguniform(base ** low, base ** high)
    assert isinstance(rv, scipy.stats._distn_infrastructure.rv_frozen)
    rvs = rv.rvs(size=2000, random_state=0)

    # Test the basics; right bounds, right size
    assert (base ** low <= rvs).all() and (rvs <= base ** high).all()
    assert len(rvs) == 2000

    # Test that it's actually (fairly) uniform
    log_rvs = np.array([math.log(x, base) for x in rvs])
    counts, _ = np.histogram(log_rvs)
    assert counts.mean() == 200
    assert np.abs(counts - counts.mean()).max() <= 40

    # Test that random_state works
    assert (
        loguniform(base ** low, base ** high).rvs(random_state=0)
        == loguniform(base ** low, base ** high).rvs(random_state=0)
    )
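
A hedged sketch of the intended use: `loguniform` samples uniformly in log space, the usual prior for scale-type hyperparameters such as regularization strengths in a randomized search.

import numpy as np
from sklearn.utils.fixes import loguniform

rv = loguniform(1e-4, 1e0)
samples = rv.rvs(size=10000, random_state=0)
# Each decade [1e-4, 1e-3), ..., [1e-1, 1e0) gets roughly a quarter of draws.
counts, _ = np.histogram(np.log10(samples), bins=4, range=(-4, 0))
print(counts)
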
def test_masked_array_deprecated():  # TODO: remove in 0.25
    with pytest.warns(FutureWarning, match='is deprecated'):
        MaskedArray()

@ -0,0 +1,77 @@
from sklearn.utils.metaestimators import if_delegate_has_method


class Prefix:
    def func(self):
        pass


class MockMetaEstimator:
    """This is a mock meta estimator"""
    a_prefix = Prefix()

    @if_delegate_has_method(delegate="a_prefix")
    def func(self):
        """This is a mock delegated function"""
        pass


def test_delegated_docstring():
    assert "This is a mock delegated function" \
        in str(MockMetaEstimator.__dict__['func'].__doc__)
    assert "This is a mock delegated function" \
        in str(MockMetaEstimator.func.__doc__)
    assert "This is a mock delegated function" \
        in str(MockMetaEstimator().func.__doc__)


class MetaEst:
    """A mock meta estimator"""
    def __init__(self, sub_est, better_sub_est=None):
        self.sub_est = sub_est
        self.better_sub_est = better_sub_est

    @if_delegate_has_method(delegate='sub_est')
    def predict(self):
        pass


class MetaEstTestTuple(MetaEst):
    """A mock meta estimator to test passing a tuple of delegates"""

    @if_delegate_has_method(delegate=('sub_est', 'better_sub_est'))
    def predict(self):
        pass


class MetaEstTestList(MetaEst):
    """A mock meta estimator to test passing a list of delegates"""

    @if_delegate_has_method(delegate=['sub_est', 'better_sub_est'])
    def predict(self):
        pass


class HasPredict:
    """A mock sub-estimator with predict method"""

    def predict(self):
        pass


class HasNoPredict:
    """A mock sub-estimator with no predict method"""
    pass


def test_if_delegate_has_method():
    assert hasattr(MetaEst(HasPredict()), 'predict')
    assert not hasattr(MetaEst(HasNoPredict()), 'predict')
    assert not hasattr(MetaEstTestTuple(HasNoPredict(), HasNoPredict()),
                       'predict')
    assert hasattr(MetaEstTestTuple(HasPredict(), HasNoPredict()), 'predict')
    assert not hasattr(MetaEstTestTuple(HasNoPredict(), HasPredict()),
                       'predict')
    assert not hasattr(MetaEstTestList(HasNoPredict(), HasPredict()),
                       'predict')
    assert hasattr(MetaEstTestList(HasPredict(), HasPredict()), 'predict')
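
A minimal sketch of the mechanism these mocks exercise (editor's addition): `if_delegate_has_method` makes the decorated method exist on the wrapper only when the named delegate provides it, so plain `hasattr` checks drive the duck typing.

from sklearn.utils.metaestimators import if_delegate_has_method

class Wrapper:
    def __init__(self, estimator):
        self.estimator = estimator

    @if_delegate_has_method(delegate='estimator')
    def predict(self, X):
        # Only reachable when self.estimator itself has `predict`.
        return self.estimator.predict(X)

class WithPredict:
    def predict(self, X):
        return X

assert hasattr(Wrapper(WithPredict()), 'predict')
assert not hasattr(Wrapper(object()), 'predict')
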
439
venv/Lib/site-packages/sklearn/utils/tests/test_multiclass.py
Normal file
@ -0,0 +1,439 @@
import numpy as np
import scipy.sparse as sp
from itertools import product
import pytest

from scipy.sparse import issparse
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
from scipy.sparse import dok_matrix
from scipy.sparse import lil_matrix

from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_allclose
from sklearn.utils.estimator_checks import _NotAnArray
from sklearn.utils.fixes import parse_version

from sklearn.utils.multiclass import unique_labels
from sklearn.utils.multiclass import is_multilabel
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.multiclass import class_distribution
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.multiclass import _ovr_decision_function

from sklearn.utils.metaestimators import _safe_split
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVC
from sklearn import datasets


EXAMPLES = {
    'multilabel-indicator': [
        # valid when the data is formatted as sparse or dense, identified
        # by CSR format when the testing takes place
        csr_matrix(np.random.RandomState(42).randint(2, size=(10, 10))),
        [[0, 1], [1, 0]],
        [[0, 1]],
        csr_matrix(np.array([[0, 1], [1, 0]])),
        csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.bool)),
        csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.int8)),
        csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.uint8)),
        csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.float)),
        csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.float32)),
        csr_matrix(np.array([[0, 0], [0, 0]])),
        csr_matrix(np.array([[0, 1]])),
        # Only valid when data is dense
        [[-1, 1], [1, -1]],
        np.array([[-1, 1], [1, -1]]),
        np.array([[-3, 3], [3, -3]]),
        _NotAnArray(np.array([[-3, 3], [3, -3]])),
    ],
    'multiclass': [
        [1, 0, 2, 2, 1, 4, 2, 4, 4, 4],
        np.array([1, 0, 2]),
        np.array([1, 0, 2], dtype=np.int8),
        np.array([1, 0, 2], dtype=np.uint8),
        np.array([1, 0, 2], dtype=np.float),
        np.array([1, 0, 2], dtype=np.float32),
        np.array([[1], [0], [2]]),
        _NotAnArray(np.array([1, 0, 2])),
        [0, 1, 2],
        ['a', 'b', 'c'],
        np.array(['a', 'b', 'c']),
        np.array(['a', 'b', 'c'], dtype=object),
        np.array(['a', 'b', 'c'], dtype=object),
    ],
    'multiclass-multioutput': [
        [[1, 0, 2, 2], [1, 4, 2, 4]],
        [['a', 'b'], ['c', 'd']],
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]]),
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8),
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8),
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float),
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32),
        np.array([['a', 'b'], ['c', 'd']]),
        np.array([['a', 'b'], ['c', 'd']]),
        np.array([['a', 'b'], ['c', 'd']], dtype=object),
        np.array([[1, 0, 2]]),
        _NotAnArray(np.array([[1, 0, 2]])),
    ],
    'binary': [
        [0, 1],
        [1, 1],
        [],
        [0],
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.bool),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32),
        np.array([[0], [1]]),
        _NotAnArray(np.array([[0], [1]])),
        [1, -1],
        [3, 5],
        ['a'],
        ['a', 'b'],
        ['abc', 'def'],
        np.array(['abc', 'def']),
        ['a', 'b'],
        np.array(['abc', 'def'], dtype=object),
    ],
    'continuous': [
        [1e-5],
        [0, .5],
        np.array([[0], [.5]]),
        np.array([[0], [.5]], dtype=np.float32),
    ],
    'continuous-multioutput': [
        np.array([[0, .5], [.5, 0]]),
        np.array([[0, .5], [.5, 0]], dtype=np.float32),
        np.array([[0, .5]]),
    ],
    'unknown': [
        [[]],
        [()],
        # sequence of sequences that weren't supported even before deprecation
        np.array([np.array([]), np.array([1, 2, 3])], dtype=object),
        [np.array([]), np.array([1, 2, 3])],
        [{1, 2, 3}, {1, 2}],
        [frozenset([1, 2, 3]), frozenset([1, 2])],

        # and also confusable as sequences of sequences
        [{0: 'a', 1: 'b'}, {0: 'a'}],

        # empty second dimension
        np.array([[], []]),

        # 3d
        np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]),
    ]
}

NON_ARRAY_LIKE_EXAMPLES = [
    {1, 2, 3},
    {0: 'a', 1: 'b'},
    {0: [5], 1: [5]},
    'abc',
    frozenset([1, 2, 3]),
    None,
]

MULTILABEL_SEQUENCES = [
    [[1], [2], [0, 1]],
    [(), (2), (0, 1)],
    np.array([[], [1, 2]], dtype='object'),
    _NotAnArray(np.array([[], [1, 2]], dtype='object'))
]


def test_unique_labels():
    # Empty iterable
    with pytest.raises(ValueError):
        unique_labels()

    # Multiclass problem
    assert_array_equal(unique_labels(range(10)), np.arange(10))
    assert_array_equal(unique_labels(np.arange(10)), np.arange(10))
    assert_array_equal(unique_labels([4, 0, 2]), np.array([0, 2, 4]))

    # Multilabel indicator
    assert_array_equal(unique_labels(np.array([[0, 0, 1],
                                               [1, 0, 1],
                                               [0, 0, 0]])),
                       np.arange(3))

    assert_array_equal(unique_labels(np.array([[0, 0, 1],
                                               [0, 0, 0]])),
                       np.arange(3))

    # Several arrays passed
    assert_array_equal(unique_labels([4, 0, 2], range(5)),
                       np.arange(5))
    assert_array_equal(unique_labels((0, 1, 2), (0,), (2, 1)),
                       np.arange(3))

    # Border line case with binary indicator matrix
    with pytest.raises(ValueError):
        unique_labels([4, 0, 2], np.ones((5, 5)))
    with pytest.raises(ValueError):
        unique_labels(np.ones((5, 4)), np.ones((5, 5)))

    assert_array_equal(unique_labels(np.ones((4, 5)), np.ones((5, 5))),
                       np.arange(5))


def test_unique_labels_non_specific():
    # Test unique_labels with a variety of collected examples

    # Smoke test for all supported formats
    for format in ["binary", "multiclass", "multilabel-indicator"]:
        for y in EXAMPLES[format]:
            unique_labels(y)

    # We don't support those formats at the moment
    for example in NON_ARRAY_LIKE_EXAMPLES:
        with pytest.raises(ValueError):
            unique_labels(example)

    for y_type in ["unknown", "continuous", 'continuous-multioutput',
                   'multiclass-multioutput']:
        for example in EXAMPLES[y_type]:
            with pytest.raises(ValueError):
                unique_labels(example)


def test_unique_labels_mixed_types():
    # Mix with binary or multiclass and multilabel
    mix_clf_format = product(EXAMPLES["multilabel-indicator"],
                             EXAMPLES["multiclass"] +
                             EXAMPLES["binary"])

    for y_multilabel, y_multiclass in mix_clf_format:
        with pytest.raises(ValueError):
            unique_labels(y_multiclass, y_multilabel)
        with pytest.raises(ValueError):
            unique_labels(y_multilabel, y_multiclass)

    with pytest.raises(ValueError):
        unique_labels([[1, 2]], [["a", "d"]])

    with pytest.raises(ValueError):
        unique_labels(["1", 2])

    with pytest.raises(ValueError):
        unique_labels([["1", 2], [1, 3]])

    with pytest.raises(ValueError):
        unique_labels([["1", "2"], [2, 3]])


def test_is_multilabel():
    for group, group_examples in EXAMPLES.items():
        if group in ['multilabel-indicator']:
            dense_exp = True
        else:
            dense_exp = False

        for example in group_examples:
            # Only mark explicitly defined sparse examples as valid sparse
            # multilabel-indicators
            if group == 'multilabel-indicator' and issparse(example):
                sparse_exp = True
            else:
                sparse_exp = False

            if (issparse(example) or
                    (hasattr(example, '__array__') and
                     np.asarray(example).ndim == 2 and
                     np.asarray(example).dtype.kind in 'biuf' and
                     np.asarray(example).shape[1] > 0)):
                examples_sparse = [sparse_matrix(example)
                                   for sparse_matrix in [coo_matrix,
                                                         csc_matrix,
                                                         csr_matrix,
                                                         dok_matrix,
                                                         lil_matrix]]
                for exmpl_sparse in examples_sparse:
                    assert sparse_exp == is_multilabel(exmpl_sparse), (
                        'is_multilabel(%r) should be %s'
                        % (exmpl_sparse, sparse_exp))

            # Densify sparse examples before testing
            if issparse(example):
                example = example.toarray()

            assert dense_exp == is_multilabel(example), (
                'is_multilabel(%r) should be %s'
                % (example, dense_exp))


def test_check_classification_targets():
    for y_type in EXAMPLES.keys():
        if y_type in ["unknown", "continuous", 'continuous-multioutput']:
            for example in EXAMPLES[y_type]:
                msg = 'Unknown label type: '
                with pytest.raises(ValueError, match=msg):
                    check_classification_targets(example)
        else:
            for example in EXAMPLES[y_type]:
                check_classification_targets(example)


# @ignore_warnings
def test_type_of_target():
    for group, group_examples in EXAMPLES.items():
        for example in group_examples:
            assert type_of_target(example) == group, (
                'type_of_target(%r) should be %r, got %r'
                % (example, group, type_of_target(example)))

    for example in NON_ARRAY_LIKE_EXAMPLES:
        msg_regex = r'Expected array-like \(array or non-string sequence\).*'
        with pytest.raises(ValueError, match=msg_regex):
            type_of_target(example)

    for example in MULTILABEL_SEQUENCES:
        msg = ('You appear to be using a legacy multi-label data '
               'representation. Sequence of sequences are no longer supported;'
               ' use a binary array or sparse matrix instead.')
        with pytest.raises(ValueError, match=msg):
            type_of_target(example)
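
For orientation, the kinds of strings `type_of_target` returns for common targets (values taken from the groups tested above):

from sklearn.utils.multiclass import type_of_target

print(type_of_target([0, 1, 1, 0]))            # 'binary'
print(type_of_target([1, 0, 2]))               # 'multiclass'
print(type_of_target([0.5, 1.2]))              # 'continuous'
print(type_of_target([[0, 1], [1, 0]]))        # 'multilabel-indicator'
print(type_of_target([[1, 0, 2], [0, 1, 3]]))  # 'multiclass-multioutput'
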
def test_type_of_target_pandas_sparse():
    pd = pytest.importorskip("pandas")

    if parse_version(pd.__version__) >= parse_version('0.25'):
        pd_sparse_array = pd.arrays.SparseArray
    else:
        pd_sparse_array = pd.SparseArray

    y = pd_sparse_array([1, np.nan, np.nan, 1, np.nan])
    msg = "y cannot be class 'SparseSeries' or 'SparseArray'"
    with pytest.raises(ValueError, match=msg):
        type_of_target(y)


def test_class_distribution():
    y = np.array([[1, 0, 0, 1],
                  [2, 2, 0, 1],
                  [1, 3, 0, 1],
                  [4, 2, 0, 1],
                  [2, 0, 0, 1],
                  [1, 3, 0, 1]])
    # Define the sparse matrix with a mix of implicit and explicit zeros
    data = np.array([1, 2, 1, 4, 2, 1, 0, 2, 3, 2, 3, 1, 1, 1, 1, 1, 1])
    indices = np.array([0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 4, 5])
    indptr = np.array([0, 6, 11, 11, 17])
    y_sp = sp.csc_matrix((data, indices, indptr), shape=(6, 4))

    classes, n_classes, class_prior = class_distribution(y)
    classes_sp, n_classes_sp, class_prior_sp = class_distribution(y_sp)
    classes_expected = [[1, 2, 4],
                        [0, 2, 3],
                        [0],
                        [1]]
    n_classes_expected = [3, 3, 1, 1]
    class_prior_expected = [[3/6, 2/6, 1/6],
                            [1/3, 1/3, 1/3],
                            [1.0],
                            [1.0]]

    for k in range(y.shape[1]):
        assert_array_almost_equal(classes[k], classes_expected[k])
        assert_array_almost_equal(n_classes[k], n_classes_expected[k])
        assert_array_almost_equal(class_prior[k], class_prior_expected[k])

        assert_array_almost_equal(classes_sp[k], classes_expected[k])
        assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
        assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])

    # Test again with explicit sample weights
    (classes,
     n_classes,
     class_prior) = class_distribution(y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0])
    (classes_sp,
     n_classes_sp,
     class_prior_sp) = class_distribution(y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0])
    class_prior_expected = [[4/9, 3/9, 2/9],
                            [2/9, 4/9, 3/9],
                            [1.0],
                            [1.0]]

    for k in range(y.shape[1]):
        assert_array_almost_equal(classes[k], classes_expected[k])
        assert_array_almost_equal(n_classes[k], n_classes_expected[k])
        assert_array_almost_equal(class_prior[k], class_prior_expected[k])

        assert_array_almost_equal(classes_sp[k], classes_expected[k])
        assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
        assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])


def test_safe_split_with_precomputed_kernel():
    clf = SVC()
    clfp = SVC(kernel="precomputed")

    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    cv = ShuffleSplit(test_size=0.25, random_state=0)
    train, test = list(cv.split(X))[0]

    X_train, y_train = _safe_split(clf, X, y, train)
    K_train, y_train2 = _safe_split(clfp, K, y, train)
    assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))
    assert_array_almost_equal(y_train, y_train2)

    X_test, y_test = _safe_split(clf, X, y, test, train)
    K_test, y_test2 = _safe_split(clfp, K, y, test, train)
    assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
    assert_array_almost_equal(y_test, y_test2)


def test_ovr_decision_function():
    # test properties for ovr decision function

    predictions = np.array([[0, 1, 1],
                            [0, 1, 0],
                            [0, 1, 1],
                            [0, 1, 1]])

    confidences = np.array([[-1e16, 0, -1e16],
                            [1., 2., -3.],
                            [-5., 2., 5.],
                            [-0.5, 0.2, 0.5]])

    n_classes = 3

    dec_values = _ovr_decision_function(predictions, confidences, n_classes)

    # check that the decision values are within 0.5 range of the votes
    votes = np.array([[1, 0, 2],
                      [1, 1, 1],
                      [1, 0, 2],
                      [1, 0, 2]])

    assert_allclose(votes, dec_values, atol=0.5)

    # check that the predictions are what we expect:
    # highest vote, or highest confidence if there is a tie.
    # for the second sample we have a tie (should be won by 1)
    expected_prediction = np.array([2, 1, 2, 2])
    assert_array_equal(np.argmax(dec_values, axis=1), expected_prediction)

    # third and fourth sample have the same vote but third sample
    # has higher confidence, this should reflect on the decision values
    assert (dec_values[2, 2] > dec_values[3, 2])

    # assert subset invariance.
    dec_values_one = [_ovr_decision_function(np.array([predictions[i]]),
                                             np.array([confidences[i]]),
                                             n_classes)[0] for i in range(4)]

    assert_allclose(dec_values, dec_values_one, atol=1e-6)

@ -0,0 +1,78 @@
|
|||
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#
# License: BSD 3 clause

import numpy as np
from sklearn.utils.murmurhash import murmurhash3_32
from numpy.testing import assert_array_almost_equal
from numpy.testing import assert_array_equal


def test_mmhash3_int():
    assert murmurhash3_32(3) == 847579505
    assert murmurhash3_32(3, seed=0) == 847579505
    assert murmurhash3_32(3, seed=42) == -1823081949

    assert murmurhash3_32(3, positive=False) == 847579505
    assert murmurhash3_32(3, seed=0, positive=False) == 847579505
    assert murmurhash3_32(3, seed=42, positive=False) == -1823081949

    assert murmurhash3_32(3, positive=True) == 847579505
    assert murmurhash3_32(3, seed=0, positive=True) == 847579505
    assert murmurhash3_32(3, seed=42, positive=True) == 2471885347


def test_mmhash3_int_array():
    rng = np.random.RandomState(42)
    keys = rng.randint(-5342534, 345345, size=3 * 2 * 1).astype(np.int32)
    keys = keys.reshape((3, 2, 1))

    for seed in [0, 42]:
        expected = np.array([murmurhash3_32(int(k), seed)
                             for k in keys.flat])
        expected = expected.reshape(keys.shape)
        assert_array_equal(murmurhash3_32(keys, seed), expected)

    for seed in [0, 42]:
        expected = np.array([murmurhash3_32(k, seed, positive=True)
                             for k in keys.flat])
        expected = expected.reshape(keys.shape)
        assert_array_equal(murmurhash3_32(keys, seed, positive=True),
                           expected)


def test_mmhash3_bytes():
    assert murmurhash3_32(b'foo', 0) == -156908512
    assert murmurhash3_32(b'foo', 42) == -1322301282

    assert murmurhash3_32(b'foo', 0, positive=True) == 4138058784
    assert murmurhash3_32(b'foo', 42, positive=True) == 2972666014


def test_mmhash3_unicode():
    assert murmurhash3_32('foo', 0) == -156908512
    assert murmurhash3_32('foo', 42) == -1322301282

    assert murmurhash3_32('foo', 0, positive=True) == 4138058784
    assert murmurhash3_32('foo', 42, positive=True) == 2972666014


def test_no_collision_on_byte_range():
    previous_hashes = set()
    for i in range(100):
        h = murmurhash3_32(' ' * i, 0)
        assert h not in previous_hashes, \
            "Found collision on growing space-padded string"
        previous_hashes.add(h)


def test_uniform_distribution():
    n_bins, n_samples = 10, 100000
    bins = np.zeros(n_bins, dtype=np.float64)

    for i in range(n_samples):
        bins[murmurhash3_32(i, positive=True) % n_bins] += 1

    means = bins / n_samples
    expected = np.full(n_bins, 1. / n_bins)

    assert_array_almost_equal(means / expected, np.ones(n_bins), 2)
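
# A short usage sketch of the API exercised above: with a fixed seed and
# positive=True, murmurhash3_32 gives a stable non-negative 32-bit value,
# which makes it suitable for assigning tokens to hash buckets (the helper
# name and bucket count below are illustrative, not part of the library).
from sklearn.utils.murmurhash import murmurhash3_32

def feature_bucket(token, n_buckets=2 ** 20, seed=0):
    return murmurhash3_32(token, seed=seed, positive=True) % n_buckets

assert feature_bucket('foo') == feature_bucket('foo')  # deterministic
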
32
venv/Lib/site-packages/sklearn/utils/tests/test_optimize.py
Normal file
@@ -0,0 +1,32 @@
import numpy as np

from sklearn.utils.optimize import _newton_cg
from scipy.optimize import fmin_ncg

from sklearn.utils._testing import assert_array_almost_equal


def test_newton_cg():
    # Test that newton_cg gives the same result as scipy's fmin_ncg

    rng = np.random.RandomState(0)
    A = rng.normal(size=(10, 10))
    x0 = np.ones(10)

    def func(x):
        Ax = A.dot(x)
        return .5 * (Ax).dot(Ax)

    def grad(x):
        return A.T.dot(A.dot(x))

    def hess(x, p):
        # Hessian-vector product: the Hessian of func is the constant matrix
        # A.T A, so fhess_p must return A.T A p (written here as p (A.T A),
        # which is equal by symmetry).
        return p.dot(A.T.dot(A))

    def grad_hess(x):
        return grad(x), lambda x: A.T.dot(A.dot(x))

    assert_array_almost_equal(
        _newton_cg(grad_hess, func, grad, x0, tol=1e-10)[0],
        fmin_ncg(f=func, x0=x0, fprime=grad, fhess_p=hess)
    )
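
# A quick numerical cross-check of the Hessian-vector product used above:
# for this quadratic func, grad is linear, so a finite difference of grad
# recovers H @ p almost exactly (eps is an arbitrary small step).
import numpy as np

rng = np.random.RandomState(0)
A = rng.normal(size=(10, 10))
x = np.ones(10)
p = rng.normal(size=10)
eps = 1e-6
grad = lambda z: A.T.dot(A.dot(z))
hvp_fd = (grad(x + eps * p) - grad(x)) / eps   # finite-difference H @ p
hvp = A.T.dot(A.dot(p))                        # analytic H @ p
assert np.allclose(hvp_fd, hvp, atol=1e-4)
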
576
venv/Lib/site-packages/sklearn/utils/tests/test_pprint.py
Normal file
@@ -0,0 +1,576 @@
import re
from pprint import PrettyPrinter

import numpy as np

from sklearn.utils._pprint import _EstimatorPrettyPrinter
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import set_config, config_context


# Ignore flake8 (lots of line-too-long issues)
# flake8: noqa

# Constructors excerpted to test pprinting
class LogisticRegression(BaseEstimator):
    def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0,
                 fit_intercept=True, intercept_scaling=1, class_weight=None,
                 random_state=None, solver='warn', max_iter=100,
                 multi_class='warn', verbose=0, warm_start=False, n_jobs=None,
                 l1_ratio=None):
        self.penalty = penalty
        self.dual = dual
        self.tol = tol
        self.C = C
        self.fit_intercept = fit_intercept
        self.intercept_scaling = intercept_scaling
        self.class_weight = class_weight
        self.random_state = random_state
        self.solver = solver
        self.max_iter = max_iter
        self.multi_class = multi_class
        self.verbose = verbose
        self.warm_start = warm_start
        self.n_jobs = n_jobs
        self.l1_ratio = l1_ratio

    def fit(self, X, y):
        return self


class StandardScaler(TransformerMixin, BaseEstimator):
    def __init__(self, copy=True, with_mean=True, with_std=True):
        self.with_mean = with_mean
        self.with_std = with_std
        self.copy = copy

    def transform(self, X, copy=None):
        return self


class RFE(BaseEstimator):
    def __init__(self, estimator, n_features_to_select=None, step=1,
                 verbose=0):
        self.estimator = estimator
        self.n_features_to_select = n_features_to_select
        self.step = step
        self.verbose = verbose


class GridSearchCV(BaseEstimator):
    def __init__(self, estimator, param_grid, scoring=None,
                 n_jobs=None, iid='warn', refit=True, cv='warn', verbose=0,
                 pre_dispatch='2*n_jobs', error_score='raise-deprecating',
                 return_train_score=False):
        self.estimator = estimator
        self.param_grid = param_grid
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.iid = iid
        self.refit = refit
        self.cv = cv
        self.verbose = verbose
        self.pre_dispatch = pre_dispatch
        self.error_score = error_score
        self.return_train_score = return_train_score


class CountVectorizer(BaseEstimator):
    def __init__(self, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None,
                 lowercase=True, preprocessor=None, tokenizer=None,
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), analyzer='word',
                 max_df=1.0, min_df=1, max_features=None,
                 vocabulary=None, binary=False, dtype=np.int64):
        self.input = input
        self.encoding = encoding
        self.decode_error = decode_error
        self.strip_accents = strip_accents
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer
        self.analyzer = analyzer
        self.lowercase = lowercase
        self.token_pattern = token_pattern
        self.stop_words = stop_words
        self.max_df = max_df
        self.min_df = min_df
        self.max_features = max_features
        self.ngram_range = ngram_range
        self.vocabulary = vocabulary
        self.binary = binary
        self.dtype = dtype


class Pipeline(BaseEstimator):
    def __init__(self, steps, memory=None):
        self.steps = steps
        self.memory = memory


class SVC(BaseEstimator):
    def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='auto_deprecated',
                 coef0=0.0, shrinking=True, probability=False,
                 tol=1e-3, cache_size=200, class_weight=None,
                 verbose=False, max_iter=-1, decision_function_shape='ovr',
                 random_state=None):
        self.kernel = kernel
        self.degree = degree
        self.gamma = gamma
        self.coef0 = coef0
        self.tol = tol
        self.C = C
        self.shrinking = shrinking
        self.probability = probability
        self.cache_size = cache_size
        self.class_weight = class_weight
        self.verbose = verbose
        self.max_iter = max_iter
        self.decision_function_shape = decision_function_shape
        self.random_state = random_state


class PCA(BaseEstimator):
    def __init__(self, n_components=None, copy=True, whiten=False,
                 svd_solver='auto', tol=0.0, iterated_power='auto',
                 random_state=None):
        self.n_components = n_components
        self.copy = copy
        self.whiten = whiten
        self.svd_solver = svd_solver
        self.tol = tol
        self.iterated_power = iterated_power
        self.random_state = random_state


class NMF(BaseEstimator):
    def __init__(self, n_components=None, init=None, solver='cd',
                 beta_loss='frobenius', tol=1e-4, max_iter=200,
                 random_state=None, alpha=0., l1_ratio=0., verbose=0,
                 shuffle=False):
        self.n_components = n_components
        self.init = init
        self.solver = solver
        self.beta_loss = beta_loss
        self.tol = tol
        self.max_iter = max_iter
        self.random_state = random_state
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        self.verbose = verbose
        self.shuffle = shuffle


class SimpleImputer(BaseEstimator):
    def __init__(self, missing_values=np.nan, strategy="mean",
                 fill_value=None, verbose=0, copy=True):
        self.missing_values = missing_values
        self.strategy = strategy
        self.fill_value = fill_value
        self.verbose = verbose
        self.copy = copy


def test_basic(print_changed_only_false):
    # Basic pprint test
    lr = LogisticRegression()
    expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)"""

    expected = expected[1:]  # remove first \n
    assert lr.__repr__() == expected


def test_changed_only():
    # Make sure the changed_only param is correctly used when True (default)
    lr = LogisticRegression(C=99)
    expected = """LogisticRegression(C=99)"""
    assert lr.__repr__() == expected

    # Check with a repr that doesn't fit on a single line
    lr = LogisticRegression(C=99, class_weight=.4, fit_intercept=False,
                            tol=1234, verbose=True)
    expected = """
LogisticRegression(C=99, class_weight=0.4, fit_intercept=False, tol=1234,
                   verbose=True)"""
    expected = expected[1:]  # remove first \n
    assert lr.__repr__() == expected

    imputer = SimpleImputer(missing_values=0)
    expected = """SimpleImputer(missing_values=0)"""
    assert imputer.__repr__() == expected

    # Defaults to np.nan, trying with float('NaN')
    imputer = SimpleImputer(missing_values=float('NaN'))
    expected = """SimpleImputer()"""
    assert imputer.__repr__() == expected

    # make sure array parameters don't throw error (see #13583)
    repr(LogisticRegressionCV(Cs=np.array([0.1, 1])))
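
# The switch the fixture above toggles, shown on a real estimator (aliased
# here to avoid clashing with the stripped-down class defined in this file):
from sklearn import config_context
from sklearn.linear_model import LogisticRegression as SkLogisticRegression

est = SkLogisticRegression(C=99)
print(repr(est))                  # only the changed parameter: "...(C=99)"
with config_context(print_changed_only=False):
    print(repr(est))              # the full parameter listing
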


def test_pipeline(print_changed_only_false):
    # Render a pipeline object
    pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=999))
    expected = """
Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=999, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)"""

    expected = expected[1:]  # remove first \n
    assert pipeline.__repr__() == expected


def test_deeply_nested(print_changed_only_false):
    # Render a deeply nested estimator
    rfe = RFE(RFE(RFE(RFE(RFE(RFE(RFE(LogisticRegression())))))))
    expected = """
RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=LogisticRegression(C=1.0,
                                                                                                                      class_weight=None,
                                                                                                                      dual=False,
                                                                                                                      fit_intercept=True,
                                                                                                                      intercept_scaling=1,
                                                                                                                      l1_ratio=None,
                                                                                                                      max_iter=100,
                                                                                                                      multi_class='warn',
                                                                                                                      n_jobs=None,
                                                                                                                      penalty='l2',
                                                                                                                      random_state=None,
                                                                                                                      solver='warn',
                                                                                                                      tol=0.0001,
                                                                                                                      verbose=0,
                                                                                                                      warm_start=False),
                                                                                        n_features_to_select=None,
                                                                                        step=1,
                                                                                        verbose=0),
                                                                          n_features_to_select=None,
                                                                          step=1,
                                                                          verbose=0),
                                                            n_features_to_select=None,
                                                            step=1, verbose=0),
                                              n_features_to_select=None, step=1,
                                              verbose=0),
                                n_features_to_select=None, step=1, verbose=0),
                  n_features_to_select=None, step=1, verbose=0),
    n_features_to_select=None, step=1, verbose=0)"""

    expected = expected[1:]  # remove first \n
    assert rfe.__repr__() == expected


def test_gridsearch(print_changed_only_false):
    # render a gridsearch
    param_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                   'C': [1, 10, 100, 1000]},
                  {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
    gs = GridSearchCV(SVC(), param_grid, cv=5)

    expected = """
GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']},
                         {'C': [1, 10, 100, 1000], 'kernel': ['linear']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)"""

    expected = expected[1:]  # remove first \n
    assert gs.__repr__() == expected


def test_gridsearch_pipeline(print_changed_only_false):
    # render a pipeline inside a gridsearch
    pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True)

    pipeline = Pipeline([
        ('reduce_dim', PCA()),
        ('classify', SVC())
    ])
    N_FEATURES_OPTIONS = [2, 4, 8]
    C_OPTIONS = [1, 10, 100, 1000]
    param_grid = [
        {
            'reduce_dim': [PCA(iterated_power=7), NMF()],
            'reduce_dim__n_components': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
        {
            'reduce_dim': [SelectKBest(chi2)],
            'reduce_dim__k': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        }
    ]
    gs_pipeline = GridSearchCV(pipeline, cv=3, n_jobs=1, param_grid=param_grid)
    expected = """
GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('reduce_dim',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('classify',
                                        SVC(C=1.0, cache_size=200,
                                            class_weight=None, coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='auto_deprecated',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.001, verbose=False))]),
             iid='warn', n_jobs=1,
             param_grid=[{'classify__C': [1, 10, 100, 1000],
                          'reduce_dim': [PCA(copy=True, iterated_power=7,
                                             n_components=None,
                                             random_state=None,
                                             svd_solver='auto', tol=0.0,
                                             whiten=False),
                                         NMF(alpha=0.0, beta_loss='frobenius',
                                             init=None, l1_ratio=0.0,
                                             max_iter=200, n_components=None,
                                             random_state=None, shuffle=False,
                                             solver='cd', tol=0.0001,
                                             verbose=0)],
                          'reduce_dim__n_components': [2, 4, 8]},
                         {'classify__C': [1, 10, 100, 1000],
                          'reduce_dim': [SelectKBest(k=10,
                                                     score_func=<function chi2 at some_address>)],
                          'reduce_dim__k': [2, 4, 8]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)"""

    expected = expected[1:]  # remove first \n
    repr_ = pp.pformat(gs_pipeline)
    # Remove the address of '<function chi2 at 0x.....>' for reproducibility
    repr_ = re.sub('function chi2 at 0x.*>',
                   'function chi2 at some_address>', repr_)
    assert repr_ == expected


def test_n_max_elements_to_show(print_changed_only_false):

    n_max_elements_to_show = 30
    pp = _EstimatorPrettyPrinter(
        compact=True, indent=1, indent_at_name=True,
        n_max_elements_to_show=n_max_elements_to_show
    )

    # No ellipsis
    vocabulary = {i: i for i in range(n_max_elements_to_show)}
    vectorizer = CountVectorizer(vocabulary=vocabulary)

    expected = r"""
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None,
                vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7,
                            8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14,
                            15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20,
                            21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26,
                            27: 27, 28: 28, 29: 29})"""

    expected = expected[1:]  # remove first \n
    assert pp.pformat(vectorizer) == expected

    # Now with ellipsis
    vocabulary = {i: i for i in range(n_max_elements_to_show + 1)}
    vectorizer = CountVectorizer(vocabulary=vocabulary)

    expected = r"""
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None,
                vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7,
                            8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14,
                            15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20,
                            21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26,
                            27: 27, 28: 28, 29: 29, ...})"""

    expected = expected[1:]  # remove first \n
    assert pp.pformat(vectorizer) == expected

    # Also test with lists
    param_grid = {'C': list(range(n_max_elements_to_show))}
    gs = GridSearchCV(SVC(), param_grid)
    expected = """
GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                               15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
                               27, 28, 29]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)"""

    expected = expected[1:]  # remove first \n
    assert pp.pformat(gs) == expected

    # Now with ellipsis
    param_grid = {'C': list(range(n_max_elements_to_show + 1))}
    gs = GridSearchCV(SVC(), param_grid)
    expected = """
GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                               15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
                               27, 28, 29, ...]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)"""

    expected = expected[1:]  # remove first \n
    assert pp.pformat(gs) == expected


def test_bruteforce_ellipsis(print_changed_only_false):
    # Check that the bruteforce ellipsis (used when the number of non-blank
    # characters exceeds N_CHAR_MAX) renders correctly.

    lr = LogisticRegression()

    # test when the left and right side of the ellipsis aren't on the same
    # line.
    expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   in...
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)"""

    expected = expected[1:]  # remove first \n
    assert expected == lr.__repr__(N_CHAR_MAX=150)

    # test with very small N_CHAR_MAX
    # Note that N_CHAR_MAX is not strictly enforced, but that's expected: to
    # avoid weird reprs we still keep the whole line of the right part (after
    # the ellipsis).
    expected = """
Lo...
                   warm_start=False)"""

    expected = expected[1:]  # remove first \n
    assert expected == lr.__repr__(N_CHAR_MAX=4)

    # test with N_CHAR_MAX == number of non-blank characters: in this case we
    # don't want an ellipsis
    full_repr = lr.__repr__(N_CHAR_MAX=float('inf'))
    n_nonblank = len(''.join(full_repr.split()))
    assert lr.__repr__(N_CHAR_MAX=n_nonblank) == full_repr
    assert '...' not in full_repr

    # test with N_CHAR_MAX == number of non-blank characters - 10: the left
    # and right side of the ellipsis are on different lines. In this case we
    # want to expand the whole line of the right side.
    expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_i...
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)"""
    expected = expected[1:]  # remove first \n
    assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 10)

    # test with N_CHAR_MAX == number of non-blank characters - 4: the left and
    # right side of the ellipsis are on the same line. In this case we don't
    # want to expand the whole line of the right side, just add the ellipsis
    # between the two sides.
    expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter...,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)"""
    expected = expected[1:]  # remove first \n
    assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 4)

    # test with N_CHAR_MAX == number of non-blank characters - 2: the left and
    # right side of the ellipsis are on the same line, but adding the ellipsis
    # would actually make the repr longer. So we don't add the ellipsis.
    expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)"""
    expected = expected[1:]  # remove first \n
    assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 2)
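
# The "number of non-blank characters" that N_CHAR_MAX is compared against,
# computed exactly as in the test above:
s = "LogisticRegression(C=1.0,\n                   dual=False)"
n = len(''.join(s.split()))   # all whitespace (spaces, newlines) dropped
assert n == len("LogisticRegression(C=1.0,dual=False)")
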


def test_builtin_prettyprinter():
    # non-regression test that ensures we can still use the builtin
    # PrettyPrinter class for estimators (as done e.g. by joblib).
    # Used to be a bug

    PrettyPrinter().pprint(LogisticRegression())


def test_kwargs_in_init():
    # Make sure the changed_only=True mode is OK when an argument is passed as
    # kwargs.
    # Non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/17206

    class WithKWargs(BaseEstimator):
        # Estimator with a kwargs argument. These need to hack around
        # set_params and get_params. Here we mimic what LightGBM does.
        def __init__(self, a='willchange', b='unchanged', **kwargs):
            self.a = a
            self.b = b
            self._other_params = {}
            self.set_params(**kwargs)

        def get_params(self, deep=True):
            params = super().get_params(deep=deep)
            params.update(self._other_params)
            return params

        def set_params(self, **params):
            for key, value in params.items():
                setattr(self, key, value)
                self._other_params[key] = value
            return self

    est = WithKWargs(a='something', c='abcd', d=None)

    expected = "WithKWargs(a='something', c='abcd', d=None)"
    assert expected == est.__repr__()

    with config_context(print_changed_only=False):
        expected = "WithKWargs(a='something', b='unchanged', c='abcd', d=None)"
        assert expected == est.__repr__()
187
venv/Lib/site-packages/sklearn/utils/tests/test_random.py
Normal file
@@ -0,0 +1,187 @@
import numpy as np
import pytest
import scipy.sparse as sp
from scipy.special import comb
from numpy.testing import assert_array_almost_equal

from sklearn.utils.random import _random_choice_csc, sample_without_replacement
from sklearn.utils._random import _our_rand_r_py


###############################################################################
# test custom sampling without replacement algorithm
###############################################################################
def test_invalid_sample_without_replacement_algorithm():
    with pytest.raises(ValueError):
        sample_without_replacement(5, 4, "unknown")


def test_sample_without_replacement_algorithms():
    methods = ("auto", "tracking_selection", "reservoir_sampling", "pool")

    for m in methods:
        def sample_without_replacement_method(n_population, n_samples,
                                              random_state=None):
            return sample_without_replacement(n_population, n_samples,
                                              method=m,
                                              random_state=random_state)

        check_edge_case_of_sample_int(sample_without_replacement_method)
        check_sample_int(sample_without_replacement_method)
        check_sample_int_distribution(sample_without_replacement_method)
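
# One of the method names above, written out as the textbook algorithm it
# presumably refers to (Vitter's Algorithm R; the library's Cython version
# may differ in details):
import numpy as np

def reservoir_sample(n_population, n_samples, rng=np.random):
    # keep the first n_samples indices, then replace reservoir slots with
    # decreasing probability so every index is equally likely overall
    reservoir = list(range(n_samples))
    for i in range(n_samples, n_population):
        j = rng.randint(0, i + 1)   # uniform over [0, i]
        if j < n_samples:
            reservoir[j] = i
    return np.array(reservoir)
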


def check_edge_case_of_sample_int(sample_without_replacement):

    # n_population < n_samples
    with pytest.raises(ValueError):
        sample_without_replacement(0, 1)
    with pytest.raises(ValueError):
        sample_without_replacement(1, 2)

    # n_population == n_samples
    assert sample_without_replacement(0, 0).shape == (0, )

    assert sample_without_replacement(1, 1).shape == (1, )

    # n_population >= n_samples
    assert sample_without_replacement(5, 0).shape == (0, )
    assert sample_without_replacement(5, 1).shape == (1, )

    # n_population < 0 or n_samples < 0
    with pytest.raises(ValueError):
        sample_without_replacement(-1, 5)
    with pytest.raises(ValueError):
        sample_without_replacement(5, -1)


def check_sample_int(sample_without_replacement):
    # This test is heavily inspired by test_random.py of python-core.
    #
    # For the entire allowable range of 0 <= k <= N, validate that
    # the sample is of the correct length and contains only unique items
    n_population = 100

    for n_samples in range(n_population + 1):
        s = sample_without_replacement(n_population, n_samples)
        assert len(s) == n_samples
        unique = np.unique(s)
        assert np.size(unique) == n_samples
        assert np.all(unique < n_population)

    # test edge case n_population == n_samples == 0
    assert np.size(sample_without_replacement(0, 0)) == 0


def check_sample_int_distribution(sample_without_replacement):
    # This test is heavily inspired by test_random.py of python-core.
    #
    # For the entire allowable range of 0 <= k <= N, validate that
    # the sample generates all possible combinations
    n_population = 10

    # a large number of trials prevents false negatives without slowing the
    # normal case
    n_trials = 10000

    for n_samples in range(n_population):
        # Counting the number of combinations is not as good as counting the
        # number of permutations. However, it works with sampling algorithms
        # that do not provide a random permutation of the subset of integers.
        n_expected = comb(n_population, n_samples, exact=True)

        output = {}
        for i in range(n_trials):
            output[frozenset(sample_without_replacement(n_population,
                                                        n_samples))] = None

            if len(output) == n_expected:
                break
        else:
            raise AssertionError(
                "number of combinations != number of expected (%s != %s)" %
                (len(output), n_expected))
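
# Why counting combinations terminates quickly here: with n_population = 10
# the largest target is comb(10, 5) = 252 distinct subsets, far below the
# 10000 trials allowed above.
from scipy.special import comb
assert comb(10, 5, exact=True) == 252
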


def test_random_choice_csc(n_samples=10000, random_state=24):
    # Explicit class probabilities
    classes = [np.array([0, 1]), np.array([0, 1, 2])]
    class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]

    got = _random_choice_csc(n_samples, classes, class_probabilities,
                             random_state)
    assert sp.issparse(got)

    for k in range(len(classes)):
        p = np.bincount(got.getcol(k).toarray().ravel()) / float(n_samples)
        assert_array_almost_equal(class_probabilities[k], p, decimal=1)

    # Implicit class probabilities
    classes = [[0, 1], [1, 2]]  # test for array-like support
    class_probabilities = [np.array([0.5, 0.5]), np.array([0, 1/2, 1/2])]

    got = _random_choice_csc(n_samples=n_samples,
                             classes=classes,
                             random_state=random_state)
    assert sp.issparse(got)

    for k in range(len(classes)):
        p = np.bincount(got.getcol(k).toarray().ravel()) / float(n_samples)
        assert_array_almost_equal(class_probabilities[k], p, decimal=1)

    # Edge case probabilities 1.0 and 0.0
    classes = [np.array([0, 1]), np.array([0, 1, 2])]
    class_probabilities = [np.array([1.0, 0.0]), np.array([0.0, 1.0, 0.0])]

    got = _random_choice_csc(n_samples, classes, class_probabilities,
                             random_state)
    assert sp.issparse(got)

    for k in range(len(classes)):
        p = np.bincount(got.getcol(k).toarray().ravel(),
                        minlength=len(class_probabilities[k])) / n_samples
        assert_array_almost_equal(class_probabilities[k], p, decimal=1)

    # One class target data
    classes = [[1], [0]]  # test for array-like support
    class_probabilities = [np.array([0.0, 1.0]), np.array([1.0])]

    got = _random_choice_csc(n_samples=n_samples,
                             classes=classes,
                             random_state=random_state)
    assert sp.issparse(got)

    for k in range(len(classes)):
        p = np.bincount(got.getcol(k).toarray().ravel()) / n_samples
        assert_array_almost_equal(class_probabilities[k], p, decimal=1)
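
# The dense reference behaviour the sparse helper is compared against: draw
# each column with numpy's choice, then sparsify (illustrative only):
import numpy as np
import scipy.sparse as sp

rng = np.random.RandomState(24)
col = rng.choice(np.array([0, 1, 2]), size=10000, p=[0.6, 0.1, 0.3])
col_sparse = sp.csc_matrix(col.reshape(-1, 1))   # zeros stay implicit
p = np.bincount(col, minlength=3) / 10000.
assert abs(p[2] - 0.3) < 0.05
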


def test_random_choice_csc_errors():
    # the length of an array in classes and class_probabilities is mismatched
    classes = [np.array([0, 1]), np.array([0, 1, 2, 3])]
    class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
    with pytest.raises(ValueError):
        _random_choice_csc(4, classes, class_probabilities, 1)

    # the class dtype is not supported (strings)
    classes = [np.array(["a", "1"]), np.array(["z", "1", "2"])]
    class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
    with pytest.raises(ValueError):
        _random_choice_csc(4, classes, class_probabilities, 1)

    # the class dtype is not supported (floats)
    classes = [np.array([4.2, 0.1]), np.array([0.1, 0.2, 9.4])]
    class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
    with pytest.raises(ValueError):
        _random_choice_csc(4, classes, class_probabilities, 1)

    # given probabilities don't sum to 1
    classes = [np.array([0, 1]), np.array([0, 1, 2])]
    class_probabilities = [np.array([0.5, 0.6]), np.array([0.6, 0.1, 0.3])]
    with pytest.raises(ValueError):
        _random_choice_csc(4, classes, class_probabilities, 1)


def test_our_rand_r():
    assert 131541053 == _our_rand_r_py(1273642419)
    assert 270369 == _our_rand_r_py(0)
153
venv/Lib/site-packages/sklearn/utils/tests/test_seq_dataset.py
Normal file
@@ -0,0 +1,153 @@
# Author: Tom Dupre la Tour
#         Joan Massich <mailsik@gmail.com>
#
# License: BSD 3 clause

import numpy as np
import pytest
import scipy.sparse as sp
from numpy.testing import assert_array_equal
from sklearn.utils._seq_dataset import (
    ArrayDataset32, ArrayDataset64, CSRDataset32, CSRDataset64)

from sklearn.datasets import load_iris
from sklearn.utils._testing import assert_allclose

iris = load_iris()
X64 = iris.data.astype(np.float64)
y64 = iris.target.astype(np.float64)
X_csr64 = sp.csr_matrix(X64)
sample_weight64 = np.arange(y64.size, dtype=np.float64)

X32 = iris.data.astype(np.float32)
y32 = iris.target.astype(np.float32)
X_csr32 = sp.csr_matrix(X32)
sample_weight32 = np.arange(y32.size, dtype=np.float32)


def assert_csr_equal_values(current, expected):
    current.eliminate_zeros()
    expected.eliminate_zeros()
    expected = expected.astype(current.dtype)
    assert current.shape[0] == expected.shape[0]
    assert current.shape[1] == expected.shape[1]
    assert_array_equal(current.data, expected.data)
    assert_array_equal(current.indices, expected.indices)
    assert_array_equal(current.indptr, expected.indptr)


def make_dense_dataset_32():
    return ArrayDataset32(X32, y32, sample_weight32, seed=42)


def make_dense_dataset_64():
    return ArrayDataset64(X64, y64, sample_weight64, seed=42)


def make_sparse_dataset_32():
    return CSRDataset32(X_csr32.data, X_csr32.indptr, X_csr32.indices, y32,
                        sample_weight32, seed=42)


def make_sparse_dataset_64():
    return CSRDataset64(X_csr64.data, X_csr64.indptr, X_csr64.indices, y64,
                        sample_weight64, seed=42)


@pytest.mark.parametrize('dataset_constructor', [
    make_dense_dataset_32,
    make_dense_dataset_64,
    make_sparse_dataset_32,
    make_sparse_dataset_64,
])
def test_seq_dataset_basic_iteration(dataset_constructor):
    NUMBER_OF_RUNS = 5
    dataset = dataset_constructor()
    for _ in range(NUMBER_OF_RUNS):
        # next sample
        xi_, yi, swi, idx = dataset._next_py()
        # xi_ is a (data, indices, indptr) triple describing one CSR row
        xi = sp.csr_matrix(xi_, shape=(1, X64.shape[1]))

        assert_csr_equal_values(xi, X_csr64[idx])
        assert yi == y64[idx]
        assert swi == sample_weight64[idx]

        # random sample
        xi_, yi, swi, idx = dataset._random_py()
        xi = sp.csr_matrix(xi_, shape=(1, X64.shape[1]))

        assert_csr_equal_values(xi, X_csr64[idx])
        assert yi == y64[idx]
        assert swi == sample_weight64[idx]
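
# What the (data, indices, indptr) triple returned by _next_py encodes, on a
# toy row (an illustration, not part of the test data):
import numpy as np
import scipy.sparse as sp

data = np.array([5.1, 3.5])    # non-zero values of the row
indices = np.array([0, 1])     # their column positions
indptr = np.array([0, 2])      # one row: entries [0, 2) belong to row 0
row = sp.csr_matrix((data, indices, indptr), shape=(1, 4))
assert row.toarray().tolist() == [[5.1, 3.5, 0.0, 0.0]]
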


@pytest.mark.parametrize('make_dense_dataset,make_sparse_dataset', [
    (make_dense_dataset_32, make_sparse_dataset_32),
    (make_dense_dataset_64, make_sparse_dataset_64),
])
def test_seq_dataset_shuffle(make_dense_dataset, make_sparse_dataset):
    dense_dataset, sparse_dataset = make_dense_dataset(), make_sparse_dataset()
    # not shuffled
    for i in range(5):
        _, _, _, idx1 = dense_dataset._next_py()
        _, _, _, idx2 = sparse_dataset._next_py()
        assert idx1 == i
        assert idx2 == i

    for i in [132, 50, 9, 18, 58]:
        _, _, _, idx1 = dense_dataset._random_py()
        _, _, _, idx2 = sparse_dataset._random_py()
        assert idx1 == i
        assert idx2 == i

    seed = 77
    dense_dataset._shuffle_py(seed)
    sparse_dataset._shuffle_py(seed)

    idx_next = [63, 91, 148, 87, 29]
    idx_shuffle = [137, 125, 56, 121, 127]
    for i, j in zip(idx_next, idx_shuffle):
        _, _, _, idx1 = dense_dataset._next_py()
        _, _, _, idx2 = sparse_dataset._next_py()
        assert idx1 == i
        assert idx2 == i

        _, _, _, idx1 = dense_dataset._random_py()
        _, _, _, idx2 = sparse_dataset._random_py()
        assert idx1 == j
        assert idx2 == j


@pytest.mark.parametrize('make_dataset_32,make_dataset_64', [
    (make_dense_dataset_32, make_dense_dataset_64),
    (make_sparse_dataset_32, make_sparse_dataset_64),
])
def test_fused_types_consistency(make_dataset_32, make_dataset_64):
    dataset_32, dataset_64 = make_dataset_32(), make_dataset_64()
    NUMBER_OF_RUNS = 5
    for _ in range(NUMBER_OF_RUNS):
        # next sample
        (xi_data32, _, _), yi32, _, _ = dataset_32._next_py()
        (xi_data64, _, _), yi64, _, _ = dataset_64._next_py()

        assert xi_data32.dtype == np.float32
        assert xi_data64.dtype == np.float64

        assert_allclose(xi_data64, xi_data32, rtol=1e-5)
        assert_allclose(yi64, yi32, rtol=1e-5)


def test_buffer_dtype_mismatch_error():
    with pytest.raises(ValueError, match='Buffer dtype mismatch'):
        ArrayDataset64(X32, y32, sample_weight32, seed=42)

    with pytest.raises(ValueError, match='Buffer dtype mismatch'):
        ArrayDataset32(X64, y64, sample_weight64, seed=42)

    with pytest.raises(ValueError, match='Buffer dtype mismatch'):
        CSRDataset64(X_csr32.data, X_csr32.indptr, X_csr32.indices, y32,
                     sample_weight32, seed=42)

    with pytest.raises(ValueError, match='Buffer dtype mismatch'):
        CSRDataset32(X_csr64.data, X_csr64.indptr, X_csr64.indices, y64,
                     sample_weight64, seed=42)
95
venv/Lib/site-packages/sklearn/utils/tests/test_graph.py
Normal file
@@ -0,0 +1,95 @@
from collections import defaultdict

import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn.utils.graph import (graph_shortest_path,
                                 single_source_shortest_path_length)


def floyd_warshall_slow(graph, directed=False):
    N = graph.shape[0]

    # set unconnected (zero) entries to infinity
    graph[np.where(graph == 0)] = np.inf

    # set diagonal to zero
    graph.flat[::N + 1] = 0

    if not directed:
        graph = np.minimum(graph, graph.T)

    for k in range(N):
        for i in range(N):
            for j in range(N):
                graph[i, j] = min(graph[i, j], graph[i, k] + graph[k, j])

    graph[np.where(np.isinf(graph))] = 0

    return graph


def generate_graph(N=20):
    # sparse grid of distances
    rng = np.random.RandomState(0)
    dist_matrix = rng.random_sample((N, N))

    # make symmetric: distances are not direction-dependent
    dist_matrix = dist_matrix + dist_matrix.T

    # make graph sparse
    i = (rng.randint(N, size=N * N // 2), rng.randint(N, size=N * N // 2))
    dist_matrix[i] = 0

    # set diagonal to zero
    dist_matrix.flat[::N + 1] = 0

    return dist_matrix


def test_floyd_warshall():
    dist_matrix = generate_graph(20)

    for directed in (True, False):
        graph_FW = graph_shortest_path(dist_matrix, directed, 'FW')
        graph_py = floyd_warshall_slow(dist_matrix.copy(), directed)

        assert_array_almost_equal(graph_FW, graph_py)


def test_dijkstra():
    dist_matrix = generate_graph(20)

    for directed in (True, False):
        graph_D = graph_shortest_path(dist_matrix, directed, 'D')
        graph_py = floyd_warshall_slow(dist_matrix.copy(), directed)

        assert_array_almost_equal(graph_D, graph_py)


def test_shortest_path():
    dist_matrix = generate_graph(20)
    # We compare path lengths, not costs (-> set distances to 0 or 1)
    dist_matrix[dist_matrix != 0] = 1

    for directed in (True, False):
        if not directed:
            dist_matrix = np.minimum(dist_matrix, dist_matrix.T)

        graph_py = floyd_warshall_slow(dist_matrix.copy(), directed)
        for i in range(dist_matrix.shape[0]):
            # Non-reachable nodes have distance 0 in graph_py
            dist_dict = defaultdict(int)
            dist_dict.update(single_source_shortest_path_length(dist_matrix,
                                                                i))

            for j in range(graph_py[i].shape[0]):
                assert_array_almost_equal(dist_dict[j], graph_py[i, j])


def test_dijkstra_bug_fix():
    X = np.array([[0., 0., 4.],
                  [1., 0., 2.],
                  [0., 5., 0.]])
    dist_FW = graph_shortest_path(X, directed=False, method='FW')
    dist_D = graph_shortest_path(X, directed=False, method='D')
    assert_array_almost_equal(dist_D, dist_FW)
37
venv/Lib/site-packages/sklearn/utils/tests/test_show_versions.py
Normal file
@@ -0,0 +1,37 @@
from sklearn.utils._show_versions import _get_sys_info
from sklearn.utils._show_versions import _get_deps_info
from sklearn.utils._show_versions import show_versions
from sklearn.utils._testing import ignore_warnings


def test_get_sys_info():
    sys_info = _get_sys_info()

    assert 'python' in sys_info
    assert 'executable' in sys_info
    assert 'machine' in sys_info


def test_get_deps_info():
    with ignore_warnings():
        deps_info = _get_deps_info()

    assert 'pip' in deps_info
    assert 'setuptools' in deps_info
    assert 'sklearn' in deps_info
    assert 'numpy' in deps_info
    assert 'scipy' in deps_info
    assert 'Cython' in deps_info
    assert 'pandas' in deps_info
    assert 'matplotlib' in deps_info
    assert 'joblib' in deps_info


def test_show_versions(capsys):
    with ignore_warnings():
        show_versions()
        out, err = capsys.readouterr()

    assert 'python' in out
    assert 'numpy' in out
617
venv/Lib/site-packages/sklearn/utils/tests/test_sparsefuncs.py
Normal file
@@ -0,0 +1,617 @@
import pytest
import numpy as np
import scipy.sparse as sp

from scipy import linalg
from numpy.testing import assert_array_almost_equal, assert_array_equal
from numpy.random import RandomState

from sklearn.datasets import make_classification
from sklearn.utils.sparsefuncs import (mean_variance_axis,
                                       incr_mean_variance_axis,
                                       inplace_column_scale,
                                       inplace_row_scale,
                                       inplace_swap_row, inplace_swap_column,
                                       min_max_axis,
                                       count_nonzero, csc_median_axis_0)
from sklearn.utils.sparsefuncs_fast import (assign_rows_csr,
                                            inplace_csr_row_normalize_l1,
                                            inplace_csr_row_normalize_l2,
                                            csr_row_norms)
from sklearn.utils._testing import assert_allclose


def test_mean_variance_axis0():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_lil = sp.lil_matrix(X)
    X_lil[1, 0] = 0
    X[1, 0] = 0

    with pytest.raises(TypeError):
        mean_variance_axis(X_lil, axis=0)

    X_csr = sp.csr_matrix(X_lil)
    X_csc = sp.csc_matrix(X_lil)

    expected_dtypes = [(np.float32, np.float32),
                       (np.float64, np.float64),
                       (np.int32, np.float64),
                       (np.int64, np.float64)]

    for input_dtype, output_dtype in expected_dtypes:
        X_test = X.astype(input_dtype)
        for X_sparse in (X_csr, X_csc):
            X_sparse = X_sparse.astype(input_dtype)
            X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
            assert X_means.dtype == output_dtype
            assert X_vars.dtype == output_dtype
            assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
            assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
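
# The identity the sparse helper can exploit to avoid densifying:
# Var[x] = E[x^2] - E[x]^2, with both moments computable from the stored
# non-zeros alone (a sketch; the real helper also handles dtypes and NaNs):
import numpy as np
import scipy.sparse as sp

M = sp.csr_matrix(np.array([[0., 2.], [3., 0.], [0., 4.]]))
n = M.shape[0]
mean = np.asarray(M.sum(axis=0)).ravel() / n
mean_sq = np.asarray(M.multiply(M).sum(axis=0)).ravel() / n
var = mean_sq - mean ** 2
assert np.allclose(mean, M.toarray().mean(axis=0))
assert np.allclose(var, M.toarray().var(axis=0))
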


def test_mean_variance_axis1():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_lil = sp.lil_matrix(X)
    X_lil[1, 0] = 0
    X[1, 0] = 0

    with pytest.raises(TypeError):
        mean_variance_axis(X_lil, axis=1)

    X_csr = sp.csr_matrix(X_lil)
    X_csc = sp.csc_matrix(X_lil)

    expected_dtypes = [(np.float32, np.float32),
                       (np.float64, np.float64),
                       (np.int32, np.float64),
                       (np.int64, np.float64)]

    for input_dtype, output_dtype in expected_dtypes:
        X_test = X.astype(input_dtype)
        for X_sparse in (X_csr, X_csc):
            X_sparse = X_sparse.astype(input_dtype)
            # measure along rows (axis=1), as the test name says
            X_means, X_vars = mean_variance_axis(X_sparse, axis=1)
            assert X_means.dtype == output_dtype
            assert X_vars.dtype == output_dtype
            assert_array_almost_equal(X_means, np.mean(X_test, axis=1))
            assert_array_almost_equal(X_vars, np.var(X_test, axis=1))


def test_incr_mean_variance_axis():
    for axis in [0, 1]:
        rng = np.random.RandomState(0)
        n_features = 50
        n_samples = 10
        data_chunks = [rng.randint(0, 2, size=n_features)
                       for i in range(n_samples)]

        # default params for incr_mean_variance
        last_mean = np.zeros(n_features)
        last_var = np.zeros_like(last_mean)
        last_n = np.zeros_like(last_mean, dtype=np.int64)

        # Test errors
        X = np.array(data_chunks[0])
        X = np.atleast_2d(X)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)

        with pytest.raises(TypeError):
            # arguments deliberately scrambled to trigger the type check
            incr_mean_variance_axis(X=axis, axis=last_mean,
                                    last_mean=last_var, last_var=last_n)
        with pytest.raises(TypeError):
            incr_mean_variance_axis(X_lil, axis=axis, last_mean=last_mean,
                                    last_var=last_var, last_n=last_n)

        # Test _incr_mean_and_var with a 1 row input
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis=axis, last_mean=last_mean,
                                    last_var=last_var, last_n=last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        # X.shape[axis] picks the number of samples
        assert_array_equal(X.shape[axis], n_incr)

        X_csc = sp.csc_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_array_equal(X.shape[axis], n_incr)

        # Test _incremental_mean_and_var with whole data
        X = np.vstack(data_chunks)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        X_csc = sp.csc_matrix(X_lil)

        expected_dtypes = [(np.float32, np.float32),
                           (np.float64, np.float64),
                           (np.int32, np.float64),
                           (np.int64, np.float64)]

        for input_dtype, output_dtype in expected_dtypes:
            for X_sparse in (X_csr, X_csc):
                X_sparse = X_sparse.astype(input_dtype)
                last_mean = last_mean.astype(output_dtype)
                last_var = last_var.astype(output_dtype)
                X_means, X_vars = mean_variance_axis(X_sparse, axis)
                X_means_incr, X_vars_incr, n_incr = \
                    incr_mean_variance_axis(X_sparse, axis=axis,
                                            last_mean=last_mean,
                                            last_var=last_var,
                                            last_n=last_n)
                assert X_means_incr.dtype == output_dtype
                assert X_vars_incr.dtype == output_dtype
                assert_array_almost_equal(X_means, X_means_incr)
                assert_array_almost_equal(X_vars, X_vars_incr)
                assert_array_equal(X.shape[axis], n_incr)
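
# The pooled update these incremental tests rely on, written out for scalars
# (a sketch of Chan et al.'s combine step; the library version is vectorised
# and NaN-aware):
import numpy as np

def combine_mean_var(mean_a, var_a, n_a, mean_b, var_b, n_b):
    n = n_a + n_b
    delta = mean_b - mean_a
    mean = mean_a + delta * n_b / n
    # M2 = sum of squared deviations; population variances convert via var * n
    m2 = var_a * n_a + var_b * n_b + delta ** 2 * n_a * n_b / n
    return mean, m2 / n, n

x = np.arange(10.0)
a, b = x[:4], x[4:]
mean, var, n = combine_mean_var(a.mean(), a.var(), len(a),
                                b.mean(), b.var(), len(b))
assert np.isclose(mean, x.mean()) and np.isclose(var, x.var()) and n == 10
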


@pytest.mark.parametrize(
    "X1, X2",
    [
        (sp.random(5, 2, density=0.8, format='csr', random_state=0),
         sp.random(13, 2, density=0.8, format='csr', random_state=0)),
        (sp.random(5, 2, density=0.8, format='csr', random_state=0),
         sp.hstack([sp.csr_matrix(np.full((13, 1), fill_value=np.nan)),
                    sp.random(13, 1, density=0.8, random_state=42)],
                   format="csr"))
    ]
)
def test_incr_mean_variance_axis_equivalence_mean_variance(X1, X2):
    # non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/16448
    # check that computing the incremental mean and variance is equivalent to
    # computing the mean and variance on the stacked dataset.
    axis = 0
    last_mean, last_var = np.zeros(X1.shape[1]), np.zeros(X1.shape[1])
    last_n = np.zeros(X1.shape[1], dtype=np.int64)
    updated_mean, updated_var, updated_n = incr_mean_variance_axis(
        X1, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
    )
    updated_mean, updated_var, updated_n = incr_mean_variance_axis(
        X2, axis=axis, last_mean=updated_mean, last_var=updated_var,
        last_n=updated_n
    )
    X = sp.vstack([X1, X2])
    assert_allclose(updated_mean, np.nanmean(X.A, axis=axis))
    assert_allclose(updated_var, np.nanvar(X.A, axis=axis))
    assert_allclose(updated_n, np.count_nonzero(~np.isnan(X.A), axis=0))


def test_incr_mean_variance_no_new_n():
    # check the behaviour when we update the variance with an empty matrix
    axis = 0
    X1 = sp.random(5, 1, density=0.8, random_state=0).tocsr()
    X2 = sp.random(0, 1, density=0.8, random_state=0).tocsr()
    last_mean, last_var = np.zeros(X1.shape[1]), np.zeros(X1.shape[1])
    last_n = np.zeros(X1.shape[1], dtype=np.int64)
    last_mean, last_var, last_n = incr_mean_variance_axis(
        X1, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
    )
    # update the statistics with an empty chunk, which should be ignored
    updated_mean, updated_var, updated_n = incr_mean_variance_axis(
        X2, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
    )
    assert_allclose(updated_mean, last_mean)
    assert_allclose(updated_var, last_var)
    assert_allclose(updated_n, last_n)
@pytest.mark.parametrize("axis", [0, 1])
|
||||
@pytest.mark.parametrize("sparse_constructor", [sp.csc_matrix, sp.csr_matrix])
|
||||
def test_incr_mean_variance_axis_ignore_nan(axis, sparse_constructor):
|
||||
old_means = np.array([535., 535., 535., 535.])
|
||||
old_variances = np.array([4225., 4225., 4225., 4225.])
|
||||
old_sample_count = np.array([2, 2, 2, 2], dtype=np.int64)
|
||||
|
||||
X = sparse_constructor(
|
||||
np.array([[170, 170, 170, 170],
|
||||
[430, 430, 430, 430],
|
||||
[300, 300, 300, 300]]))
|
||||
|
||||
X_nan = sparse_constructor(
|
||||
np.array([[170, np.nan, 170, 170],
|
||||
[np.nan, 170, 430, 430],
|
||||
[430, 430, np.nan, 300],
|
||||
[300, 300, 300, np.nan]]))
|
||||
|
||||
# we avoid creating specific data for axis 0 and 1: translating the data is
|
||||
# enough.
|
||||
if axis:
|
||||
X = X.T
|
||||
X_nan = X_nan.T
|
||||
|
||||
# take a copy of the old statistics since they are modified in place.
|
||||
X_means, X_vars, X_sample_count = incr_mean_variance_axis(
|
||||
X, axis=axis, last_mean=old_means.copy(),
|
||||
last_var=old_variances.copy(), last_n=old_sample_count.copy())
|
||||
X_nan_means, X_nan_vars, X_nan_sample_count = incr_mean_variance_axis(
|
||||
X_nan, axis=axis, last_mean=old_means.copy(),
|
||||
last_var=old_variances.copy(), last_n=old_sample_count.copy())
|
||||
|
||||
assert_allclose(X_nan_means, X_means)
|
||||
assert_allclose(X_nan_vars, X_vars)
|
||||
assert_allclose(X_nan_sample_count, X_sample_count)
|
||||
|
||||
|
||||
def test_mean_variance_illegal_axis():
|
||||
X, _ = make_classification(5, 4, random_state=0)
|
||||
# Sparsify the array a little bit
|
||||
X[0, 0] = 0
|
||||
X[2, 1] = 0
|
||||
X[4, 3] = 0
|
||||
X_csr = sp.csr_matrix(X)
|
||||
with pytest.raises(ValueError):
|
||||
mean_variance_axis(X_csr, axis=-3)
|
||||
with pytest.raises(ValueError):
|
||||
mean_variance_axis(X_csr, axis=2)
|
||||
with pytest.raises(ValueError):
|
||||
mean_variance_axis(X_csr, axis=-1)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
incr_mean_variance_axis(X_csr, axis=-3, last_mean=None, last_var=None,
|
||||
last_n=None)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
incr_mean_variance_axis(X_csr, axis=2, last_mean=None, last_var=None,
|
||||
last_n=None)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
incr_mean_variance_axis(X_csr, axis=-1, last_mean=None, last_var=None,
|
||||
last_n=None)
|
||||
|
||||
|
||||
def test_densify_rows():
|
||||
for dtype in (np.float32, np.float64):
|
||||
X = sp.csr_matrix([[0, 3, 0],
|
||||
[2, 4, 0],
|
||||
[0, 0, 0],
|
||||
[9, 8, 7],
|
||||
[4, 0, 5]], dtype=dtype)
|
||||
X_rows = np.array([0, 2, 3], dtype=np.intp)
|
||||
out = np.ones((6, X.shape[1]), dtype=dtype)
|
||||
out_rows = np.array([1, 3, 4], dtype=np.intp)
|
||||
|
||||
expect = np.ones_like(out)
|
||||
expect[out_rows] = X[X_rows, :].toarray()
|
||||
|
||||
assign_rows_csr(X, X_rows, out_rows, out)
|
||||
assert_array_equal(out, expect)
|
||||
|
||||
|
||||
def test_inplace_column_scale():
|
||||
rng = np.random.RandomState(0)
|
||||
X = sp.rand(100, 200, 0.05)
|
||||
Xr = X.tocsr()
|
||||
Xc = X.tocsc()
|
||||
XA = X.toarray()
|
||||
scale = rng.rand(200)
|
||||
XA *= scale
|
||||
|
||||
inplace_column_scale(Xc, scale)
|
||||
inplace_column_scale(Xr, scale)
|
||||
assert_array_almost_equal(Xr.toarray(), Xc.toarray())
|
||||
assert_array_almost_equal(XA, Xc.toarray())
|
||||
assert_array_almost_equal(XA, Xr.toarray())
|
||||
with pytest.raises(TypeError):
|
||||
inplace_column_scale(X.tolil(), scale)
|
||||
|
||||
X = X.astype(np.float32)
|
||||
scale = scale.astype(np.float32)
|
||||
Xr = X.tocsr()
|
||||
Xc = X.tocsc()
|
||||
XA = X.toarray()
|
||||
XA *= scale
|
||||
inplace_column_scale(Xc, scale)
|
||||
inplace_column_scale(Xr, scale)
|
||||
assert_array_almost_equal(Xr.toarray(), Xc.toarray())
|
||||
assert_array_almost_equal(XA, Xc.toarray())
|
||||
assert_array_almost_equal(XA, Xr.toarray())
|
||||
with pytest.raises(TypeError):
|
||||
inplace_column_scale(X.tolil(), scale)
|
||||
|
||||
|
||||
def test_inplace_row_scale():
|
||||
rng = np.random.RandomState(0)
|
||||
X = sp.rand(100, 200, 0.05)
|
||||
Xr = X.tocsr()
|
||||
Xc = X.tocsc()
|
||||
XA = X.toarray()
|
||||
scale = rng.rand(100)
|
||||
XA *= scale.reshape(-1, 1)
|
||||
|
||||
inplace_row_scale(Xc, scale)
|
||||
inplace_row_scale(Xr, scale)
|
||||
assert_array_almost_equal(Xr.toarray(), Xc.toarray())
|
||||
assert_array_almost_equal(XA, Xc.toarray())
|
||||
assert_array_almost_equal(XA, Xr.toarray())
|
||||
with pytest.raises(TypeError):
|
||||
inplace_column_scale(X.tolil(), scale)
|
||||
|
||||
X = X.astype(np.float32)
|
||||
scale = scale.astype(np.float32)
|
||||
Xr = X.tocsr()
|
||||
Xc = X.tocsc()
|
||||
XA = X.toarray()
|
||||
XA *= scale.reshape(-1, 1)
|
||||
inplace_row_scale(Xc, scale)
|
||||
inplace_row_scale(Xr, scale)
|
||||
assert_array_almost_equal(Xr.toarray(), Xc.toarray())
|
||||
assert_array_almost_equal(XA, Xc.toarray())
|
||||
assert_array_almost_equal(XA, Xr.toarray())
|
||||
with pytest.raises(TypeError):
|
||||
inplace_column_scale(X.tolil(), scale)
|
||||
|
||||
|
||||
def test_inplace_swap_row():
    X = np.array([[0, 3, 0],
                  [2, 4, 0],
                  [0, 0, 0],
                  [9, 8, 7],
                  [4, 0, 5]], dtype=np.float64)
    X_csr = sp.csr_matrix(X)
    X_csc = sp.csc_matrix(X)

    swap = linalg.get_blas_funcs(('swap',), (X,))
    swap = swap[0]
    X[0], X[-1] = swap(X[0], X[-1])
    inplace_swap_row(X_csr, 0, -1)
    inplace_swap_row(X_csc, 0, -1)
    assert_array_equal(X_csr.toarray(), X_csc.toarray())
    assert_array_equal(X, X_csc.toarray())
    assert_array_equal(X, X_csr.toarray())

    X[2], X[3] = swap(X[2], X[3])
    inplace_swap_row(X_csr, 2, 3)
    inplace_swap_row(X_csc, 2, 3)
    assert_array_equal(X_csr.toarray(), X_csc.toarray())
    assert_array_equal(X, X_csc.toarray())
    assert_array_equal(X, X_csr.toarray())
    with pytest.raises(TypeError):
        inplace_swap_row(X_csr.tolil())

    X = np.array([[0, 3, 0],
                  [2, 4, 0],
                  [0, 0, 0],
                  [9, 8, 7],
                  [4, 0, 5]], dtype=np.float32)
    X_csr = sp.csr_matrix(X)
    X_csc = sp.csc_matrix(X)
    swap = linalg.get_blas_funcs(('swap',), (X,))
    swap = swap[0]
    X[0], X[-1] = swap(X[0], X[-1])
    inplace_swap_row(X_csr, 0, -1)
    inplace_swap_row(X_csc, 0, -1)
    assert_array_equal(X_csr.toarray(), X_csc.toarray())
    assert_array_equal(X, X_csc.toarray())
    assert_array_equal(X, X_csr.toarray())
    X[2], X[3] = swap(X[2], X[3])
    inplace_swap_row(X_csr, 2, 3)
    inplace_swap_row(X_csc, 2, 3)
    assert_array_equal(X_csr.toarray(), X_csc.toarray())
    assert_array_equal(X, X_csc.toarray())
    assert_array_equal(X, X_csr.toarray())
    with pytest.raises(TypeError):
        inplace_swap_row(X_csr.tolil())


def test_inplace_swap_column():
    X = np.array([[0, 3, 0],
                  [2, 4, 0],
                  [0, 0, 0],
                  [9, 8, 7],
                  [4, 0, 5]], dtype=np.float64)
    X_csr = sp.csr_matrix(X)
    X_csc = sp.csc_matrix(X)

    swap = linalg.get_blas_funcs(('swap',), (X,))
    swap = swap[0]
    X[:, 0], X[:, -1] = swap(X[:, 0], X[:, -1])
    inplace_swap_column(X_csr, 0, -1)
    inplace_swap_column(X_csc, 0, -1)
    assert_array_equal(X_csr.toarray(), X_csc.toarray())
    assert_array_equal(X, X_csc.toarray())
    assert_array_equal(X, X_csr.toarray())

    X[:, 0], X[:, 1] = swap(X[:, 0], X[:, 1])
    inplace_swap_column(X_csr, 0, 1)
    inplace_swap_column(X_csc, 0, 1)
    assert_array_equal(X_csr.toarray(), X_csc.toarray())
    assert_array_equal(X, X_csc.toarray())
    assert_array_equal(X, X_csr.toarray())
    with pytest.raises(TypeError):
        inplace_swap_column(X_csr.tolil())

    X = np.array([[0, 3, 0],
                  [2, 4, 0],
                  [0, 0, 0],
                  [9, 8, 7],
                  [4, 0, 5]], dtype=np.float32)
    X_csr = sp.csr_matrix(X)
    X_csc = sp.csc_matrix(X)
    swap = linalg.get_blas_funcs(('swap',), (X,))
    swap = swap[0]
    X[:, 0], X[:, -1] = swap(X[:, 0], X[:, -1])
    inplace_swap_column(X_csr, 0, -1)
    inplace_swap_column(X_csc, 0, -1)
    assert_array_equal(X_csr.toarray(), X_csc.toarray())
    assert_array_equal(X, X_csc.toarray())
    assert_array_equal(X, X_csr.toarray())
    X[:, 0], X[:, 1] = swap(X[:, 0], X[:, 1])
    inplace_swap_column(X_csr, 0, 1)
    inplace_swap_column(X_csc, 0, 1)
    assert_array_equal(X_csr.toarray(), X_csc.toarray())
    assert_array_equal(X, X_csc.toarray())
    assert_array_equal(X, X_csr.toarray())
    with pytest.raises(TypeError):
        inplace_swap_column(X_csr.tolil())


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
|
||||
@pytest.mark.parametrize("axis", [0, 1, None])
|
||||
@pytest.mark.parametrize("sparse_format", [sp.csr_matrix, sp.csc_matrix])
|
||||
@pytest.mark.parametrize(
|
||||
"missing_values, min_func, max_func, ignore_nan",
|
||||
[(0, np.min, np.max, False),
|
||||
(np.nan, np.nanmin, np.nanmax, True)]
|
||||
)
|
||||
@pytest.mark.parametrize("large_indices", [True, False])
|
||||
def test_min_max(dtype, axis, sparse_format, missing_values, min_func,
|
||||
max_func, ignore_nan, large_indices):
|
||||
X = np.array([[0, 3, 0],
|
||||
[2, -1, missing_values],
|
||||
[0, 0, 0],
|
||||
[9, missing_values, 7],
|
||||
[4, 0, 5]], dtype=dtype)
|
||||
X_sparse = sparse_format(X)
|
||||
if large_indices:
|
||||
X_sparse.indices = X_sparse.indices.astype('int64')
|
||||
X_sparse.indptr = X_sparse.indptr.astype('int64')
|
||||
|
||||
mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis,
|
||||
ignore_nan=ignore_nan)
|
||||
assert_array_equal(mins_sparse, min_func(X, axis=axis))
|
||||
assert_array_equal(maxs_sparse, max_func(X, axis=axis))
|
||||
|
||||
|
||||
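# Editor's illustration, not part of the vendored file: a minimal sketch of
# min_max_axis on a toy matrix (values assumed for demonstration), computing
# column-wise extrema without densifying the sparse input.
def _demo_min_max_axis():
    X = sp.csr_matrix(np.array([[0., 3.], [2., -1.]]))
    mins, maxs = min_max_axis(X, axis=0)
    assert_array_equal(mins, [0., -1.])
    assert_array_equal(maxs, [2., 3.])

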
def test_min_max_axis_errors():
    X = np.array([[0, 3, 0],
                  [2, -1, 0],
                  [0, 0, 0],
                  [9, 8, 7],
                  [4, 0, 5]], dtype=np.float64)
    X_csr = sp.csr_matrix(X)
    X_csc = sp.csc_matrix(X)
    with pytest.raises(TypeError):
        min_max_axis(X_csr.tolil(), axis=0)
    with pytest.raises(ValueError):
        min_max_axis(X_csr, axis=2)
    with pytest.raises(ValueError):
        min_max_axis(X_csc, axis=-3)


def test_count_nonzero():
    X = np.array([[0, 3, 0],
                  [2, -1, 0],
                  [0, 0, 0],
                  [9, 8, 7],
                  [4, 0, 5]], dtype=np.float64)
    X_csr = sp.csr_matrix(X)
    X_csc = sp.csc_matrix(X)
    X_nonzero = X != 0
    sample_weight = [.5, .2, .3, .1, .1]
    X_nonzero_weighted = X_nonzero * np.array(sample_weight)[:, None]

    for axis in [0, 1, -1, -2, None]:
        assert_array_almost_equal(count_nonzero(X_csr, axis=axis),
                                  X_nonzero.sum(axis=axis))
        assert_array_almost_equal(count_nonzero(X_csr, axis=axis,
                                                sample_weight=sample_weight),
                                  X_nonzero_weighted.sum(axis=axis))

    with pytest.raises(TypeError):
        count_nonzero(X_csc)
    with pytest.raises(ValueError):
        count_nonzero(X_csr, axis=2)

    assert (count_nonzero(X_csr, axis=0).dtype ==
            count_nonzero(X_csr, axis=1).dtype)
    assert (count_nonzero(X_csr, axis=0, sample_weight=sample_weight).dtype ==
            count_nonzero(X_csr, axis=1, sample_weight=sample_weight).dtype)

    # Check dtypes with large sparse matrices too
    # XXX: test fails on 32bit (Windows/Linux)
    try:
        X_csr.indices = X_csr.indices.astype(np.int64)
        X_csr.indptr = X_csr.indptr.astype(np.int64)
        assert (count_nonzero(X_csr, axis=0).dtype ==
                count_nonzero(X_csr, axis=1).dtype)
        assert (count_nonzero(X_csr, axis=0,
                              sample_weight=sample_weight).dtype ==
                count_nonzero(X_csr, axis=1,
                              sample_weight=sample_weight).dtype)
    except TypeError as e:
        assert ("according to the rule 'safe'" in e.args[0]
                and np.intp().nbytes < 8), e


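# Editor's illustration, not part of the vendored file: a minimal sketch of
# count_nonzero on a CSR matrix (toy values are assumptions), counting the
# stored non-zeros overall or per column.
def _demo_count_nonzero():
    X = sp.csr_matrix(np.array([[0., 3.], [2., 0.]]))
    assert count_nonzero(X) == 2                          # total non-zeros
    assert_array_equal(count_nonzero(X, axis=0), [1, 1])  # per column

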
def test_csc_row_median():
    # Test csc_row_median actually calculates the median.

    # Test that it gives the same output when X is dense.
    rng = np.random.RandomState(0)
    X = rng.rand(100, 50)
    dense_median = np.median(X, axis=0)
    csc = sp.csc_matrix(X)
    sparse_median = csc_median_axis_0(csc)
    assert_array_equal(sparse_median, dense_median)

    # Test that it gives the same output when X is sparse
    X = rng.rand(51, 100)
    X[X < 0.7] = 0.0
    ind = rng.randint(0, 50, 10)
    X[ind] = -X[ind]
    csc = sp.csc_matrix(X)
    dense_median = np.median(X, axis=0)
    sparse_median = csc_median_axis_0(csc)
    assert_array_equal(sparse_median, dense_median)

    # Test for toy data.
    X = [[0, -2], [-1, -1], [1, 0], [2, 1]]
    csc = sp.csc_matrix(X)
    assert_array_equal(csc_median_axis_0(csc), np.array([0.5, -0.5]))
    X = [[0, -2], [-1, -5], [1, -3]]
    csc = sp.csc_matrix(X)
    assert_array_equal(csc_median_axis_0(csc), np.array([0., -3]))

    # Test that it raises an Error for non-csc matrices.
    with pytest.raises(TypeError):
        csc_median_axis_0(sp.csr_matrix(X))


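# Editor's illustration, not part of the vendored file: a minimal sketch of
# csc_median_axis_0, mirroring the first toy case of the test above. The
# median of each column is computed directly on the CSC matrix.
def _demo_csc_median_axis_0():
    X = sp.csc_matrix(np.array([[0., -2.], [-1., -1.], [1., 0.], [2., 1.]]))
    assert_array_equal(csc_median_axis_0(X), np.array([0.5, -0.5]))

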
def test_inplace_normalize():
    ones = np.ones((10, 1))
    rs = RandomState(10)

    for inplace_csr_row_normalize in (inplace_csr_row_normalize_l1,
                                      inplace_csr_row_normalize_l2):
        for dtype in (np.float64, np.float32):
            X = rs.randn(10, 5).astype(dtype)
            X_csr = sp.csr_matrix(X)
            for index_dtype in [np.int32, np.int64]:
                # csr_matrix will use int32 indices by default,
                # up-casting those to int64 when necessary
                if index_dtype is np.int64:
                    X_csr.indptr = X_csr.indptr.astype(index_dtype)
                    X_csr.indices = X_csr.indices.astype(index_dtype)
                assert X_csr.indices.dtype == index_dtype
                assert X_csr.indptr.dtype == index_dtype
                inplace_csr_row_normalize(X_csr)
                assert X_csr.dtype == dtype
                if inplace_csr_row_normalize is inplace_csr_row_normalize_l2:
                    X_csr.data **= 2
                assert_array_almost_equal(np.abs(X_csr).sum(axis=1), ones)


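# Editor's illustration, not part of the vendored file: a minimal sketch of
# inplace_csr_row_normalize_l2 (toy values are assumptions), scaling each
# CSR row to unit L2 norm in place.
def _demo_inplace_csr_row_normalize_l2():
    X = sp.csr_matrix(np.array([[3., 4.], [0., 2.]]))
    inplace_csr_row_normalize_l2(X)
    assert_array_almost_equal(X.toarray(), [[.6, .8], [0., 1.]])

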
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
|
||||
def test_csr_row_norms(dtype):
|
||||
# checks that csr_row_norms returns the same output as
|
||||
# scipy.sparse.linalg.norm, and that the dype is the same as X.dtype.
|
||||
X = sp.random(100, 10, format='csr', dtype=dtype, random_state=42)
|
||||
|
||||
scipy_norms = sp.linalg.norm(X, axis=1)**2
|
||||
norms = csr_row_norms(X)
|
||||
|
||||
assert norms.dtype == dtype
|
||||
rtol = 1e-6 if dtype == np.float32 else 1e-7
|
||||
assert_allclose(norms, scipy_norms, rtol=rtol)
|
696
venv/Lib/site-packages/sklearn/utils/tests/test_testing.py
Normal file
@@ -0,0 +1,696 @@
import warnings
import unittest
import sys
import os
import atexit

import numpy as np

from scipy import sparse

import pytest

from sklearn.utils.deprecation import deprecated
from sklearn.utils.metaestimators import if_delegate_has_method
from sklearn.utils._testing import (
    assert_raises,
    assert_less,
    assert_greater,
    assert_less_equal,
    assert_greater_equal,
    assert_warns,
    assert_no_warnings,
    assert_equal,
    assert_not_equal,
    assert_in,
    assert_not_in,
    set_random_state,
    assert_raise_message,
    ignore_warnings,
    check_docstring_parameters,
    assert_allclose_dense_sparse,
    assert_raises_regex,
    TempMemmap,
    create_memmap_backed_data,
    _delete_folder,
    _convert_container)

from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


@pytest.mark.filterwarnings("ignore",
                            category=FutureWarning)  # 0.24
def test_assert_less():
    assert 0 < 1
    with pytest.raises(AssertionError):
        assert_less(1, 0)


@pytest.mark.filterwarnings("ignore",
                            category=FutureWarning)  # 0.24
def test_assert_greater():
    assert 1 > 0
    with pytest.raises(AssertionError):
        assert_greater(0, 1)


@pytest.mark.filterwarnings("ignore",
                            category=FutureWarning)  # 0.24
def test_assert_less_equal():
    assert 0 <= 1
    assert 1 <= 1
    with pytest.raises(AssertionError):
        assert_less_equal(1, 0)


@pytest.mark.filterwarnings("ignore",
                            category=FutureWarning)  # 0.24
def test_assert_greater_equal():
    assert 1 >= 0
    assert 1 >= 1
    with pytest.raises(AssertionError):
        assert_greater_equal(0, 1)


def test_set_random_state():
    lda = LinearDiscriminantAnalysis()
    tree = DecisionTreeClassifier()
    # Linear Discriminant Analysis doesn't have random state: smoke test
    set_random_state(lda, 3)
    set_random_state(tree, 3)
    assert tree.random_state == 3


def test_assert_allclose_dense_sparse():
    x = np.arange(9).reshape(3, 3)
    msg = "Not equal to tolerance "
    y = sparse.csc_matrix(x)
    for X in [x, y]:
        # basic compare
        with pytest.raises(AssertionError, match=msg):
            assert_allclose_dense_sparse(X, X*2)
        assert_allclose_dense_sparse(X, X)

    with pytest.raises(ValueError, match="Can only compare two sparse"):
        assert_allclose_dense_sparse(x, y)

    A = sparse.diags(np.ones(5), offsets=0).tocsr()
    B = sparse.csr_matrix(np.ones((1, 5)))
    with pytest.raises(AssertionError, match="Arrays are not equal"):
        assert_allclose_dense_sparse(B, A)


def test_assert_raises_msg():
    with assert_raises_regex(AssertionError, 'Hello world'):
        with assert_raises(ValueError, msg='Hello world'):
            pass


def test_assert_raise_message():
    def _raise_ValueError(message):
        raise ValueError(message)

    def _no_raise():
        pass

    assert_raise_message(ValueError, "test",
                         _raise_ValueError, "test")

    assert_raises(AssertionError,
                  assert_raise_message, ValueError, "something else",
                  _raise_ValueError, "test")

    assert_raises(ValueError,
                  assert_raise_message, TypeError, "something else",
                  _raise_ValueError, "test")

    assert_raises(AssertionError,
                  assert_raise_message, ValueError, "test",
                  _no_raise)

    # multiple exceptions in a tuple
    assert_raises(AssertionError,
                  assert_raise_message, (ValueError, AttributeError),
                  "test", _no_raise)


def test_ignore_warning():
    # This checks that the ignore_warnings decorator and context manager are
    # working as expected
    def _warning_function():
        warnings.warn("deprecation warning", DeprecationWarning)

    def _multiple_warning_function():
        warnings.warn("deprecation warning", DeprecationWarning)
        warnings.warn("deprecation warning")

    # Check the function directly
    assert_no_warnings(ignore_warnings(_warning_function))
    assert_no_warnings(ignore_warnings(_warning_function,
                                       category=DeprecationWarning))
    assert_warns(DeprecationWarning, ignore_warnings(_warning_function,
                                                     category=UserWarning))
    assert_warns(UserWarning,
                 ignore_warnings(_multiple_warning_function,
                                 category=FutureWarning))
    assert_warns(DeprecationWarning,
                 ignore_warnings(_multiple_warning_function,
                                 category=UserWarning))
    assert_no_warnings(ignore_warnings(_warning_function,
                                       category=(DeprecationWarning,
                                                 UserWarning)))

    # Check the decorator
    @ignore_warnings
    def decorator_no_warning():
        _warning_function()
        _multiple_warning_function()

    @ignore_warnings(category=(DeprecationWarning, UserWarning))
    def decorator_no_warning_multiple():
        _multiple_warning_function()

    @ignore_warnings(category=DeprecationWarning)
    def decorator_no_deprecation_warning():
        _warning_function()

    @ignore_warnings(category=UserWarning)
    def decorator_no_user_warning():
        _warning_function()

    @ignore_warnings(category=DeprecationWarning)
    def decorator_no_deprecation_multiple_warning():
        _multiple_warning_function()

    @ignore_warnings(category=UserWarning)
    def decorator_no_user_multiple_warning():
        _multiple_warning_function()

    assert_no_warnings(decorator_no_warning)
    assert_no_warnings(decorator_no_warning_multiple)
    assert_no_warnings(decorator_no_deprecation_warning)
    assert_warns(DeprecationWarning, decorator_no_user_warning)
    assert_warns(UserWarning, decorator_no_deprecation_multiple_warning)
    assert_warns(DeprecationWarning, decorator_no_user_multiple_warning)

    # Check the context manager
    def context_manager_no_warning():
        with ignore_warnings():
            _warning_function()

    def context_manager_no_warning_multiple():
        with ignore_warnings(category=(DeprecationWarning, UserWarning)):
            _multiple_warning_function()

    def context_manager_no_deprecation_warning():
        with ignore_warnings(category=DeprecationWarning):
            _warning_function()

    def context_manager_no_user_warning():
        with ignore_warnings(category=UserWarning):
            _warning_function()

    def context_manager_no_deprecation_multiple_warning():
        with ignore_warnings(category=DeprecationWarning):
            _multiple_warning_function()

    def context_manager_no_user_multiple_warning():
        with ignore_warnings(category=UserWarning):
            _multiple_warning_function()

    assert_no_warnings(context_manager_no_warning)
    assert_no_warnings(context_manager_no_warning_multiple)
    assert_no_warnings(context_manager_no_deprecation_warning)
    assert_warns(DeprecationWarning, context_manager_no_user_warning)
    assert_warns(UserWarning, context_manager_no_deprecation_multiple_warning)
    assert_warns(DeprecationWarning, context_manager_no_user_multiple_warning)

    # Check that passing a warning class as the first positional argument
    # raises a ValueError
    warning_class = UserWarning
    match = "'obj' should be a callable.+you should use 'category=UserWarning'"

    with pytest.raises(ValueError, match=match):
        silence_warnings_func = ignore_warnings(warning_class)(
            _warning_function)
        silence_warnings_func()

    with pytest.raises(ValueError, match=match):
        @ignore_warnings(warning_class)
        def test():
            pass


class TestWarns(unittest.TestCase):
    def test_warn(self):
        def f():
            warnings.warn("yo")
            return 3

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            filters_orig = warnings.filters[:]
            assert assert_warns(UserWarning, f) == 3
            # test that assert_warns doesn't have side effects on warnings
            # filters
            assert warnings.filters == filters_orig
        with pytest.raises(AssertionError):
            assert_no_warnings(f)
        assert assert_no_warnings(lambda x: x, 1) == 1

    def test_warn_wrong_warning(self):
        def f():
            warnings.warn("yo", FutureWarning)

        failed = False
        filters = sys.modules['warnings'].filters[:]
        try:
            try:
                # Should raise an AssertionError

                # assert_warns has a special handling of "FutureWarning" that
                # pytest.warns does not have
                assert_warns(UserWarning, f)
                failed = True
            except AssertionError:
                pass
        finally:
            sys.modules['warnings'].filters = filters

        if failed:
            raise AssertionError("wrong warning caught by assert_warn")


# Tests for docstrings:

def f_ok(a, b):
    """Function f

    Parameters
    ----------
    a : int
        Parameter a
    b : float
        Parameter b

    Returns
    -------
    c : list
        Parameter c
    """
    c = a + b
    return c


def f_bad_sections(a, b):
    """Function f

    Parameters
    ----------
    a : int
        Parameter a
    b : float
        Parameter b

    Results
    -------
    c : list
        Parameter c
    """
    c = a + b
    return c


def f_bad_order(b, a):
    """Function f

    Parameters
    ----------
    a : int
        Parameter a
    b : float
        Parameter b

    Returns
    -------
    c : list
        Parameter c
    """
    c = a + b
    return c


def f_too_many_param_docstring(a, b):
    """Function f

    Parameters
    ----------
    a : int
        Parameter a
    b : int
        Parameter b
    c : int
        Parameter c

    Returns
    -------
    d : list
        Parameter c
    """
    d = a + b
    return d


def f_missing(a, b):
    """Function f

    Parameters
    ----------
    a : int
        Parameter a

    Returns
    -------
    c : list
        Parameter c
    """
    c = a + b
    return c


def f_check_param_definition(a, b, c, d, e):
    """Function f

    Parameters
    ----------
    a: int
        Parameter a
    b:
        Parameter b
    c :
        Parameter c
    d:int
        Parameter d
    e
        No typespec is allowed without colon
    """
    return a + b + c + d


class Klass:
    def f_missing(self, X, y):
        pass

    def f_bad_sections(self, X, y):
        """Function f

        Parameter
        ----------
        a : int
            Parameter a
        b : float
            Parameter b

        Results
        -------
        c : list
            Parameter c
        """
        pass


class MockEst:
    def __init__(self):
        """MockEstimator"""
    def fit(self, X, y):
        return X

    def predict(self, X):
        return X

    def predict_proba(self, X):
        return X

    def score(self, X):
        return 1.


class MockMetaEstimator:
    def __init__(self, delegate):
        """MetaEstimator to check if doctest on delegated methods work.

        Parameters
        ---------
        delegate : estimator
            Delegated estimator.
        """
        self.delegate = delegate

    @if_delegate_has_method(delegate=('delegate'))
    def predict(self, X):
        """This is available only if delegate has predict.

        Parameters
        ----------
        y : ndarray
            Parameter y
        """
        return self.delegate.predict(X)

    @if_delegate_has_method(delegate=('delegate'))
    @deprecated("Testing a deprecated delegated method")
    def score(self, X):
        """This is available only if delegate has score.

        Parameters
        ---------
        y : ndarray
            Parameter y
        """

    @if_delegate_has_method(delegate=('delegate'))
    def predict_proba(self, X):
        """This is available only if delegate has predict_proba.

        Parameters
        ---------
        X : ndarray
            Parameter X
        """
        return X

    @deprecated('Testing deprecated function with wrong params')
    def fit(self, X, y):
        """Incorrect docstring but should not be tested"""


def test_check_docstring_parameters():
    pytest.importorskip('numpydoc',
                        reason="numpydoc is required to test the docstrings")

    incorrect = check_docstring_parameters(f_ok)
    assert incorrect == []
    incorrect = check_docstring_parameters(f_ok, ignore=['b'])
    assert incorrect == []
    incorrect = check_docstring_parameters(f_missing, ignore=['b'])
    assert incorrect == []
    with pytest.raises(RuntimeError, match="Unknown section Results"):
        check_docstring_parameters(f_bad_sections)
    with pytest.raises(RuntimeError, match="Unknown section Parameter"):
        check_docstring_parameters(Klass.f_bad_sections)

    incorrect = check_docstring_parameters(f_check_param_definition)
    assert (
        incorrect == [
            "sklearn.utils.tests.test_testing.f_check_param_definition There "
            "was no space between the param name and colon ('a: int')",

            "sklearn.utils.tests.test_testing.f_check_param_definition There "
            "was no space between the param name and colon ('b:')",

            "sklearn.utils.tests.test_testing.f_check_param_definition "
            "Parameter 'c :' has an empty type spec. Remove the colon",

            "sklearn.utils.tests.test_testing.f_check_param_definition There "
            "was no space between the param name and colon ('d:int')",
        ])

    messages = [
        ["In function: sklearn.utils.tests.test_testing.f_bad_order",
         "There's a parameter name mismatch in function docstring w.r.t."
         " function signature, at index 0 diff: 'b' != 'a'",
         "Full diff:",
         "- ['b', 'a']",
         "+ ['a', 'b']"],

        ["In function: " +
         "sklearn.utils.tests.test_testing.f_too_many_param_docstring",
         "Parameters in function docstring have more items w.r.t. function"
         " signature, first extra item: c",
         "Full diff:",
         "- ['a', 'b']",
         "+ ['a', 'b', 'c']",
         "?          +++++"],

        ["In function: sklearn.utils.tests.test_testing.f_missing",
         "Parameters in function docstring have less items w.r.t. function"
         " signature, first missing item: b",
         "Full diff:",
         "- ['a', 'b']",
         "+ ['a']"],

        ["In function: sklearn.utils.tests.test_testing.Klass.f_missing",
         "Parameters in function docstring have less items w.r.t. function"
         " signature, first missing item: X",
         "Full diff:",
         "- ['X', 'y']",
         "+ []"],

        ["In function: " +
         "sklearn.utils.tests.test_testing.MockMetaEstimator.predict",
         "There's a parameter name mismatch in function docstring w.r.t."
         " function signature, at index 0 diff: 'X' != 'y'",
         "Full diff:",
         "- ['X']",
         "?   ^",
         "+ ['y']",
         "?   ^"],

        ["In function: " +
         "sklearn.utils.tests.test_testing.MockMetaEstimator."
         + "predict_proba",
         "Parameters in function docstring have less items w.r.t. function"
         " signature, first missing item: X",
         "Full diff:",
         "- ['X']",
         "+ []"],

        ["In function: " +
         "sklearn.utils.tests.test_testing.MockMetaEstimator.score",
         "Parameters in function docstring have less items w.r.t. function"
         " signature, first missing item: X",
         "Full diff:",
         "- ['X']",
         "+ []"],

        ["In function: " +
         "sklearn.utils.tests.test_testing.MockMetaEstimator.fit",
         "Parameters in function docstring have less items w.r.t. function"
         " signature, first missing item: X",
         "Full diff:",
         "- ['X', 'y']",
         "+ []"],

    ]

    mock_meta = MockMetaEstimator(delegate=MockEst())

    for msg, f in zip(messages,
                      [f_bad_order,
                       f_too_many_param_docstring,
                       f_missing,
                       Klass.f_missing,
                       mock_meta.predict,
                       mock_meta.predict_proba,
                       mock_meta.score,
                       mock_meta.fit]):
        incorrect = check_docstring_parameters(f)
        assert msg == incorrect, ('\n"%s"\n not in \n"%s"' % (msg, incorrect))


class RegistrationCounter:
    def __init__(self):
        self.nb_calls = 0

    def __call__(self, to_register_func):
        self.nb_calls += 1
        assert to_register_func.func is _delete_folder


def check_memmap(input_array, mmap_data, mmap_mode='r'):
    assert isinstance(mmap_data, np.memmap)
    writeable = mmap_mode != 'r'
    assert mmap_data.flags.writeable is writeable
    np.testing.assert_array_equal(input_array, mmap_data)


def test_tempmemmap(monkeypatch):
    registration_counter = RegistrationCounter()
    monkeypatch.setattr(atexit, 'register', registration_counter)

    input_array = np.ones(3)
    with TempMemmap(input_array) as data:
        check_memmap(input_array, data)
        temp_folder = os.path.dirname(data.filename)
    if os.name != 'nt':
        assert not os.path.exists(temp_folder)
    assert registration_counter.nb_calls == 1

    mmap_mode = 'r+'
    with TempMemmap(input_array, mmap_mode=mmap_mode) as data:
        check_memmap(input_array, data, mmap_mode=mmap_mode)
        temp_folder = os.path.dirname(data.filename)
    if os.name != 'nt':
        assert not os.path.exists(temp_folder)
    assert registration_counter.nb_calls == 2


def test_create_memmap_backed_data(monkeypatch):
    registration_counter = RegistrationCounter()
    monkeypatch.setattr(atexit, 'register', registration_counter)

    input_array = np.ones(3)
    data = create_memmap_backed_data(input_array)
    check_memmap(input_array, data)
    assert registration_counter.nb_calls == 1

    data, folder = create_memmap_backed_data(input_array,
                                             return_folder=True)
    check_memmap(input_array, data)
    assert folder == os.path.dirname(data.filename)
    assert registration_counter.nb_calls == 2

    mmap_mode = 'r+'
    data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode)
    check_memmap(input_array, data, mmap_mode)
    assert registration_counter.nb_calls == 3

    input_list = [input_array, input_array + 1, input_array + 2]
    mmap_data_list = create_memmap_backed_data(input_list)
    for input_array, data in zip(input_list, mmap_data_list):
        check_memmap(input_array, data)
    assert registration_counter.nb_calls == 4


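# Editor's illustration, not part of the vendored file: a minimal sketch of
# create_memmap_backed_data, which copies an array into a temporary folder
# and reopens it as a numpy memmap (read-only by default).
def _demo_create_memmap_backed_data():
    data = create_memmap_backed_data(np.arange(5.0))
    assert isinstance(data, np.memmap)
    assert not data.flags.writeable  # default mmap_mode is 'r'

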
# 0.24
@pytest.mark.parametrize('callable, args', [
    (assert_equal, (0, 0)),
    (assert_not_equal, (0, 1)),
    (assert_greater, (1, 0)),
    (assert_greater_equal, (1, 0)),
    (assert_less, (0, 1)),
    (assert_less_equal, (0, 1)),
    (assert_in, (0, [0])),
    (assert_not_in, (0, [1]))])
def test_deprecated_helpers(callable, args):
    msg = ('is deprecated in version 0.22 and will be removed in version '
           '0.24. Please use "assert" instead')
    with pytest.warns(FutureWarning, match=msg):
        callable(*args)


@pytest.mark.parametrize(
    "constructor_name, container_type",
    [('list', list),
     ('tuple', tuple),
     ('array', np.ndarray),
     ('sparse', sparse.csr_matrix),
     ('dataframe', pytest.importorskip('pandas').DataFrame),
     ('series', pytest.importorskip('pandas').Series),
     ('index', pytest.importorskip('pandas').Index),
     ('slice', slice)]
)
def test_convert_container(constructor_name, container_type):
    container = [0, 1]
    assert isinstance(_convert_container(container, constructor_name),
                      container_type)
697
venv/Lib/site-packages/sklearn/utils/tests/test_utils.py
Normal file
@@ -0,0 +1,697 @@
from copy import copy
from itertools import chain
import warnings
import string
import timeit

import pytest
import numpy as np
import scipy.sparse as sp

from sklearn.utils._testing import (assert_array_equal,
                                    assert_allclose_dense_sparse,
                                    assert_warns_message,
                                    assert_no_warnings,
                                    _convert_container)
from sklearn.utils import check_random_state
from sklearn.utils import _determine_key_type
from sklearn.utils import deprecated
from sklearn.utils import gen_batches
from sklearn.utils import _get_column_indices
from sklearn.utils import resample
from sklearn.utils import safe_mask
from sklearn.utils import column_or_1d
from sklearn.utils import _safe_indexing
from sklearn.utils import shuffle
from sklearn.utils import gen_even_slices
from sklearn.utils import _message_with_time, _print_elapsed_time
from sklearn.utils import get_chunk_n_rows
from sklearn.utils import is_scalar_nan
from sklearn.utils import _to_object_array
from sklearn.utils._mocking import MockDataFrame
from sklearn import config_context

# toy array
X_toy = np.arange(9).reshape((3, 3))


def test_make_rng():
    # Check the check_random_state utility function behavior
    assert check_random_state(None) is np.random.mtrand._rand
    assert check_random_state(np.random) is np.random.mtrand._rand

    rng_42 = np.random.RandomState(42)
    assert check_random_state(42).randint(100) == rng_42.randint(100)

    rng_42 = np.random.RandomState(42)
    assert check_random_state(rng_42) is rng_42

    rng_42 = np.random.RandomState(42)
    assert check_random_state(43).randint(100) != rng_42.randint(100)

    with pytest.raises(ValueError):
        check_random_state("some invalid seed")


def test_gen_batches():
    # Make sure gen_batches errors on invalid batch_size

    assert_array_equal(
        list(gen_batches(4, 2)),
        [slice(0, 2, None), slice(2, 4, None)]
    )
    msg_zero = "gen_batches got batch_size=0, must be positive"
    with pytest.raises(ValueError, match=msg_zero):
        next(gen_batches(4, 0))

    msg_float = "gen_batches got batch_size=0.5, must be an integer"
    with pytest.raises(TypeError, match=msg_float):
        next(gen_batches(4, 0.5))


def test_deprecated():
    # Test whether the deprecated decorator issues appropriate warnings
    # Copied almost verbatim from https://docs.python.org/library/warnings.html

    # First a function...
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")

        @deprecated()
        def ham():
            return "spam"

        spam = ham()

        assert spam == "spam"  # function must remain usable

        assert len(w) == 1
        assert issubclass(w[0].category, FutureWarning)
        assert "deprecated" in str(w[0].message).lower()

    # ... then a class.
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")

        @deprecated("don't use this")
        class Ham:
            SPAM = 1

        ham = Ham()

        assert hasattr(ham, "SPAM")

        assert len(w) == 1
        assert issubclass(w[0].category, FutureWarning)
        assert "deprecated" in str(w[0].message).lower()


def test_resample():
    # Border case not worth mentioning in doctests
    assert resample() is None

    # Check that invalid arguments yield ValueError
    with pytest.raises(ValueError):
        resample([0], [0, 1])
    with pytest.raises(ValueError):
        resample([0, 1], [0, 1], replace=False, n_samples=3)

    with pytest.raises(ValueError):
        resample([0, 1], [0, 1], meaning_of_life=42)
    # Issue:6581, n_samples can be more when replace is True (default).
    assert len(resample([1, 2], n_samples=5)) == 5


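# Editor's illustration, not part of the vendored file: a minimal sketch of
# a bootstrap draw with resample; with replace=True (the default), n_samples
# may exceed the input length. Toy values are assumptions.
def _demo_resample_bootstrap():
    sample = resample([1, 2, 3, 4, 5], n_samples=8, random_state=0)
    assert len(sample) == 8

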
def test_resample_stratified():
    # Make sure resample can stratify
    rng = np.random.RandomState(0)
    n_samples = 100
    p = .9
    X = rng.normal(size=(n_samples, 1))
    y = rng.binomial(1, p, size=n_samples)

    _, y_not_stratified = resample(X, y, n_samples=10, random_state=0,
                                   stratify=None)
    assert np.all(y_not_stratified == 1)

    _, y_stratified = resample(X, y, n_samples=10, random_state=0, stratify=y)
    assert not np.all(y_stratified == 1)
    assert np.sum(y_stratified) == 9  # nine 1s, one 0


def test_resample_stratified_replace():
    # Make sure stratified resampling supports the replace parameter
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 1))
    y = rng.randint(0, 2, size=n_samples)

    X_replace, _ = resample(X, y, replace=True, n_samples=50,
                            random_state=rng, stratify=y)
    X_no_replace, _ = resample(X, y, replace=False, n_samples=50,
                               random_state=rng, stratify=y)
    assert np.unique(X_replace).shape[0] < 50
    assert np.unique(X_no_replace).shape[0] == 50

    # make sure n_samples can be greater than X.shape[0] if we sample with
    # replacement
    X_replace, _ = resample(X, y, replace=True, n_samples=1000,
                            random_state=rng, stratify=y)
    assert X_replace.shape[0] == 1000
    assert np.unique(X_replace).shape[0] == 100


def test_resample_stratify_2dy():
    # Make sure y can be 2d when stratifying
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 1))
    y = rng.randint(0, 2, size=(n_samples, 2))
    X, y = resample(X, y, n_samples=50, random_state=rng, stratify=y)
    assert y.ndim == 2


def test_resample_stratify_sparse_error():
    # the stratify parameter must be an ndarray, not a sparse matrix
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 2))
    y = rng.randint(0, 2, size=n_samples)
    stratify = sp.csr_matrix(y)
    with pytest.raises(TypeError, match='A sparse matrix was passed'):
        X, y = resample(X, y, n_samples=50, random_state=rng,
                        stratify=stratify)


def test_safe_mask():
    random_state = check_random_state(0)
    X = random_state.rand(5, 4)
    X_csr = sp.csr_matrix(X)
    mask = [False, False, True, True, True]

    mask = safe_mask(X, mask)
    assert X[mask].shape[0] == 3

    mask = safe_mask(X_csr, mask)
    assert X_csr[mask].shape[0] == 3


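# Editor's illustration, not part of the vendored file: a minimal sketch of
# safe_mask, which converts a boolean mask into integer indices when the
# input is sparse, since sparse matrices do not support boolean row masks.
# Toy values are assumptions.
def _demo_safe_mask():
    X_sparse = sp.csr_matrix(np.eye(4))
    mask = np.array([True, False, True, False])
    assert X_sparse[safe_mask(X_sparse, mask)].shape == (2, 4)

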
def test_column_or_1d():
    EXAMPLES = [
        ("binary", ["spam", "egg", "spam"]),
        ("binary", [0, 1, 0, 1]),
        ("continuous", np.arange(10) / 20.),
        ("multiclass", [1, 2, 3]),
        ("multiclass", [0, 1, 2, 2, 0]),
        ("multiclass", [[1], [2], [3]]),
        ("multilabel-indicator", [[0, 1, 0], [0, 0, 1]]),
        ("multiclass-multioutput", [[1, 2, 3]]),
        ("multiclass-multioutput", [[1, 1], [2, 2], [3, 1]]),
        ("multiclass-multioutput", [[5, 1], [4, 2], [3, 1]]),
        ("multiclass-multioutput", [[1, 2, 3]]),
        ("continuous-multioutput", np.arange(30).reshape((-1, 3))),
    ]

    for y_type, y in EXAMPLES:
        if y_type in ["binary", 'multiclass', "continuous"]:
            assert_array_equal(column_or_1d(y), np.ravel(y))
        else:
            with pytest.raises(ValueError):
                column_or_1d(y)


@pytest.mark.parametrize(
    "key, dtype",
    [(0, 'int'),
     ('0', 'str'),
     (True, 'bool'),
     (np.bool_(True), 'bool'),
     ([0, 1, 2], 'int'),
     (['0', '1', '2'], 'str'),
     ((0, 1, 2), 'int'),
     (('0', '1', '2'), 'str'),
     (slice(None, None), None),
     (slice(0, 2), 'int'),
     (np.array([0, 1, 2], dtype=np.int32), 'int'),
     (np.array([0, 1, 2], dtype=np.int64), 'int'),
     (np.array([0, 1, 2], dtype=np.uint8), 'int'),
     ([True, False], 'bool'),
     ((True, False), 'bool'),
     (np.array([True, False]), 'bool'),
     ('col_0', 'str'),
     (['col_0', 'col_1', 'col_2'], 'str'),
     (('col_0', 'col_1', 'col_2'), 'str'),
     (slice('begin', 'end'), 'str'),
     (np.array(['col_0', 'col_1', 'col_2']), 'str'),
     (np.array(['col_0', 'col_1', 'col_2'], dtype=object), 'str')]
)
def test_determine_key_type(key, dtype):
    assert _determine_key_type(key) == dtype


def test_determine_key_type_error():
    with pytest.raises(ValueError, match="No valid specification of the"):
        _determine_key_type(1.0)


def test_determine_key_type_slice_error():
    with pytest.raises(TypeError, match="Only array-like or scalar are"):
        _determine_key_type(slice(0, 2, 1), accept_slice=False)


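# Editor's illustration, not part of the vendored file: a minimal sketch of
# _safe_indexing, the private helper exercised by the tests below, which
# selects rows or columns uniformly across container types. Toy values are
# assumptions.
def _demo_safe_indexing():
    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    assert_array_equal(_safe_indexing(X, [1, 2], axis=0), X[[1, 2]])
    assert_array_equal(_safe_indexing(X, [0, 2], axis=1), X[:, [0, 2]])

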
@pytest.mark.parametrize(
    "array_type", ["list", "array", "sparse", "dataframe"]
)
@pytest.mark.parametrize(
    "indices_type", ["list", "tuple", "array", "series", "slice"]
)
def test_safe_indexing_2d_container_axis_0(array_type, indices_type):
    indices = [1, 2]
    if indices_type == 'slice' and isinstance(indices[1], int):
        indices[1] += 1
    array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
    indices = _convert_container(indices, indices_type)
    subset = _safe_indexing(array, indices, axis=0)
    assert_allclose_dense_sparse(
        subset, _convert_container([[4, 5, 6], [7, 8, 9]], array_type)
    )


@pytest.mark.parametrize("array_type", ["list", "array", "series"])
|
||||
@pytest.mark.parametrize(
|
||||
"indices_type", ["list", "tuple", "array", "series", "slice"]
|
||||
)
|
||||
def test_safe_indexing_1d_container(array_type, indices_type):
|
||||
indices = [1, 2]
|
||||
if indices_type == 'slice' and isinstance(indices[1], int):
|
||||
indices[1] += 1
|
||||
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
|
||||
indices = _convert_container(indices, indices_type)
|
||||
subset = _safe_indexing(array, indices, axis=0)
|
||||
assert_allclose_dense_sparse(
|
||||
subset, _convert_container([2, 3], array_type)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"])
|
||||
@pytest.mark.parametrize(
|
||||
"indices_type", ["list", "tuple", "array", "series", "slice"]
|
||||
)
|
||||
@pytest.mark.parametrize("indices", [[1, 2], ["col_1", "col_2"]])
|
||||
def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
|
||||
# validation of the indices
|
||||
# we make a copy because indices is mutable and shared between tests
|
||||
indices_converted = copy(indices)
|
||||
if indices_type == 'slice' and isinstance(indices[1], int):
|
||||
indices_converted[1] += 1
|
||||
|
||||
columns_name = ['col_0', 'col_1', 'col_2']
|
||||
array = _convert_container(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
|
||||
)
|
||||
indices_converted = _convert_container(indices_converted, indices_type)
|
||||
|
||||
if isinstance(indices[0], str) and array_type != 'dataframe':
|
||||
err_msg = ("Specifying the columns using strings is only supported "
|
||||
"for pandas DataFrames")
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_safe_indexing(array, indices_converted, axis=1)
|
||||
else:
|
||||
subset = _safe_indexing(array, indices_converted, axis=1)
|
||||
assert_allclose_dense_sparse(
|
||||
subset, _convert_container([[2, 3], [5, 6], [8, 9]], array_type)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_read_only", [True, False])
|
||||
@pytest.mark.parametrize("indices_read_only", [True, False])
|
||||
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"])
|
||||
@pytest.mark.parametrize("indices_type", ["array", "series"])
|
||||
@pytest.mark.parametrize(
|
||||
"axis, expected_array",
|
||||
[(0, [[4, 5, 6], [7, 8, 9]]), (1, [[2, 3], [5, 6], [8, 9]])]
|
||||
)
|
||||
def test_safe_indexing_2d_read_only_axis_1(array_read_only, indices_read_only,
|
||||
array_type, indices_type, axis,
|
||||
expected_array):
|
||||
array = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
|
||||
if array_read_only:
|
||||
array.setflags(write=False)
|
||||
array = _convert_container(array, array_type)
|
||||
indices = np.array([1, 2])
|
||||
if indices_read_only:
|
||||
indices.setflags(write=False)
|
||||
indices = _convert_container(indices, indices_type)
|
||||
subset = _safe_indexing(array, indices, axis=axis)
|
||||
assert_allclose_dense_sparse(
|
||||
subset, _convert_container(expected_array, array_type)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_type", ["list", "array", "series"])
|
||||
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
|
||||
def test_safe_indexing_1d_container_mask(array_type, indices_type):
|
||||
indices = [False] + [True] * 2 + [False] * 6
|
||||
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
|
||||
indices = _convert_container(indices, indices_type)
|
||||
subset = _safe_indexing(array, indices, axis=0)
|
||||
assert_allclose_dense_sparse(
|
||||
subset, _convert_container([2, 3], array_type)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"])
|
||||
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
|
||||
@pytest.mark.parametrize(
|
||||
"axis, expected_subset",
|
||||
[(0, [[4, 5, 6], [7, 8, 9]]),
|
||||
(1, [[2, 3], [5, 6], [8, 9]])]
|
||||
)
|
||||
def test_safe_indexing_2d_mask(array_type, indices_type, axis,
|
||||
expected_subset):
|
||||
columns_name = ['col_0', 'col_1', 'col_2']
|
||||
array = _convert_container(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
|
||||
)
|
||||
indices = [False, True, True]
|
||||
indices = _convert_container(indices, indices_type)
|
||||
|
||||
subset = _safe_indexing(array, indices, axis=axis)
|
||||
assert_allclose_dense_sparse(
|
||||
subset, _convert_container(expected_subset, array_type)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "array_type, expected_output_type",
    [("list", "list"), ("array", "array"),
     ("sparse", "sparse"), ("dataframe", "series")]
)
def test_safe_indexing_2d_scalar_axis_0(array_type, expected_output_type):
    array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
    indices = 2
    subset = _safe_indexing(array, indices, axis=0)
    expected_array = _convert_container([7, 8, 9], expected_output_type)
    assert_allclose_dense_sparse(subset, expected_array)


@pytest.mark.parametrize("array_type", ["list", "array", "series"])
|
||||
def test_safe_indexing_1d_scalar(array_type):
|
||||
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
|
||||
indices = 2
|
||||
subset = _safe_indexing(array, indices, axis=0)
|
||||
assert subset == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "array_type, expected_output_type",
    [("array", "array"), ("sparse", "sparse"), ("dataframe", "series")]
)
@pytest.mark.parametrize("indices", [2, "col_2"])
def test_safe_indexing_2d_scalar_axis_1(array_type, expected_output_type,
                                        indices):
    columns_name = ['col_0', 'col_1', 'col_2']
    array = _convert_container(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
    )

    if isinstance(indices, str) and array_type != 'dataframe':
        err_msg = ("Specifying the columns using strings is only supported "
                   "for pandas DataFrames")
        with pytest.raises(ValueError, match=err_msg):
            _safe_indexing(array, indices, axis=1)
    else:
        subset = _safe_indexing(array, indices, axis=1)
        expected_output = [3, 6, 9]
        if expected_output_type == 'sparse':
            # sparse matrices keep the 2D shape
            expected_output = [[3], [6], [9]]
        expected_array = _convert_container(
            expected_output, expected_output_type
        )
        assert_allclose_dense_sparse(subset, expected_array)


@pytest.mark.parametrize("array_type", ["list", "array", "sparse"])
|
||||
def test_safe_indexing_None_axis_0(array_type):
|
||||
X = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
|
||||
X_subset = _safe_indexing(X, None, axis=0)
|
||||
assert_allclose_dense_sparse(X_subset, X)
|
||||
|
||||
|
||||
def test_safe_indexing_pandas_no_matching_cols_error():
    pd = pytest.importorskip('pandas')
    err_msg = "No valid specification of the columns."
    X = pd.DataFrame(X_toy)
    with pytest.raises(ValueError, match=err_msg):
        _safe_indexing(X, [1.0], axis=1)


@pytest.mark.parametrize("axis", [None, 3])
def test_safe_indexing_error_axis(axis):
    with pytest.raises(ValueError, match="'axis' should be either 0"):
        _safe_indexing(X_toy, [0, 1], axis=axis)


@pytest.mark.parametrize("X_constructor", ['array', 'series'])
|
||||
def test_safe_indexing_1d_array_error(X_constructor):
|
||||
# check that we are raising an error if the array-like passed is 1D and
|
||||
# we try to index on the 2nd dimension
|
||||
X = list(range(5))
|
||||
if X_constructor == 'array':
|
||||
X_constructor = np.asarray(X)
|
||||
elif X_constructor == 'series':
|
||||
pd = pytest.importorskip("pandas")
|
||||
X_constructor = pd.Series(X)
|
||||
|
||||
err_msg = "'X' should be a 2D NumPy array, 2D sparse matrix or pandas"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_safe_indexing(X_constructor, [0, 1], axis=1)
|
||||
|
||||
|
||||
def test_safe_indexing_container_axis_0_unsupported_type():
    indices = ["col_1", "col_2"]
    array = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    err_msg = "String indexing is not supported with 'axis=0'"
    with pytest.raises(ValueError, match=err_msg):
        _safe_indexing(array, indices, axis=0)


@pytest.mark.parametrize(
    "key, err_msg",
    [(10, r"all features must be in \[0, 2\]"),
     ('whatever', 'A given column is not a column of the dataframe')]
)
def test_get_column_indices_error(key, err_msg):
    pd = pytest.importorskip("pandas")
    X_df = pd.DataFrame(X_toy, columns=['col_0', 'col_1', 'col_2'])

    with pytest.raises(ValueError, match=err_msg):
        _get_column_indices(X_df, key)


@pytest.mark.parametrize(
    "key",
    [['col1'], ['col2'], ['col1', 'col2'], ['col1', 'col3'], ['col2', 'col3']]
)
def test_get_column_indices_pandas_nonunique_columns_error(key):
    pd = pytest.importorskip('pandas')
    toy = np.zeros((1, 5), dtype=int)
    columns = ['col1', 'col1', 'col2', 'col3', 'col2']
    X = pd.DataFrame(toy, columns=columns)

    err_msg = "Selected columns, {}, are not unique in dataframe".format(key)
    with pytest.raises(ValueError) as exc_info:
        _get_column_indices(X, key)
    assert str(exc_info.value) == err_msg


def test_shuffle_on_ndim_equals_three():
    def to_tuple(A):    # to make the inner arrays hashable
        return tuple(tuple(tuple(C) for C in B) for B in A)

    A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])  # A.shape = (2,2,2)
    S = set(to_tuple(A))
    shuffle(A)  # shouldn't raise a ValueError for dim = 3
    assert set(to_tuple(A)) == S


def test_shuffle_dont_convert_to_array():
    # Check that shuffle does not try to convert to numpy arrays with float
    # dtypes and lets any indexable datastructure pass through.
    a = ['a', 'b', 'c']
    b = np.array(['a', 'b', 'c'], dtype=object)
    c = [1, 2, 3]
    d = MockDataFrame(np.array([['a', 0],
                                ['b', 1],
                                ['c', 2]],
                               dtype=object))
    e = sp.csc_matrix(np.arange(6).reshape(3, 2))
    a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0)

    assert a_s == ['c', 'b', 'a']
    assert type(a_s) == list

    assert_array_equal(b_s, ['c', 'b', 'a'])
    assert b_s.dtype == object

    assert c_s == [3, 2, 1]
    assert type(c_s) == list

    assert_array_equal(d_s, np.array([['c', 2],
                                      ['b', 1],
                                      ['a', 0]],
                                     dtype=object))
    assert type(d_s) == MockDataFrame

    assert_array_equal(e_s.toarray(), np.array([[4, 5],
                                                [2, 3],
                                                [0, 1]]))


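# Editor's illustration, not part of the vendored file: a minimal sketch
# showing that shuffle applies the same permutation to every container it
# receives. Toy values are assumptions.
def _demo_shuffle_consistent_permutation():
    a, b = shuffle([1, 2, 3], ['a', 'b', 'c'], random_state=0)
    order = {1: 'a', 2: 'b', 3: 'c'}
    assert [order[x] for x in a] == b

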
def test_gen_even_slices():
    # check that gen_even_slices contains all samples
    some_range = range(10)
    joined_range = list(chain(*[some_range[slice] for slice in
                                gen_even_slices(10, 3)]))
    assert_array_equal(some_range, joined_range)

    # check that passing a negative n_packs raises an error
    slices = gen_even_slices(10, -1)
    with pytest.raises(ValueError, match="gen_even_slices got n_packs=-1,"
                                         " must be >=1"):
        next(slices)


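# Editor's illustration, not part of the vendored file: a minimal sketch of
# gen_even_slices, which splits n samples into n_packs contiguous slices of
# nearly equal size (earlier packs absorb the remainder).
def _demo_gen_even_slices():
    assert list(gen_even_slices(10, 3)) == [
        slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]

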
@pytest.mark.parametrize(
    ('row_bytes', 'max_n_rows', 'working_memory', 'expected', 'warning'),
    [(1024, None, 1, 1024, None),
     (1024, None, 0.99999999, 1023, None),
     (1023, None, 1, 1025, None),
     (1025, None, 1, 1023, None),
     (1024, None, 2, 2048, None),
     (1024, 7, 1, 7, None),
     (1024 * 1024, None, 1, 1, None),
     (1024 * 1024 + 1, None, 1, 1,
      'Could not adhere to working_memory config. '
      'Currently 1MiB, 2MiB required.'),
     ])
def test_get_chunk_n_rows(row_bytes, max_n_rows, working_memory,
                          expected, warning):
    if warning is not None:
        def check_warning(*args, **kw):
            return assert_warns_message(UserWarning, warning, *args, **kw)
    else:
        check_warning = assert_no_warnings

    actual = check_warning(get_chunk_n_rows,
                           row_bytes=row_bytes,
                           max_n_rows=max_n_rows,
                           working_memory=working_memory)

    assert actual == expected
    assert type(actual) is type(expected)
    with config_context(working_memory=working_memory):
        actual = check_warning(get_chunk_n_rows,
                               row_bytes=row_bytes,
                               max_n_rows=max_n_rows)
        assert actual == expected
        assert type(actual) is type(expected)


@pytest.mark.parametrize(
    ['source', 'message', 'is_long'],
    [
        ('ABC', string.ascii_lowercase, False),
        ('ABCDEF', string.ascii_lowercase, False),
        ('ABC', string.ascii_lowercase * 3, True),
        ('ABC' * 10, string.ascii_lowercase, True),
        ('ABC', string.ascii_lowercase + u'\u1048', False),
    ])
@pytest.mark.parametrize(
    ['time', 'time_str'],
    [
        (0.2, '   0.2s'),
        (20, '  20.0s'),
        (2000, '33.3min'),
        (20000, '333.3min'),
    ])
def test_message_with_time(source, message, is_long, time, time_str):
    out = _message_with_time(source, message, time)
    if is_long:
        assert len(out) > 70
    else:
        assert len(out) == 70

    assert out.startswith('[' + source + '] ')
    out = out[len(source) + 3:]

    assert out.endswith(time_str)
    out = out[:-len(time_str)]
    assert out.endswith(', total=')
    out = out[:-len(', total=')]
    assert out.endswith(message)
    out = out[:-len(message)]
    assert out.endswith(' ')
    out = out[:-1]

    if is_long:
        assert not out
    else:
        assert list(set(out)) == ['.']


@pytest.mark.parametrize(
    ['message', 'expected'],
    [
        ('hello', _message_with_time('ABC', 'hello', 0.1) + '\n'),
        ('', _message_with_time('ABC', '', 0.1) + '\n'),
        (None, ''),
    ])
def test_print_elapsed_time(message, expected, capsys, monkeypatch):
    monkeypatch.setattr(timeit, 'default_timer', lambda: 0)
    with _print_elapsed_time('ABC', message):
        monkeypatch.setattr(timeit, 'default_timer', lambda: 0.1)
    assert capsys.readouterr().out == expected


@pytest.mark.parametrize("value, result", [(float("nan"), True),
|
||||
(np.nan, True),
|
||||
(np.float("nan"), True),
|
||||
(np.float32("nan"), True),
|
||||
(np.float64("nan"), True),
|
||||
(0, False),
|
||||
(0., False),
|
||||
(None, False),
|
||||
("", False),
|
||||
("nan", False),
|
||||
([np.nan], False)])
|
||||
def test_is_scalar_nan(value, result):
|
||||
assert is_scalar_nan(value) is result
|
||||
|
||||
|
||||
def dummy_func():
    pass


def test_deprecation_joblib_api(tmpdir):

    # Only parallel_backend and register_parallel_backend are not deprecated in
    # sklearn.utils
    from sklearn.utils import parallel_backend, register_parallel_backend
    assert_no_warnings(parallel_backend, 'loky', None)
    assert_no_warnings(register_parallel_backend, 'failing', None)

    from sklearn.utils._joblib import joblib
    del joblib.parallel.BACKENDS['failing']


@pytest.mark.parametrize(
    "sequence",
    [[np.array(1), np.array(2)], [[1, 2], [3, 4]]]
)
def test_to_object_array(sequence):
    out = _to_object_array(sequence)
    assert isinstance(out, np.ndarray)
    assert out.dtype.kind == 'O'
    assert out.ndim == 1
1215
venv/Lib/site-packages/sklearn/utils/tests/test_validation.py
Normal file
File diff suppressed because it is too large