Uploaded Test files

Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions

View file

@@ -0,0 +1,10 @@
import pytest
import sklearn
@pytest.fixture
def print_changed_only_false():
sklearn.set_config(print_changed_only=False)
yield
sklearn.set_config(print_changed_only=True) # reset to default
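# Editorial sketch (hypothetical test, not part of the committed file):
# pytest injects the fixture above by argument name, so set_config wraps
# the test body and full reprs are shown while the test runs.
from sklearn.linear_model import LogisticRegression
def test_repr_shows_defaults_sketch(print_changed_only_false):
    # With print_changed_only=False, unchanged defaults appear in the repr.
    assert "penalty=" in repr(LogisticRegression())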

View file

@@ -0,0 +1,266 @@
import numpy as np
import pytest
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_almost_equal
def test_compute_class_weight():
# Test (and demo) compute_class_weight.
y = np.asarray([2, 2, 2, 3, 3, 4])
classes = np.unique(y)
cw = compute_class_weight("balanced", classes=classes, y=y)
# total effect of samples is preserved
class_counts = np.bincount(y)[2:]
assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
assert cw[0] < cw[1] < cw[2]
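# Editorial sketch: the "balanced" heuristic assigns each class the weight
# n_samples / (n_classes * count(class)), which is why np.dot(cw,
# class_counts) recovers n_samples above. A self-contained check of the
# formula using only the helpers already imported in this file:
def test_balanced_formula_sketch():
    y = np.asarray([2, 2, 2, 3, 3, 4])
    counts = np.bincount(y)[2:]   # [3, 2, 1]
    cw = compute_class_weight("balanced", classes=np.unique(y), y=y)
    assert_array_almost_equal(cw, y.shape[0] / (3 * counts))  # [2/3, 1, 2]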
def test_compute_class_weight_not_present():
# Raise error when y does not contain all class labels
classes = np.arange(4)
y = np.asarray([0, 0, 0, 1, 1, 2])
with pytest.raises(ValueError):
compute_class_weight("balanced", classes=classes, y=y)
# Fix exception in error message formatting when missing label is a string
# https://github.com/scikit-learn/scikit-learn/issues/8312
with pytest.raises(ValueError,
match="Class label label_not_present not present"):
compute_class_weight({"label_not_present": 1.}, classes=classes, y=y)
# Raise error when y has items not in classes
classes = np.arange(2)
with pytest.raises(ValueError):
compute_class_weight("balanced", classes=classes, y=y)
with pytest.raises(ValueError):
compute_class_weight({0: 1., 1: 2.}, classes=classes, y=y)
def test_compute_class_weight_dict():
classes = np.arange(3)
class_weights = {0: 1.0, 1: 2.0, 2: 3.0}
y = np.asarray([0, 0, 1, 2])
cw = compute_class_weight(class_weights, classes=classes, y=y)
# When the user specifies class weights, compute_class_weights should just
# return them.
assert_array_almost_equal(np.asarray([1.0, 2.0, 3.0]), cw)
# When a class weight is specified that isn't in classes, a ValueError
# should get raised
msg = 'Class label 4 not present.'
class_weights = {0: 1.0, 1: 2.0, 2: 3.0, 4: 1.5}
with pytest.raises(ValueError, match=msg):
compute_class_weight(class_weights, classes=classes, y=y)
msg = 'Class label -1 not present.'
class_weights = {-1: 5.0, 0: 1.0, 1: 2.0, 2: 3.0}
with pytest.raises(ValueError, match=msg):
compute_class_weight(class_weights, classes=classes, y=y)
def test_compute_class_weight_invariance():
    # Test that results with class_weight="balanced" are invariant wrt
    # class imbalance if the number of samples is identical.
# The test uses a balanced two class dataset with 100 datapoints.
# It creates three versions, one where class 1 is duplicated
# resulting in 150 points of class 1 and 50 of class 0,
# one where there are 50 points in class 1 and 150 in class 0,
# and one where there are 100 points of each class (this one is balanced
# again).
# With balancing class weights, all three should give the same model.
X, y = make_blobs(centers=2, random_state=0)
# create dataset where class 1 is duplicated twice
X_1 = np.vstack([X] + [X[y == 1]] * 2)
y_1 = np.hstack([y] + [y[y == 1]] * 2)
# create dataset where class 0 is duplicated twice
X_0 = np.vstack([X] + [X[y == 0]] * 2)
y_0 = np.hstack([y] + [y[y == 0]] * 2)
# duplicate everything
X_ = np.vstack([X] * 2)
y_ = np.hstack([y] * 2)
# results should be identical
logreg1 = LogisticRegression(class_weight="balanced").fit(X_1, y_1)
logreg0 = LogisticRegression(class_weight="balanced").fit(X_0, y_0)
logreg = LogisticRegression(class_weight="balanced").fit(X_, y_)
assert_array_almost_equal(logreg1.coef_, logreg0.coef_)
assert_array_almost_equal(logreg.coef_, logreg0.coef_)
def test_compute_class_weight_balanced_negative():
# Test compute_class_weight when labels are negative
# Test with balanced class labels.
classes = np.array([-2, -1, 0])
y = np.asarray([-1, -1, 0, 0, -2, -2])
cw = compute_class_weight("balanced", classes=classes, y=y)
assert len(cw) == len(classes)
assert_array_almost_equal(cw, np.array([1., 1., 1.]))
# Test with unbalanced class labels.
y = np.asarray([-1, 0, 0, -2, -2, -2])
cw = compute_class_weight("balanced", classes=classes, y=y)
assert len(cw) == len(classes)
class_counts = np.bincount(y + 2)
assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
assert_array_almost_equal(cw, [2. / 3, 2., 1.])
def test_compute_class_weight_balanced_unordered():
# Test compute_class_weight when classes are unordered
classes = np.array([1, 0, 3])
y = np.asarray([1, 0, 0, 3, 3, 3])
cw = compute_class_weight("balanced", classes=classes, y=y)
class_counts = np.bincount(y)[classes]
assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
assert_array_almost_equal(cw, [2., 1., 2. / 3])
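# Editorial note: the same formula applies with unordered classes.
# np.bincount(y) is [2, 1, 0, 3]; indexing it by classes = [1, 0, 3] gives
# counts [1, 2, 3], and 6 / (3 * [1, 2, 3]) reproduces [2., 1., 2/3].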
def test_compute_class_weight_default():
# Test for the case where no weight is given for a present class.
# Current behaviour is to assign the unweighted classes a weight of 1.
y = np.asarray([2, 2, 2, 3, 3, 4])
classes = np.unique(y)
classes_len = len(classes)
    # Test for unspecified weights
cw = compute_class_weight(None, classes=classes, y=y)
assert len(cw) == classes_len
assert_array_almost_equal(cw, np.ones(3))
# Tests for partly specified weights
cw = compute_class_weight({2: 1.5}, classes=classes, y=y)
assert len(cw) == classes_len
assert_array_almost_equal(cw, [1.5, 1., 1.])
cw = compute_class_weight({2: 1.5, 4: 0.5}, classes=classes, y=y)
assert len(cw) == classes_len
assert_array_almost_equal(cw, [1.5, 1., 0.5])
def test_compute_sample_weight():
# Test (and demo) compute_sample_weight.
# Test with balanced classes
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])
# Test with user-defined weights
sample_weight = compute_sample_weight({1: 2, 2: 1}, y)
assert_array_almost_equal(sample_weight, [2., 2., 2., 1., 1., 1.])
# Test with column vector of balanced classes
y = np.asarray([[1], [1], [1], [2], [2], [2]])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])
# Test with unbalanced classes
y = np.asarray([1, 1, 1, 2, 2, 2, 3])
sample_weight = compute_sample_weight("balanced", y)
expected_balanced = np.array([0.7777, 0.7777, 0.7777, 0.7777, 0.7777,
0.7777, 2.3333])
assert_array_almost_equal(sample_weight, expected_balanced, decimal=4)
# Test with `None` weights
sample_weight = compute_sample_weight(None, y)
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 1.])
# Test with multi-output of balanced classes
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])
# Test with multi-output with user-defined weights
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
sample_weight = compute_sample_weight([{1: 2, 2: 1}, {0: 1, 1: 2}], y)
assert_array_almost_equal(sample_weight, [2., 2., 2., 2., 2., 2.])
# Test with multi-output of unbalanced classes
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [3, -1]])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, expected_balanced ** 2, decimal=3)
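# Editorial note: the unbalanced expectation above follows from the same
# formula. With n_samples = 7, n_classes = 3 and class counts [3, 3, 1],
# the class weights are 7 / (3 * 3) ~= 0.7777 (classes 1 and 2) and
# 7 / (3 * 1) ~= 2.3333 (class 3); compute_sample_weight then maps each
# sample to the weight of its class.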
def test_compute_sample_weight_with_subsample():
# Test compute_sample_weight with subsamples specified.
# Test with balanced classes and all samples present
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])
# Test with column vector of balanced classes and all samples present
y = np.asarray([[1], [1], [1], [2], [2], [2]])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])
# Test with a subsample
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y, indices=range(4))
assert_array_almost_equal(sample_weight, [2. / 3, 2. / 3,
2. / 3, 2., 2., 2.])
# Test with a bootstrap subsample
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y,
indices=[0, 1, 1, 2, 2, 3])
expected_balanced = np.asarray([0.6, 0.6, 0.6, 3., 3., 3.])
assert_array_almost_equal(sample_weight, expected_balanced)
# Test with a bootstrap subsample for multi-output
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
sample_weight = compute_sample_weight("balanced", y,
indices=[0, 1, 1, 2, 2, 3])
assert_array_almost_equal(sample_weight, expected_balanced ** 2)
# Test with a missing class
y = np.asarray([1, 1, 1, 2, 2, 2, 3])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.])
# Test with a missing class for multi-output
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.])
def test_compute_sample_weight_errors():
# Test compute_sample_weight raises errors expected.
# Invalid preset string
y = np.asarray([1, 1, 1, 2, 2, 2])
y_ = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
with pytest.raises(ValueError):
compute_sample_weight("ni", y)
with pytest.raises(ValueError):
compute_sample_weight("ni", y, indices=range(4))
with pytest.raises(ValueError):
compute_sample_weight("ni", y_)
with pytest.raises(ValueError):
compute_sample_weight("ni", y_, indices=range(4))
# Not "balanced" for subsample
with pytest.raises(ValueError):
compute_sample_weight({1: 2, 2: 1}, y, indices=range(4))
# Not a list or preset for multi-output
with pytest.raises(ValueError):
compute_sample_weight({1: 2, 2: 1}, y_)
# Incorrect length list for multi-output
with pytest.raises(ValueError):
compute_sample_weight([{1: 2, 2: 1}], y_)
def test_compute_sample_weight_more_than_32():
# Non-regression smoke test for #12146
y = np.arange(50) # more than 32 distinct classes
indices = np.arange(50) # use subsampling
weight = compute_sample_weight('balanced', y, indices=indices)
assert_array_almost_equal(weight, np.ones(y.shape[0]))

View file

@@ -0,0 +1,229 @@
import pytest
import numpy as np
from sklearn.utils._testing import assert_allclose
from sklearn.utils._cython_blas import _dot_memview
from sklearn.utils._cython_blas import _asum_memview
from sklearn.utils._cython_blas import _axpy_memview
from sklearn.utils._cython_blas import _nrm2_memview
from sklearn.utils._cython_blas import _copy_memview
from sklearn.utils._cython_blas import _scal_memview
from sklearn.utils._cython_blas import _rotg_memview
from sklearn.utils._cython_blas import _rot_memview
from sklearn.utils._cython_blas import _gemv_memview
from sklearn.utils._cython_blas import _ger_memview
from sklearn.utils._cython_blas import _gemm_memview
from sklearn.utils._cython_blas import RowMajor, ColMajor
from sklearn.utils._cython_blas import Trans, NoTrans
def _numpy_to_cython(dtype):
cython = pytest.importorskip("cython")
if dtype == np.float32:
return cython.float
elif dtype == np.float64:
return cython.double
RTOL = {np.float32: 1e-6, np.float64: 1e-12}
ORDER = {RowMajor: 'C', ColMajor: 'F'}
def _no_op(x):
return x
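# Editorial sketch, a rough Python analogy (not the real Cython mechanism):
# indexing a fused-typed function such as _dot_memview[_numpy_to_cython(dtype)]
# selects the dtype-specific specialization, much like picking a kernel out
# of a mapping keyed by dtype.
def test_dtype_dispatch_sketch():
    impls = {np.float32: lambda a, b: np.float32(a @ b),
             np.float64: lambda a, b: np.float64(a @ b)}
    x = np.ones(3, dtype=np.float32)
    assert impls[x.dtype.type](x, x) == 3.0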
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_dot(dtype):
dot = _dot_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(10).astype(dtype, copy=False)
expected = x.dot(y)
actual = dot(x, y)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_asum(dtype):
asum = _asum_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
expected = np.abs(x).sum()
actual = asum(x)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_axpy(dtype):
axpy = _axpy_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(10).astype(dtype, copy=False)
alpha = 2.5
expected = alpha * x + y
axpy(alpha, x, y)
assert_allclose(y, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_nrm2(dtype):
nrm2 = _nrm2_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
expected = np.linalg.norm(x)
actual = nrm2(x)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_copy(dtype):
copy = _copy_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = np.empty_like(x)
expected = x.copy()
copy(x, y)
assert_allclose(y, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_scal(dtype):
scal = _scal_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
alpha = 2.5
expected = alpha * x
scal(alpha, x)
assert_allclose(x, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_rotg(dtype):
rotg = _rotg_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
a = dtype(rng.randn())
b = dtype(rng.randn())
c, s = 0.0, 0.0
def expected_rotg(a, b):
roe = a if abs(a) > abs(b) else b
if a == 0 and b == 0:
c, s, r, z = (1, 0, 0, 0)
else:
r = np.sqrt(a**2 + b**2) * (1 if roe >= 0 else -1)
c, s = a/r, b/r
z = s if roe == a else (1 if c == 0 else 1 / c)
return r, z, c, s
expected = expected_rotg(a, b)
actual = rotg(a, b, c, s)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_rot(dtype):
rot = _rot_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(10).astype(dtype, copy=False)
c = dtype(rng.randn())
s = dtype(rng.randn())
expected_x = c * x + s * y
expected_y = c * y - s * x
rot(x, y, c, s)
assert_allclose(x, expected_x)
assert_allclose(y, expected_y)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("opA, transA",
[(_no_op, NoTrans), (np.transpose, Trans)],
ids=["NoTrans", "Trans"])
@pytest.mark.parametrize("order", [RowMajor, ColMajor],
ids=["RowMajor", "ColMajor"])
def test_gemv(dtype, opA, transA, order):
gemv = _gemv_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
A = np.asarray(opA(rng.random_sample((20, 10)).astype(dtype, copy=False)),
order=ORDER[order])
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(20).astype(dtype, copy=False)
alpha, beta = 2.5, -0.5
expected = alpha * opA(A).dot(x) + beta * y
gemv(transA, alpha, A, x, beta, y)
assert_allclose(y, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("order", [RowMajor, ColMajor],
ids=["RowMajor", "ColMajor"])
def test_ger(dtype, order):
ger = _ger_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(20).astype(dtype, copy=False)
A = np.asarray(rng.random_sample((10, 20)).astype(dtype, copy=False),
order=ORDER[order])
alpha = 2.5
expected = alpha * np.outer(x, y) + A
ger(alpha, x, y, A)
assert_allclose(A, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("opB, transB",
[(_no_op, NoTrans), (np.transpose, Trans)],
ids=["NoTrans", "Trans"])
@pytest.mark.parametrize("opA, transA",
[(_no_op, NoTrans), (np.transpose, Trans)],
ids=["NoTrans", "Trans"])
@pytest.mark.parametrize("order", [RowMajor, ColMajor],
ids=["RowMajor", "ColMajor"])
def test_gemm(dtype, opA, transA, opB, transB, order):
gemm = _gemm_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
A = np.asarray(opA(rng.random_sample((30, 10)).astype(dtype, copy=False)),
order=ORDER[order])
B = np.asarray(opB(rng.random_sample((10, 20)).astype(dtype, copy=False)),
order=ORDER[order])
C = np.asarray(rng.random_sample((30, 20)).astype(dtype, copy=False),
order=ORDER[order])
alpha, beta = 2.5, -0.5
expected = alpha * opA(A).dot(opB(B)) + beta * C
gemm(transA, transB, alpha, A, B, beta, C)
assert_allclose(C, expected, rtol=RTOL[dtype])

View file

@@ -0,0 +1,128 @@
import pytest
import types
import numpy as np
import warnings
from sklearn.dummy import DummyClassifier
from sklearn.utils import all_estimators
from sklearn.utils.estimator_checks import choose_check_classifiers_labels
from sklearn.utils.estimator_checks import NotAnArray
from sklearn.utils.estimator_checks import enforce_estimator_tags_y
from sklearn.utils.estimator_checks import is_public_parameter
from sklearn.utils.estimator_checks import pairwise_estimator_convert_X
from sklearn.utils.estimator_checks import set_checking_parameters
from sklearn.utils.optimize import newton_cg
from sklearn.utils.random import random_choice_csc
from sklearn.utils import safe_indexing
# This file tests the utils that are deprecated
# TODO: remove in 0.24
def test_choose_check_classifiers_labels_deprecated():
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
choose_check_classifiers_labels(None, None, None)
# TODO: remove in 0.24
def test_enforce_estimator_tags_y():
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
enforce_estimator_tags_y(DummyClassifier(), np.array([0, 1]))
# TODO: remove in 0.24
def test_notanarray():
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
NotAnArray([1, 2])
# TODO: remove in 0.24
def test_is_public_parameter():
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
is_public_parameter('hello')
# TODO: remove in 0.24
def test_pairwise_estimator_convert_X():
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
pairwise_estimator_convert_X([[1, 2]], DummyClassifier())
# TODO: remove in 0.24
def test_set_checking_parameters():
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
set_checking_parameters(DummyClassifier())
# TODO: remove in 0.24
def test_newton_cg():
rng = np.random.RandomState(0)
A = rng.normal(size=(10, 10))
x0 = np.ones(10)
def func(x):
Ax = A.dot(x)
return .5 * (Ax).dot(Ax)
def grad(x):
return A.T.dot(A.dot(x))
def grad_hess(x):
return grad(x), lambda x: A.T.dot(A.dot(x))
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
newton_cg(grad_hess, func, grad, x0)
# TODO: remove in 0.24
def test_random_choice_csc():
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
random_choice_csc(10, [[2]])
# TODO: remove in 0.24
def test_safe_indexing():
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
safe_indexing([1, 2], 0)
# TODO: remove in 0.24
def test_partial_dependence_no_shadowing():
# Non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/15842
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning)
from sklearn.inspection.partial_dependence import partial_dependence as _ # noqa
# Calling all_estimators() also triggers a recursive import of all
# submodules, including deprecated ones.
all_estimators()
from sklearn.inspection import partial_dependence
assert isinstance(partial_dependence, types.FunctionType)
# TODO: remove in 0.24
def test_dict_learning_no_shadowing():
# Non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/15842
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning)
from sklearn.decomposition.dict_learning import dict_learning as _ # noqa
# Calling all_estimators() also triggers a recursive import of all
# submodules, including deprecated ones.
all_estimators()
from sklearn.decomposition import dict_learning
assert isinstance(dict_learning, types.FunctionType)

View file

@@ -0,0 +1,59 @@
# Authors: Raghav RV <rvraghav93@gmail.com>
# License: BSD 3 clause
import pickle
from sklearn.utils.deprecation import _is_deprecated
from sklearn.utils.deprecation import deprecated
from sklearn.utils._testing import assert_warns_message
@deprecated('qwerty')
class MockClass1:
pass
class MockClass2:
@deprecated('mockclass2_method')
def method(self):
pass
class MockClass3:
@deprecated()
def __init__(self):
pass
class MockClass4:
pass
@deprecated()
def mock_function():
return 10
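# Editorial sketch of the mechanism under test (not sklearn's actual
# implementation): `deprecated` wraps the target and emits a FutureWarning
# each time it is called, while preserving the return value.
import functools
import warnings
def _toy_deprecated(extra=''):
    def wrap(fn):
        @functools.wraps(fn)
        def inner(*args, **kwargs):
            warnings.warn('deprecated ' + extra, FutureWarning)
            return fn(*args, **kwargs)
        return inner
    return wrap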
def test_deprecated():
assert_warns_message(FutureWarning, 'qwerty', MockClass1)
assert_warns_message(FutureWarning, 'mockclass2_method',
MockClass2().method)
assert_warns_message(FutureWarning, 'deprecated', MockClass3)
val = assert_warns_message(FutureWarning, 'deprecated',
mock_function)
assert val == 10
def test_is_deprecated():
# Test if _is_deprecated helper identifies wrapping via deprecated
# NOTE it works only for class methods and functions
assert _is_deprecated(MockClass1.__init__)
assert _is_deprecated(MockClass2().method)
assert _is_deprecated(MockClass3.__init__)
assert not _is_deprecated(MockClass4.__init__)
assert _is_deprecated(mock_function)
def test_pickle():
pickle.loads(pickle.dumps(mock_function))

View file

@@ -0,0 +1,640 @@
import unittest
import sys
import numpy as np
import scipy.sparse as sp
import joblib
from io import StringIO
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import deprecated
from sklearn.utils._testing import (assert_raises_regex,
ignore_warnings,
assert_warns, assert_raises,
SkipTest)
from sklearn.utils.estimator_checks import check_estimator, _NotAnArray
from sklearn.utils.estimator_checks \
import check_class_weight_balanced_linear_classifier
from sklearn.utils.estimator_checks import set_random_state
from sklearn.utils.estimator_checks import _set_checking_parameters
from sklearn.utils.estimator_checks import check_estimators_unfitted
from sklearn.utils.estimator_checks import check_fit_score_takes_y
from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
from sklearn.utils.estimator_checks import check_classifier_data_not_an_array
from sklearn.utils.estimator_checks import check_regressor_data_not_an_array
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.estimator_checks import check_outlier_corruption
from sklearn.utils.fixes import np_version, parse_version
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.mixture import GaussianMixture
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import NMF
from sklearn.linear_model import MultiTaskElasticNet, LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils.validation import check_array
from sklearn.utils import all_estimators
class CorrectNotFittedError(ValueError):
"""Exception class to raise if estimator is used before fitting.
Like NotFittedError, it inherits from ValueError, but not from
AttributeError. Used for testing only.
"""
class BaseBadClassifier(ClassifierMixin, BaseEstimator):
def fit(self, X, y):
return self
def predict(self, X):
return np.ones(X.shape[0])
class ChangesDict(BaseEstimator):
def __init__(self, key=0):
self.key = key
def fit(self, X, y=None):
X, y = self._validate_data(X, y)
return self
def predict(self, X):
X = check_array(X)
self.key = 1000
return np.ones(X.shape[0])
class SetsWrongAttribute(BaseEstimator):
def __init__(self, acceptable_key=0):
self.acceptable_key = acceptable_key
def fit(self, X, y=None):
self.wrong_attribute = 0
X, y = self._validate_data(X, y)
return self
class ChangesWrongAttribute(BaseEstimator):
def __init__(self, wrong_attribute=0):
self.wrong_attribute = wrong_attribute
def fit(self, X, y=None):
self.wrong_attribute = 1
X, y = self._validate_data(X, y)
return self
class ChangesUnderscoreAttribute(BaseEstimator):
def fit(self, X, y=None):
self._good_attribute = 1
X, y = self._validate_data(X, y)
return self
class RaisesErrorInSetParams(BaseEstimator):
def __init__(self, p=0):
self.p = p
def set_params(self, **kwargs):
if 'p' in kwargs:
p = kwargs.pop('p')
if p < 0:
raise ValueError("p can't be less than 0")
self.p = p
return super().set_params(**kwargs)
def fit(self, X, y=None):
X, y = self._validate_data(X, y)
return self
class ModifiesValueInsteadOfRaisingError(BaseEstimator):
def __init__(self, p=0):
self.p = p
def set_params(self, **kwargs):
if 'p' in kwargs:
p = kwargs.pop('p')
if p < 0:
p = 0
self.p = p
return super().set_params(**kwargs)
def fit(self, X, y=None):
X, y = self._validate_data(X, y)
return self
class ModifiesAnotherValue(BaseEstimator):
def __init__(self, a=0, b='method1'):
self.a = a
self.b = b
def set_params(self, **kwargs):
if 'a' in kwargs:
a = kwargs.pop('a')
self.a = a
if a is None:
kwargs.pop('b')
self.b = 'method2'
return super().set_params(**kwargs)
def fit(self, X, y=None):
X, y = self._validate_data(X, y)
return self
class NoCheckinPredict(BaseBadClassifier):
def fit(self, X, y):
X, y = self._validate_data(X, y)
return self
class NoSparseClassifier(BaseBadClassifier):
def fit(self, X, y):
X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'])
if sp.issparse(X):
raise ValueError("Nonsensical Error")
return self
def predict(self, X):
X = check_array(X)
return np.ones(X.shape[0])
class CorrectNotFittedErrorClassifier(BaseBadClassifier):
def fit(self, X, y):
X, y = self._validate_data(X, y)
self.coef_ = np.ones(X.shape[1])
return self
def predict(self, X):
check_is_fitted(self)
X = check_array(X)
return np.ones(X.shape[0])
class NoSampleWeightPandasSeriesType(BaseEstimator):
def fit(self, X, y, sample_weight=None):
# Convert data
X, y = self._validate_data(
X, y,
accept_sparse=("csr", "csc"),
multi_output=True,
y_numeric=True)
# Function is only called after we verify that pandas is installed
from pandas import Series
if isinstance(sample_weight, Series):
raise ValueError("Estimator does not accept 'sample_weight'"
"of type pandas.Series")
return self
def predict(self, X):
X = check_array(X)
return np.ones(X.shape[0])
class BadBalancedWeightsClassifier(BaseBadClassifier):
def __init__(self, class_weight=None):
self.class_weight = class_weight
def fit(self, X, y):
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import compute_class_weight
label_encoder = LabelEncoder().fit(y)
classes = label_encoder.classes_
class_weight = compute_class_weight(self.class_weight, classes=classes,
y=y)
# Intentionally modify the balanced class_weight
# to simulate a bug and raise an exception
if self.class_weight == "balanced":
class_weight += 1.
# Simply assigning coef_ to the class_weight
self.coef_ = class_weight
return self
class BadTransformerWithoutMixin(BaseEstimator):
def fit(self, X, y=None):
X = self._validate_data(X)
return self
def transform(self, X):
X = check_array(X)
return X
class NotInvariantPredict(BaseEstimator):
def fit(self, X, y):
# Convert data
X, y = self._validate_data(
X, y,
accept_sparse=("csr", "csc"),
multi_output=True,
y_numeric=True)
return self
def predict(self, X):
        # return ones if X has more than one sample, otherwise zeros
X = check_array(X)
if X.shape[0] > 1:
return np.ones(X.shape[0])
return np.zeros(X.shape[0])
class LargeSparseNotSupportedClassifier(BaseEstimator):
def fit(self, X, y):
X, y = self._validate_data(
X, y,
accept_sparse=("csr", "csc", "coo"),
accept_large_sparse=True,
multi_output=True,
y_numeric=True)
if sp.issparse(X):
if X.getformat() == "coo":
if X.row.dtype == "int64" or X.col.dtype == "int64":
raise ValueError(
"Estimator doesn't support 64-bit indices")
elif X.getformat() in ["csc", "csr"]:
assert "int64" not in (X.indices.dtype, X.indptr.dtype),\
"Estimator doesn't support 64-bit indices"
return self
class SparseTransformer(BaseEstimator):
def fit(self, X, y=None):
self.X_shape_ = self._validate_data(X).shape
return self
def fit_transform(self, X, y=None):
return self.fit(X, y).transform(X)
def transform(self, X):
X = check_array(X)
if X.shape[1] != self.X_shape_[1]:
raise ValueError('Bad number of features')
return sp.csr_matrix(X)
class EstimatorInconsistentForPandas(BaseEstimator):
def fit(self, X, y):
try:
from pandas import DataFrame
if isinstance(X, DataFrame):
self.value_ = X.iloc[0, 0]
else:
X = check_array(X)
self.value_ = X[1, 0]
return self
except ImportError:
X = check_array(X)
self.value_ = X[1, 0]
return self
def predict(self, X):
X = check_array(X)
return np.array([self.value_] * X.shape[0])
class UntaggedBinaryClassifier(SGDClassifier):
    # Toy classifier that only supports binary classification; will fail tests.
def fit(self, X, y, coef_init=None, intercept_init=None,
sample_weight=None):
super().fit(X, y, coef_init, intercept_init, sample_weight)
if len(self.classes_) > 2:
raise ValueError('Only 2 classes are supported')
return self
def partial_fit(self, X, y, classes=None, sample_weight=None):
super().partial_fit(X=X, y=y, classes=classes,
sample_weight=sample_weight)
if len(self.classes_) > 2:
raise ValueError('Only 2 classes are supported')
return self
class TaggedBinaryClassifier(UntaggedBinaryClassifier):
# Toy classifier that only supports binary classification.
def _more_tags(self):
return {'binary_only': True}
class RequiresPositiveYRegressor(LinearRegression):
def fit(self, X, y):
X, y = self._validate_data(X, y, multi_output=True)
if (y <= 0).any():
raise ValueError('negative y values not supported!')
return super().fit(X, y)
def _more_tags(self):
return {"requires_positive_y": True}
def test_not_an_array_array_function():
if np_version < parse_version('1.17'):
raise SkipTest("array_function protocol not supported in numpy <1.17")
not_array = _NotAnArray(np.ones(10))
msg = "Don't want to call array_function sum!"
assert_raises_regex(TypeError, msg, np.sum, not_array)
# always returns True
assert np.may_share_memory(not_array, None)
def test_check_fit_score_takes_y_works_on_deprecated_fit():
# Tests that check_fit_score_takes_y works on a class with
# a deprecated fit method
class TestEstimatorWithDeprecatedFitMethod(BaseEstimator):
@deprecated("Deprecated for the purpose of testing "
"check_fit_score_takes_y")
def fit(self, X, y):
return self
check_fit_score_takes_y("test", TestEstimatorWithDeprecatedFitMethod())
@ignore_warnings("Passing a class is depr", category=FutureWarning) # 0.24
def test_check_estimator():
# tests that the estimator actually fails on "bad" estimators.
# not a complete test of all checks, which are very extensive.
# check that we have a set_params and can clone
msg = "it does not implement a 'get_params' method"
assert_raises_regex(TypeError, msg, check_estimator, object)
msg = "object has no attribute '_get_tags'"
assert_raises_regex(AttributeError, msg, check_estimator, object())
# check that values returned by get_params match set_params
msg = "get_params result does not match what was passed to set_params"
assert_raises_regex(AssertionError, msg, check_estimator,
ModifiesValueInsteadOfRaisingError())
assert_warns(UserWarning, check_estimator, RaisesErrorInSetParams())
assert_raises_regex(AssertionError, msg, check_estimator,
ModifiesAnotherValue())
# check that we have a fit method
msg = "object has no attribute 'fit'"
assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator)
assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator())
# check that fit does input validation
msg = "ValueError not raised"
assert_raises_regex(AssertionError, msg, check_estimator,
BaseBadClassifier)
assert_raises_regex(AssertionError, msg, check_estimator,
BaseBadClassifier())
    # check that sample_weight in fit accepts pandas.Series type
try:
from pandas import Series # noqa
msg = ("Estimator NoSampleWeightPandasSeriesType raises error if "
"'sample_weight' parameter is of type pandas.Series")
assert_raises_regex(
ValueError, msg, check_estimator, NoSampleWeightPandasSeriesType)
except ImportError:
pass
# check that predict does input validation (doesn't accept dicts in input)
msg = "Estimator doesn't check for NaN and inf in predict"
assert_raises_regex(AssertionError, msg, check_estimator, NoCheckinPredict)
assert_raises_regex(AssertionError, msg, check_estimator,
NoCheckinPredict())
# check that estimator state does not change
# at transform/predict/predict_proba time
msg = 'Estimator changes __dict__ during predict'
assert_raises_regex(AssertionError, msg, check_estimator, ChangesDict)
    # check that `fit` only changes attributes that
    # are private (start with an _ or end with a _).
msg = ('Estimator ChangesWrongAttribute should not change or mutate '
'the parameter wrong_attribute from 0 to 1 during fit.')
assert_raises_regex(AssertionError, msg,
check_estimator, ChangesWrongAttribute)
check_estimator(ChangesUnderscoreAttribute)
# check that `fit` doesn't add any public attribute
msg = (r'Estimator adds public attribute\(s\) during the fit method.'
' Estimators are only allowed to add private attributes'
' either started with _ or ended'
' with _ but wrong_attribute added')
assert_raises_regex(AssertionError, msg,
check_estimator, SetsWrongAttribute)
# check for invariant method
name = NotInvariantPredict.__name__
method = 'predict'
msg = ("{method} of {name} is not invariant when applied "
"to a subset.").format(method=method, name=name)
assert_raises_regex(AssertionError, msg,
check_estimator, NotInvariantPredict)
# check for sparse matrix input handling
name = NoSparseClassifier.__name__
msg = "Estimator %s doesn't seem to fail gracefully on sparse data" % name
# the check for sparse input handling prints to the stdout,
# instead of raising an error, so as not to remove the original traceback.
# that means we need to jump through some hoops to catch it.
old_stdout = sys.stdout
string_buffer = StringIO()
sys.stdout = string_buffer
try:
check_estimator(NoSparseClassifier)
    except Exception:
pass
finally:
sys.stdout = old_stdout
assert msg in string_buffer.getvalue()
# Large indices test on bad estimator
msg = ('Estimator LargeSparseNotSupportedClassifier doesn\'t seem to '
r'support \S{3}_64 matrix, and is not failing gracefully.*')
assert_raises_regex(AssertionError, msg, check_estimator,
LargeSparseNotSupportedClassifier)
# does error on binary_only untagged estimator
msg = 'Only 2 classes are supported'
assert_raises_regex(ValueError, msg, check_estimator,
UntaggedBinaryClassifier)
# non-regression test for estimators transforming to sparse data
check_estimator(SparseTransformer())
# doesn't error on actual estimator
check_estimator(LogisticRegression)
check_estimator(LogisticRegression(C=0.01))
check_estimator(MultiTaskElasticNet)
check_estimator(MultiTaskElasticNet())
# doesn't error on binary_only tagged estimator
check_estimator(TaggedBinaryClassifier)
# Check regressor with requires_positive_y estimator tag
msg = 'negative y values not supported!'
assert_raises_regex(ValueError, msg, check_estimator,
RequiresPositiveYRegressor)
def test_check_outlier_corruption():
# should raise AssertionError
decision = np.array([0., 1., 1.5, 2.])
assert_raises(AssertionError, check_outlier_corruption, 1, 2, decision)
# should pass
decision = np.array([0., 1., 1., 2.])
check_outlier_corruption(1, 2, decision)
def test_check_estimator_transformer_no_mixin():
# check that TransformerMixin is not required for transformer tests to run
assert_raises_regex(AttributeError, '.*fit_transform.*',
check_estimator, BadTransformerWithoutMixin())
def test_check_estimator_clones():
# check that check_estimator doesn't modify the estimator it receives
from sklearn.datasets import load_iris
iris = load_iris()
for Estimator in [GaussianMixture, LinearRegression,
RandomForestClassifier, NMF, SGDClassifier,
MiniBatchKMeans]:
with ignore_warnings(category=FutureWarning):
# when 'est = SGDClassifier()'
est = Estimator()
_set_checking_parameters(est)
set_random_state(est)
# without fitting
old_hash = joblib.hash(est)
check_estimator(est)
assert old_hash == joblib.hash(est)
with ignore_warnings(category=FutureWarning):
# when 'est = SGDClassifier()'
est = Estimator()
_set_checking_parameters(est)
set_random_state(est)
# with fitting
est.fit(iris.data + 10, iris.target)
old_hash = joblib.hash(est)
check_estimator(est)
assert old_hash == joblib.hash(est)
def test_check_estimators_unfitted():
# check that a ValueError/AttributeError is raised when calling predict
# on an unfitted estimator
msg = "NotFittedError not raised by predict"
assert_raises_regex(AssertionError, msg, check_estimators_unfitted,
"estimator", NoSparseClassifier())
# check that CorrectNotFittedError inherit from either ValueError
# or AttributeError
check_estimators_unfitted("estimator", CorrectNotFittedErrorClassifier())
def test_check_no_attributes_set_in_init():
class NonConformantEstimatorPrivateSet(BaseEstimator):
def __init__(self):
self.you_should_not_set_this_ = None
class NonConformantEstimatorNoParamSet(BaseEstimator):
def __init__(self, you_should_set_this_=None):
pass
assert_raises_regex(AssertionError,
"Estimator estimator_name should not set any"
" attribute apart from parameters during init."
r" Found attributes \['you_should_not_set_this_'\].",
check_no_attributes_set_in_init,
'estimator_name',
NonConformantEstimatorPrivateSet())
assert_raises_regex(AssertionError,
"Estimator estimator_name should store all "
"parameters as an attribute during init. "
"Did not find attributes "
r"\['you_should_set_this_'\].",
check_no_attributes_set_in_init,
'estimator_name',
NonConformantEstimatorNoParamSet())
def test_check_estimator_pairwise():
# check that check_estimator() works on estimator with _pairwise
# kernel or metric
# test precomputed kernel
est = SVC(kernel='precomputed')
check_estimator(est)
# test precomputed metric
est = KNeighborsRegressor(metric='precomputed')
check_estimator(est)
def test_check_classifier_data_not_an_array():
assert_raises_regex(AssertionError,
'Not equal to tolerance',
check_classifier_data_not_an_array,
'estimator_name',
EstimatorInconsistentForPandas())
def test_check_regressor_data_not_an_array():
assert_raises_regex(AssertionError,
'Not equal to tolerance',
check_regressor_data_not_an_array,
'estimator_name',
EstimatorInconsistentForPandas())
@ignore_warnings("Passing a class is depr", category=FutureWarning) # 0.24
def test_check_estimator_required_parameters_skip():
    # TODO: remove whole test in 0.24 since passing classes to
    # check_estimator() isn't supported anymore
class MyEstimator(BaseEstimator):
_required_parameters = ["special_parameter"]
def __init__(self, special_parameter):
self.special_parameter = special_parameter
assert_raises_regex(SkipTest, r"Can't instantiate estimator MyEstimator "
r"which requires parameters "
r"\['special_parameter'\]",
check_estimator, MyEstimator)
def run_tests_without_pytest():
"""Runs the tests in this file without using pytest.
"""
main_module = sys.modules['__main__']
test_functions = [getattr(main_module, name) for name in dir(main_module)
if name.startswith('test_')]
test_cases = [unittest.FunctionTestCase(fn) for fn in test_functions]
suite = unittest.TestSuite()
suite.addTests(test_cases)
runner = unittest.TextTestRunner()
runner.run(suite)
def test_check_class_weight_balanced_linear_classifier():
# check that ill-computed balanced weights raises an exception
assert_raises_regex(AssertionError,
"Classifier estimator_name is not computing"
" class_weight=balanced properly.",
check_class_weight_balanced_linear_classifier,
'estimator_name',
BadBalancedWeightsClassifier)
def test_all_estimators_all_public():
    # all_estimators should not fail when pytest is not installed and should
    # return only public estimators
estimators = all_estimators()
    for name, _ in estimators:
        assert not name.startswith("_")
if __name__ == '__main__':
# This module is run as a script to check that we have no dependency on
# pytest for estimator checks.
run_tests_without_pytest()

View file

@@ -0,0 +1,267 @@
from contextlib import closing
from io import StringIO
import pytest
from sklearn import config_context
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectPercentile
from sklearn.cluster import Birch
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import LinearSVC
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import StackingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.utils._estimator_html_repr import _write_label_html
from sklearn.utils._estimator_html_repr import _get_visual_block
from sklearn.utils._estimator_html_repr import estimator_html_repr
@pytest.mark.parametrize("checked", [True, False])
def test_write_label_html(checked):
# Test checking logic and labeling
name = "LogisticRegression"
tool_tip = "hello-world"
with closing(StringIO()) as out:
_write_label_html(out, name, tool_tip, checked=checked)
html_label = out.getvalue()
assert 'LogisticRegression</label>' in html_label
assert html_label.startswith('<div class="sk-label-container">')
assert '<pre>hello-world</pre>' in html_label
if checked:
assert 'checked>' in html_label
@pytest.mark.parametrize('est', ['passthrough', 'drop', None])
def test_get_visual_block_single_str_none(est):
    # Test estimators that are represented by strings
est_html_info = _get_visual_block(est)
assert est_html_info.kind == 'single'
assert est_html_info.estimators == est
assert est_html_info.names == str(est)
assert est_html_info.name_details == str(est)
def test_get_visual_block_single_estimator():
est = LogisticRegression(C=10.0)
est_html_info = _get_visual_block(est)
assert est_html_info.kind == 'single'
assert est_html_info.estimators == est
assert est_html_info.names == est.__class__.__name__
assert est_html_info.name_details == str(est)
def test_get_visual_block_pipeline():
pipe = Pipeline([
('imputer', SimpleImputer()),
('do_nothing', 'passthrough'),
('do_nothing_more', None),
('classifier', LogisticRegression())
])
est_html_info = _get_visual_block(pipe)
assert est_html_info.kind == 'serial'
assert est_html_info.estimators == tuple(step[1] for step in pipe.steps)
assert est_html_info.names == ['imputer: SimpleImputer',
'do_nothing: passthrough',
'do_nothing_more: passthrough',
'classifier: LogisticRegression']
assert est_html_info.name_details == [str(est) for _, est in pipe.steps]
def test_get_visual_block_feature_union():
f_union = FeatureUnion([
('pca', PCA()), ('svd', TruncatedSVD())
])
est_html_info = _get_visual_block(f_union)
assert est_html_info.kind == 'parallel'
assert est_html_info.names == ('pca', 'svd')
assert est_html_info.estimators == tuple(
trans[1] for trans in f_union.transformer_list)
assert est_html_info.name_details == (None, None)
def test_get_visual_block_voting():
clf = VotingClassifier([
('log_reg', LogisticRegression()),
('mlp', MLPClassifier())
])
est_html_info = _get_visual_block(clf)
assert est_html_info.kind == 'parallel'
assert est_html_info.estimators == tuple(trans[1]
for trans in clf.estimators)
assert est_html_info.names == ('log_reg', 'mlp')
assert est_html_info.name_details == (None, None)
def test_get_visual_block_column_transformer():
ct = ColumnTransformer([
('pca', PCA(), ['num1', 'num2']),
('svd', TruncatedSVD, [0, 3])
])
est_html_info = _get_visual_block(ct)
assert est_html_info.kind == 'parallel'
assert est_html_info.estimators == tuple(
trans[1] for trans in ct.transformers)
assert est_html_info.names == ('pca', 'svd')
assert est_html_info.name_details == (['num1', 'num2'], [0, 3])
def test_estimator_html_repr_pipeline():
num_trans = Pipeline(steps=[
('pass', 'passthrough'),
('imputer', SimpleImputer(strategy='median'))
])
cat_trans = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant',
missing_values='empty')),
('one-hot', OneHotEncoder(drop='first'))
])
preprocess = ColumnTransformer([
('num', num_trans, ['a', 'b', 'c', 'd', 'e']),
('cat', cat_trans, [0, 1, 2, 3])
])
feat_u = FeatureUnion([
('pca', PCA(n_components=1)),
('tsvd', Pipeline([('first', TruncatedSVD(n_components=3)),
('select', SelectPercentile())]))
])
clf = VotingClassifier([
('lr', LogisticRegression(solver='lbfgs', random_state=1)),
('mlp', MLPClassifier(alpha=0.001))
])
pipe = Pipeline([
('preprocessor', preprocess), ('feat_u', feat_u), ('classifier', clf)
])
html_output = estimator_html_repr(pipe)
# top level estimators show estimator with changes
assert str(pipe) in html_output
for _, est in pipe.steps:
assert (f"<div class=\"sk-toggleable__content\">"
f"<pre>{str(est)}") in html_output
# low level estimators do not show changes
with config_context(print_changed_only=True):
assert str(num_trans['pass']) in html_output
assert 'passthrough</label>' in html_output
assert str(num_trans['imputer']) in html_output
for _, _, cols in preprocess.transformers:
assert f"<pre>{cols}</pre>" in html_output
# feature union
for name, _ in feat_u.transformer_list:
assert f"<label>{name}</label>" in html_output
pca = feat_u.transformer_list[0][1]
assert f"<pre>{str(pca)}</pre>" in html_output
tsvd = feat_u.transformer_list[1][1]
first = tsvd['first']
select = tsvd['select']
assert f"<pre>{str(first)}</pre>" in html_output
assert f"<pre>{str(select)}</pre>" in html_output
    # voting classifier
for name, est in clf.estimators:
assert f"<label>{name}</label>" in html_output
assert f"<pre>{str(est)}</pre>" in html_output
@pytest.mark.parametrize("final_estimator", [None, LinearSVC()])
def test_stacking_classifier(final_estimator):
estimators = [('mlp', MLPClassifier(alpha=0.001)),
('tree', DecisionTreeClassifier())]
clf = StackingClassifier(
estimators=estimators, final_estimator=final_estimator)
html_output = estimator_html_repr(clf)
assert str(clf) in html_output
# If final_estimator's default changes from LogisticRegression
# this should be updated
if final_estimator is None:
assert "LogisticRegression(" in html_output
else:
assert final_estimator.__class__.__name__ in html_output
@pytest.mark.parametrize("final_estimator", [None, LinearSVR()])
def test_stacking_regressor(final_estimator):
reg = StackingRegressor(
estimators=[('svr', LinearSVR())], final_estimator=final_estimator)
html_output = estimator_html_repr(reg)
assert str(reg.estimators[0][0]) in html_output
assert "LinearSVR</label>" in html_output
if final_estimator is None:
assert "RidgeCV</label>" in html_output
else:
assert final_estimator.__class__.__name__ in html_output
def test_birch_duck_typing_meta():
# Test duck typing meta estimators with Birch
birch = Birch(n_clusters=AgglomerativeClustering(n_clusters=3))
html_output = estimator_html_repr(birch)
# inner estimators do not show changes
with config_context(print_changed_only=True):
assert f"<pre>{str(birch.n_clusters)}" in html_output
assert "AgglomerativeClustering</label>" in html_output
# outer estimator contains all changes
assert f"<pre>{str(birch)}" in html_output
def test_ovo_classifier_duck_typing_meta():
# Test duck typing metaestimators with OVO
ovo = OneVsOneClassifier(LinearSVC(penalty='l1'))
html_output = estimator_html_repr(ovo)
# inner estimators do not show changes
with config_context(print_changed_only=True):
assert f"<pre>{str(ovo.estimator)}" in html_output
assert "LinearSVC</label>" in html_output
    # outer estimator
assert f"<pre>{str(ovo)}" in html_output
def test_duck_typing_nested_estimator():
# Test duck typing metaestimators with GP
kernel = RationalQuadratic(length_scale=1.0, alpha=0.1)
gp = GaussianProcessRegressor(kernel=kernel)
html_output = estimator_html_repr(gp)
assert f"<pre>{str(kernel)}" in html_output
assert f"<pre>{str(gp)}" in html_output
@pytest.mark.parametrize('print_changed_only', [True, False])
def test_one_estimator_print_change_only(print_changed_only):
pca = PCA(n_components=10)
with config_context(print_changed_only=print_changed_only):
pca_repr = str(pca)
html_output = estimator_html_repr(pca)
assert pca_repr in html_output

View file

@@ -0,0 +1,722 @@
# Authors: Olivier Grisel <olivier.grisel@ensta.org>
# Mathieu Blondel <mathieu@mblondel.org>
# Denis Engemann <denis-alexander.engemann@inria.fr>
#
# License: BSD 3 clause
import numpy as np
from scipy import sparse
from scipy import linalg
from scipy import stats
from scipy.special import expit
import pytest
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_warns
from sklearn.utils._testing import assert_warns_message
from sklearn.utils._testing import skip_if_32bit
from sklearn.utils.extmath import density
from sklearn.utils.extmath import randomized_svd
from sklearn.utils.extmath import row_norms
from sklearn.utils.extmath import weighted_mode
from sklearn.utils.extmath import cartesian
from sklearn.utils.extmath import log_logistic
from sklearn.utils.extmath import svd_flip
from sklearn.utils.extmath import _incremental_mean_and_var
from sklearn.utils.extmath import _deterministic_vector_sign_flip
from sklearn.utils.extmath import softmax
from sklearn.utils.extmath import stable_cumsum
from sklearn.utils.extmath import safe_min
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.datasets import make_low_rank_matrix
def test_density():
rng = np.random.RandomState(0)
X = rng.randint(10, size=(10, 5))
X[1, 2] = 0
X[5, 3] = 0
X_csr = sparse.csr_matrix(X)
X_csc = sparse.csc_matrix(X)
X_coo = sparse.coo_matrix(X)
X_lil = sparse.lil_matrix(X)
for X_ in (X_csr, X_csc, X_coo, X_lil):
assert density(X_) == density(X)
def test_uniform_weights():
# with uniform weights, results should be identical to stats.mode
rng = np.random.RandomState(0)
x = rng.randint(10, size=(10, 5))
weights = np.ones(x.shape)
for axis in (None, 0, 1):
mode, score = stats.mode(x, axis)
mode2, score2 = weighted_mode(x, weights, axis=axis)
assert_array_equal(mode, mode2)
assert_array_equal(score, score2)
def test_random_weights():
# set this up so that each row should have a weighted mode of 6,
# with a score that is easily reproduced
mode_result = 6
rng = np.random.RandomState(0)
x = rng.randint(mode_result, size=(100, 10))
w = rng.random_sample(x.shape)
x[:, :5] = mode_result
w[:, :5] += 1
mode, score = weighted_mode(x, w, axis=1)
assert_array_equal(mode, mode_result)
assert_array_almost_equal(score.ravel(), w[:, :5].sum(1))
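# Editorial sketch: weighted_mode generalizes the mode by summing weights
# per value instead of counting occurrences. A one-row version of that
# reduction in plain NumPy, mirroring the setup above:
def test_weighted_mode_reduction_sketch():
    row = np.array([6, 6, 6, 6, 6, 1, 2, 3, 4, 5])
    w = np.ones_like(row, dtype=float)
    w[:5] += 1.0                      # boost the weight of the 6s
    totals = np.zeros(row.max() + 1)
    np.add.at(totals, row, w)         # sum weights per value
    assert totals.argmax() == 6 and totals.max() == 10.0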
def check_randomized_svd_low_rank(dtype):
# Check that extmath.randomized_svd is consistent with linalg.svd
n_samples = 100
n_features = 500
rank = 5
k = 10
decimal = 5 if dtype == np.float32 else 7
dtype = np.dtype(dtype)
# generate a matrix X of approximate effective rank `rank` and no noise
# component (very structured signal):
X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
effective_rank=rank, tail_strength=0.0,
random_state=0).astype(dtype, copy=False)
assert X.shape == (n_samples, n_features)
# compute the singular values of X using the slow exact method
U, s, V = linalg.svd(X, full_matrices=False)
# Convert the singular values to the specific dtype
U = U.astype(dtype, copy=False)
s = s.astype(dtype, copy=False)
V = V.astype(dtype, copy=False)
for normalizer in ['auto', 'LU', 'QR']: # 'none' would not be stable
# compute the singular values of X using the fast approximate method
Ua, sa, Va = randomized_svd(
X, k, power_iteration_normalizer=normalizer, random_state=0)
# If the input dtype is float, then the output dtype is float of the
# same bit size (f32 is not upcast to f64)
# But if the input dtype is int, the output dtype is float64
if dtype.kind == 'f':
assert Ua.dtype == dtype
assert sa.dtype == dtype
assert Va.dtype == dtype
else:
assert Ua.dtype == np.float64
assert sa.dtype == np.float64
assert Va.dtype == np.float64
assert Ua.shape == (n_samples, k)
assert sa.shape == (k,)
assert Va.shape == (k, n_features)
# ensure that the singular values of both methods are equal up to the
# real rank of the matrix
assert_almost_equal(s[:k], sa, decimal=decimal)
# check the singular vectors too (while not checking the sign)
assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va),
decimal=decimal)
# check the sparse matrix representation
X = sparse.csr_matrix(X)
# compute the singular values of X using the fast approximate method
Ua, sa, Va = \
randomized_svd(X, k, power_iteration_normalizer=normalizer,
random_state=0)
if dtype.kind == 'f':
assert Ua.dtype == dtype
assert sa.dtype == dtype
assert Va.dtype == dtype
else:
assert Ua.dtype.kind == 'f'
assert sa.dtype.kind == 'f'
assert Va.dtype.kind == 'f'
assert_almost_equal(s[:rank], sa[:rank], decimal=decimal)
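# Editorial sketch of the idea behind randomized_svd (no power iterations
# or normalizers, unlike the real implementation): project X onto a random
# low-dimensional subspace, orthonormalize the projection, then run an
# exact SVD on the small projected matrix.
def _tiny_randomized_svd(X, k, rng):
    Q, _ = linalg.qr(X @ rng.normal(size=(X.shape[1], k)), mode='economic')
    Ub, s, Vt = linalg.svd(Q.T @ X, full_matrices=False)
    return Q @ Ub, s, Vt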
@pytest.mark.parametrize('dtype',
(np.int32, np.int64, np.float32, np.float64))
def test_randomized_svd_low_rank_all_dtypes(dtype):
check_randomized_svd_low_rank(dtype)
@pytest.mark.parametrize('dtype',
(np.float32, np.float64))
def test_row_norms(dtype):
X = np.random.RandomState(42).randn(100, 100)
if dtype is np.float32:
precision = 4
else:
precision = 5
X = X.astype(dtype, copy=False)
sq_norm = (X ** 2).sum(axis=1)
assert_array_almost_equal(sq_norm, row_norms(X, squared=True),
precision)
assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision)
for csr_index_dtype in [np.int32, np.int64]:
Xcsr = sparse.csr_matrix(X, dtype=dtype)
# csr_matrix will use int32 indices by default,
# up-casting those to int64 when necessary
if csr_index_dtype is np.int64:
Xcsr.indptr = Xcsr.indptr.astype(csr_index_dtype, copy=False)
Xcsr.indices = Xcsr.indices.astype(csr_index_dtype, copy=False)
assert Xcsr.indices.dtype == csr_index_dtype
assert Xcsr.indptr.dtype == csr_index_dtype
assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True),
precision)
assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr),
precision)
def test_randomized_svd_low_rank_with_noise():
# Check that extmath.randomized_svd can handle noisy matrices
n_samples = 100
n_features = 500
rank = 5
k = 10
    # generate a matrix X with structured approximate rank `rank` and an
    # important noisy component
X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
effective_rank=rank, tail_strength=0.1,
random_state=0)
assert X.shape == (n_samples, n_features)
# compute the singular values of X using the slow exact method
_, s, _ = linalg.svd(X, full_matrices=False)
for normalizer in ['auto', 'none', 'LU', 'QR']:
# compute the singular values of X using the fast approximate
# method without the iterated power method
_, sa, _ = randomized_svd(X, k, n_iter=0,
power_iteration_normalizer=normalizer,
random_state=0)
# the approximation does not tolerate the noise:
assert np.abs(s[:k] - sa).max() > 0.01
# compute the singular values of X using the fast approximate
# method with iterated power method
_, sap, _ = randomized_svd(X, k,
power_iteration_normalizer=normalizer,
random_state=0)
        # the iterated power method helps to get rid of the noise:
assert_almost_equal(s[:k], sap, decimal=3)
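# Editorial note: each power iteration effectively replaces X by (X X^T) X,
# so after q iterations every singular value sigma_i is raised to
# sigma_i^(2q + 1). This sharpens the spectral gap, which is why `sap`
# (with power iterations) matches the exact values while `sa` (n_iter=0)
# does not.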
def test_randomized_svd_infinite_rank():
# Check that extmath.randomized_svd can handle noisy matrices
n_samples = 100
n_features = 500
rank = 5
k = 10
    # let us try again without a low-rank component: just regularly but slowly
# decreasing singular values: the rank of the data matrix is infinite
X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
effective_rank=rank, tail_strength=1.0,
random_state=0)
assert X.shape == (n_samples, n_features)
# compute the singular values of X using the slow exact method
_, s, _ = linalg.svd(X, full_matrices=False)
for normalizer in ['auto', 'none', 'LU', 'QR']:
# compute the singular values of X using the fast approximate method
# without the iterated power method
_, sa, _ = randomized_svd(X, k, n_iter=0,
power_iteration_normalizer=normalizer)
# the approximation does not tolerate the noise:
assert np.abs(s[:k] - sa).max() > 0.1
# compute the singular values of X using the fast approximate method
# with iterated power method
_, sap, _ = randomized_svd(X, k, n_iter=5,
power_iteration_normalizer=normalizer)
        # the iterated power method still manages to recover most of the
        # structure at the requested rank
assert_almost_equal(s[:k], sap, decimal=3)
def test_randomized_svd_transpose_consistency():
# Check that transposing the design matrix has limited impact
n_samples = 100
n_features = 500
rank = 4
k = 10
X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
effective_rank=rank, tail_strength=0.5,
random_state=0)
assert X.shape == (n_samples, n_features)
U1, s1, V1 = randomized_svd(X, k, n_iter=3, transpose=False,
random_state=0)
U2, s2, V2 = randomized_svd(X, k, n_iter=3, transpose=True,
random_state=0)
U3, s3, V3 = randomized_svd(X, k, n_iter=3, transpose='auto',
random_state=0)
U4, s4, V4 = linalg.svd(X, full_matrices=False)
assert_almost_equal(s1, s4[:k], decimal=3)
assert_almost_equal(s2, s4[:k], decimal=3)
assert_almost_equal(s3, s4[:k], decimal=3)
assert_almost_equal(np.dot(U1, V1), np.dot(U4[:, :k], V4[:k, :]),
decimal=2)
assert_almost_equal(np.dot(U2, V2), np.dot(U4[:, :k], V4[:k, :]),
decimal=2)
# in this case 'auto' is equivalent to transpose
assert_almost_equal(s2, s3)
def test_randomized_svd_power_iteration_normalizer():
    # randomized_svd with power_iteration_normalizer='none' diverges for
    # a large number of power iterations on this dataset
rng = np.random.RandomState(42)
X = make_low_rank_matrix(100, 500, effective_rank=50, random_state=rng)
X += 3 * rng.randint(0, 2, size=X.shape)
n_components = 50
# Check that it diverges with many (non-normalized) power iterations
U, s, V = randomized_svd(X, n_components, n_iter=2,
power_iteration_normalizer='none')
A = X - U.dot(np.diag(s).dot(V))
error_2 = linalg.norm(A, ord='fro')
U, s, V = randomized_svd(X, n_components, n_iter=20,
power_iteration_normalizer='none')
A = X - U.dot(np.diag(s).dot(V))
error_20 = linalg.norm(A, ord='fro')
assert np.abs(error_2 - error_20) > 100
for normalizer in ['LU', 'QR', 'auto']:
U, s, V = randomized_svd(X, n_components, n_iter=2,
power_iteration_normalizer=normalizer,
random_state=0)
A = X - U.dot(np.diag(s).dot(V))
error_2 = linalg.norm(A, ord='fro')
for i in [5, 10, 50]:
U, s, V = randomized_svd(X, n_components, n_iter=i,
power_iteration_normalizer=normalizer,
random_state=0)
A = X - U.dot(np.diag(s).dot(V))
error = linalg.norm(A, ord='fro')
assert 15 > np.abs(error_2 - error)
def test_randomized_svd_sparse_warnings():
    # randomized_svd throws a warning for lil and dok matrices
rng = np.random.RandomState(42)
X = make_low_rank_matrix(50, 20, effective_rank=10, random_state=rng)
n_components = 5
for cls in (sparse.lil_matrix, sparse.dok_matrix):
X = cls(X)
assert_warns_message(
sparse.SparseEfficiencyWarning,
"Calculating SVD of a {} is expensive. "
"csr_matrix is more efficient.".format(cls.__name__),
randomized_svd, X, n_components, n_iter=1,
power_iteration_normalizer='none')
def test_svd_flip():
# Check that svd_flip works in both situations, and reconstructs input.
rs = np.random.RandomState(1999)
n_samples = 20
n_features = 10
X = rs.randn(n_samples, n_features)
# Check matrix reconstruction
U, S, V = linalg.svd(X, full_matrices=False)
U1, V1 = svd_flip(U, V, u_based_decision=False)
assert_almost_equal(np.dot(U1 * S, V1), X, decimal=6)
# Check transposed matrix reconstruction
XT = X.T
U, S, V = linalg.svd(XT, full_matrices=False)
U2, V2 = svd_flip(U, V, u_based_decision=True)
assert_almost_equal(np.dot(U2 * S, V2), XT, decimal=6)
# Check that different flip methods are equivalent under reconstruction
U_flip1, V_flip1 = svd_flip(U, V, u_based_decision=True)
assert_almost_equal(np.dot(U_flip1 * S, V_flip1), XT, decimal=6)
U_flip2, V_flip2 = svd_flip(U, V, u_based_decision=False)
assert_almost_equal(np.dot(U_flip2 * S, V_flip2), XT, decimal=6)
def test_randomized_svd_sign_flip():
a = np.array([[2.0, 0.0], [0.0, 1.0]])
u1, s1, v1 = randomized_svd(a, 2, flip_sign=True, random_state=41)
for seed in range(10):
u2, s2, v2 = randomized_svd(a, 2, flip_sign=True, random_state=seed)
assert_almost_equal(u1, u2)
assert_almost_equal(v1, v2)
assert_almost_equal(np.dot(u2 * s2, v2), a)
assert_almost_equal(np.dot(u2.T, u2), np.eye(2))
assert_almost_equal(np.dot(v2.T, v2), np.eye(2))
def test_randomized_svd_sign_flip_with_transpose():
# Check if the randomized_svd sign flipping is always done based on u
# irrespective of transpose.
# See https://github.com/scikit-learn/scikit-learn/issues/5608
# for more details.
def max_loading_is_positive(u, v):
"""
returns bool tuple indicating if the values maximising np.abs
are positive across all rows for u and across all columns for v.
"""
u_based = (np.abs(u).max(axis=0) == u.max(axis=0)).all()
v_based = (np.abs(v).max(axis=1) == v.max(axis=1)).all()
return u_based, v_based
mat = np.arange(10 * 8).reshape(10, -1)
# Without transpose
u_flipped, _, v_flipped = randomized_svd(mat, 3, flip_sign=True)
u_based, v_based = max_loading_is_positive(u_flipped, v_flipped)
assert u_based
assert not v_based
# With transpose
u_flipped_with_transpose, _, v_flipped_with_transpose = randomized_svd(
mat, 3, flip_sign=True, transpose=True)
u_based, v_based = max_loading_is_positive(
u_flipped_with_transpose, v_flipped_with_transpose)
assert u_based
assert not v_based
def test_cartesian():
# Check if cartesian product delivers the right results
axes = (np.array([1, 2, 3]), np.array([4, 5]), np.array([6, 7]))
true_out = np.array([[1, 4, 6],
[1, 4, 7],
[1, 5, 6],
[1, 5, 7],
[2, 4, 6],
[2, 4, 7],
[2, 5, 6],
[2, 5, 7],
[3, 4, 6],
[3, 4, 7],
[3, 5, 6],
[3, 5, 7]])
out = cartesian(axes)
assert_array_equal(true_out, out)
# check single axis
x = np.arange(3)
assert_array_equal(x[:, np.newaxis], cartesian((x,)))
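
# The same cartesian product can be cross-checked with plain numpy, e.g.
# via meshgrid (illustrative sketch, not the actual implementation):
def _cartesian_sketch(arrays):
    grids = np.meshgrid(*arrays, indexing='ij')
    return np.stack([g.ravel() for g in grids], axis=-1)
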
def test_logistic_sigmoid():
# Check correctness and robustness of logistic sigmoid implementation
def naive_log_logistic(x):
return np.log(expit(x))
x = np.linspace(-2, 2, 50)
assert_array_almost_equal(log_logistic(x), naive_log_logistic(x))
extreme_x = np.array([-100., 100.])
assert_array_almost_equal(log_logistic(extreme_x), [-100, 0])
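
# The reason a dedicated log_logistic helper exists: np.log(expit(x))
# underflows to -inf for large negative x, whereas the identity
# log(sigmoid(x)) = -log(1 + exp(-x)) = -logaddexp(0, -x) stays finite,
# as the extreme_x check above relies on. A minimal stable sketch
# (assumed equivalent formulation, not the actual implementation):
def _log_logistic_sketch(x):
    return -np.logaddexp(0, -np.asarray(x))
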
def test_incremental_variance_update_formulas():
# Test Youngs and Cramer incremental variance formulas.
# Doggie data from https://www.mathsisfun.com/data/standard-deviation.html
A = np.array([[600, 470, 170, 430, 300],
[600, 470, 170, 430, 300],
[600, 470, 170, 430, 300],
[600, 470, 170, 430, 300]]).T
idx = 2
X1 = A[:idx, :]
X2 = A[idx:, :]
old_means = X1.mean(axis=0)
old_variances = X1.var(axis=0)
old_sample_count = np.full(X1.shape[1], X1.shape[0], dtype=np.int32)
final_means, final_variances, final_count = \
_incremental_mean_and_var(X2, old_means, old_variances,
old_sample_count)
assert_almost_equal(final_means, A.mean(axis=0), 6)
assert_almost_equal(final_variances, A.var(axis=0), 6)
assert_almost_equal(final_count, A.shape[0])
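
# The update being tested, in compact form: a sketch of the Chan et al.
# parallel mean/variance combination (assuming a scalar sample count; the
# real helper also tracks per-feature counts and handles NaNs):
def _incremental_mean_var_sketch(X_new, last_mean, last_var, last_n):
    n_new = X_new.shape[0]
    n_total = last_n + n_new
    delta = X_new.mean(axis=0) - last_mean
    total_mean = last_mean + delta * n_new / n_total
    # combine the two sums of squared deviations (the M2 terms)
    m2 = (last_var * last_n + X_new.var(axis=0) * n_new
          + delta ** 2 * last_n * n_new / n_total)
    return total_mean, m2 / n_total, n_total
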
def test_incremental_mean_and_variance_ignore_nan():
old_means = np.array([535., 535., 535., 535.])
old_variances = np.array([4225., 4225., 4225., 4225.])
old_sample_count = np.array([2, 2, 2, 2], dtype=np.int32)
X = np.array([[170, 170, 170, 170],
[430, 430, 430, 430],
[300, 300, 300, 300]])
X_nan = np.array([[170, np.nan, 170, 170],
[np.nan, 170, 430, 430],
[430, 430, np.nan, 300],
[300, 300, 300, np.nan]])
X_means, X_variances, X_count = _incremental_mean_and_var(
X, old_means, old_variances, old_sample_count)
X_nan_means, X_nan_variances, X_nan_count = _incremental_mean_and_var(
X_nan, old_means, old_variances, old_sample_count)
assert_allclose(X_nan_means, X_means)
assert_allclose(X_nan_variances, X_variances)
assert_allclose(X_nan_count, X_count)
@skip_if_32bit
def test_incremental_variance_numerical_stability():
# Test Youngs and Cramer incremental variance formulas.
def np_var(A):
return A.var(axis=0)
# Naive one pass variance computation - not numerically stable
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
def one_pass_var(X):
n = X.shape[0]
exp_x2 = (X ** 2).sum(axis=0) / n
expx_2 = (X.sum(axis=0) / n) ** 2
return exp_x2 - expx_2
# Two-pass algorithm, stable.
# We use it as a benchmark. It is not an online algorithm
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
def two_pass_var(X):
mean = X.mean(axis=0)
Y = X.copy()
return np.mean((Y - mean)**2, axis=0)
# Naive online implementation
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm
    # This works only for chunks of size 1
def naive_mean_variance_update(x, last_mean, last_variance,
last_sample_count):
updated_sample_count = (last_sample_count + 1)
samples_ratio = last_sample_count / float(updated_sample_count)
updated_mean = x / updated_sample_count + last_mean * samples_ratio
updated_variance = last_variance * samples_ratio + \
(x - last_mean) * (x - updated_mean) / updated_sample_count
return updated_mean, updated_variance, updated_sample_count
    # We want to show a case where one_pass_var has a large error while
    # _incremental_mean_and_var stays well below the tolerance.
tol = 200
n_features = 2
n_samples = 10000
x1 = np.array(1e8, dtype=np.float64)
x2 = np.log(1e-5, dtype=np.float64)
A0 = np.full((n_samples // 2, n_features), x1, dtype=np.float64)
A1 = np.full((n_samples // 2, n_features), x2, dtype=np.float64)
A = np.vstack((A0, A1))
# Naive one pass var: >tol (=1063)
assert np.abs(np_var(A) - one_pass_var(A)).max() > tol
# Starting point for online algorithms: after A0
# Naive implementation: >tol (436)
mean, var, n = A0[0, :], np.zeros(n_features), n_samples // 2
for i in range(A1.shape[0]):
mean, var, n = \
naive_mean_variance_update(A1[i, :], mean, var, n)
assert n == A.shape[0]
# the mean is also slightly unstable
assert np.abs(A.mean(axis=0) - mean).max() > 1e-6
assert np.abs(np_var(A) - var).max() > tol
# Robust implementation: <tol (177)
mean, var = A0[0, :], np.zeros(n_features)
n = np.full(n_features, n_samples // 2, dtype=np.int32)
for i in range(A1.shape[0]):
mean, var, n = \
_incremental_mean_and_var(A1[i, :].reshape((1, A1.shape[1])),
mean, var, n)
assert_array_equal(n, A.shape[0])
assert_array_almost_equal(A.mean(axis=0), mean)
assert tol > np.abs(np_var(A) - var).max()
def test_incremental_variance_ddof():
# Test that degrees of freedom parameter for calculations are correct.
rng = np.random.RandomState(1999)
X = rng.randn(50, 10)
n_samples, n_features = X.shape
for batch_size in [11, 20, 37]:
steps = np.arange(0, X.shape[0], batch_size)
if steps[-1] != X.shape[0]:
steps = np.hstack([steps, n_samples])
for i, j in zip(steps[:-1], steps[1:]):
batch = X[i:j, :]
if i == 0:
incremental_means = batch.mean(axis=0)
incremental_variances = batch.var(axis=0)
# Assign this twice so that the test logic is consistent
incremental_count = batch.shape[0]
sample_count = np.full(batch.shape[1], batch.shape[0],
dtype=np.int32)
else:
result = _incremental_mean_and_var(
batch, incremental_means, incremental_variances,
sample_count)
(incremental_means, incremental_variances,
incremental_count) = result
sample_count += batch.shape[0]
calculated_means = np.mean(X[:j], axis=0)
calculated_variances = np.var(X[:j], axis=0)
assert_almost_equal(incremental_means, calculated_means, 6)
assert_almost_equal(incremental_variances,
calculated_variances, 6)
assert_array_equal(incremental_count, sample_count)
def test_vector_sign_flip():
    # Check that the sign flip works and that the largest-magnitude entry
    # of each row gets a positive sign
data = np.random.RandomState(36).randn(5, 5)
max_abs_rows = np.argmax(np.abs(data), axis=1)
data_flipped = _deterministic_vector_sign_flip(data)
max_rows = np.argmax(data_flipped, axis=1)
assert_array_equal(max_abs_rows, max_rows)
signs = np.sign(data[range(data.shape[0]), max_abs_rows])
assert_array_equal(data, data_flipped * signs[:, np.newaxis])
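
# What the helper presumably does, sketched with plain numpy: scale each
# row by the sign of its largest-magnitude entry so that entry becomes
# positive (illustrative reimplementation only):
def _vector_sign_flip_sketch(u):
    max_abs_cols = np.argmax(np.abs(u), axis=1)
    signs = np.sign(u[np.arange(u.shape[0]), max_abs_cols])
    return u * signs[:, np.newaxis]
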
def test_softmax():
rng = np.random.RandomState(0)
X = rng.randn(3, 5)
exp_X = np.exp(X)
sum_exp_X = np.sum(exp_X, axis=1).reshape((-1, 1))
assert_array_almost_equal(softmax(X), exp_X / sum_exp_X)
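
# The naive exp/sum reference above overflows for large inputs; a
# numerically stable variant subtracts the row-wise maximum first, which
# leaves the result unchanged (illustrative sketch):
def _softmax_sketch(X):
    shifted = X - X.max(axis=1, keepdims=True)
    e = np.exp(shifted)
    return e / e.sum(axis=1, keepdims=True)
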
def test_stable_cumsum():
assert_array_equal(stable_cumsum([1, 2, 3]), np.cumsum([1, 2, 3]))
r = np.random.RandomState(0).rand(100000)
assert_warns(RuntimeWarning, stable_cumsum, r, rtol=0, atol=0)
# test axis parameter
A = np.random.RandomState(36).randint(1000, size=(5, 5, 5))
assert_array_equal(stable_cumsum(A, axis=0), np.cumsum(A, axis=0))
assert_array_equal(stable_cumsum(A, axis=1), np.cumsum(A, axis=1))
assert_array_equal(stable_cumsum(A, axis=2), np.cumsum(A, axis=2))
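
# A sketch of the behaviour exercised above: accumulate in float64 and warn
# when the final cumulative value drifts too far from an exact sum
# (simplified; the real helper also supports the axis argument):
def _stable_cumsum_sketch(arr, rtol=1e-05, atol=1e-08):
    import warnings
    out = np.cumsum(arr, dtype=np.float64)
    expected = np.sum(arr, dtype=np.float64)
    if not np.isclose(out[-1], expected, rtol=rtol, atol=atol):
        warnings.warn('cumsum was found to be unstable: its last element '
                      'does not correspond to sum', RuntimeWarning)
    return out
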
def test_safe_min():
msg = ("safe_min is deprecated in version 0.22 and will be removed "
"in version 0.24.")
with pytest.warns(FutureWarning, match=msg):
safe_min(np.ones(10))
@pytest.mark.parametrize("A_array_constr", [np.array, sparse.csr_matrix],
ids=["dense", "sparse"])
@pytest.mark.parametrize("B_array_constr", [np.array, sparse.csr_matrix],
ids=["dense", "sparse"])
def test_safe_sparse_dot_2d(A_array_constr, B_array_constr):
rng = np.random.RandomState(0)
A = rng.random_sample((30, 10))
B = rng.random_sample((10, 20))
expected = np.dot(A, B)
A = A_array_constr(A)
B = B_array_constr(B)
actual = safe_sparse_dot(A, B, dense_output=True)
assert_allclose(actual, expected)
def test_safe_sparse_dot_nd():
rng = np.random.RandomState(0)
# dense ND / sparse
A = rng.random_sample((2, 3, 4, 5, 6))
B = rng.random_sample((6, 7))
expected = np.dot(A, B)
B = sparse.csr_matrix(B)
actual = safe_sparse_dot(A, B)
assert_allclose(actual, expected)
# sparse / dense ND
A = rng.random_sample((2, 3))
B = rng.random_sample((4, 5, 3, 6))
expected = np.dot(A, B)
A = sparse.csr_matrix(A)
actual = safe_sparse_dot(A, B)
assert_allclose(actual, expected)
@pytest.mark.parametrize("A_array_constr", [np.array, sparse.csr_matrix],
ids=["dense", "sparse"])
def test_safe_sparse_dot_2d_1d(A_array_constr):
rng = np.random.RandomState(0)
B = rng.random_sample((10))
# 2D @ 1D
A = rng.random_sample((30, 10))
expected = np.dot(A, B)
A = A_array_constr(A)
actual = safe_sparse_dot(A, B)
assert_allclose(actual, expected)
# 1D @ 2D
A = rng.random_sample((10, 30))
expected = np.dot(B, A)
A = A_array_constr(A)
actual = safe_sparse_dot(B, A)
assert_allclose(actual, expected)
@pytest.mark.parametrize("dense_output", [True, False])
def test_safe_sparse_dot_dense_output(dense_output):
rng = np.random.RandomState(0)
A = sparse.random(30, 10, density=0.1, random_state=rng)
B = sparse.random(10, 20, density=0.1, random_state=rng)
expected = A.dot(B)
actual = safe_sparse_dot(A, B, dense_output=dense_output)
assert sparse.issparse(actual) == (not dense_output)
if dense_output:
expected = expected.toarray()
assert_allclose_dense_sparse(actual, expected)
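
# The contract exercised by these tests, sketched: dispatch to the sparse
# or dense product depending on the operands, densifying the result only
# on request (simplified; the real function also special-cases >2D arrays):
def _safe_sparse_dot_sketch(a, b, dense_output=False):
    if sparse.issparse(a) or sparse.issparse(b):
        ret = a @ b
        if dense_output and sparse.issparse(ret):
            ret = ret.toarray()
        return ret
    return np.dot(a, b)
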

View file

@@ -0,0 +1,31 @@
""" Test fast_dict.
"""
import numpy as np
from sklearn.utils._fast_dict import IntFloatDict, argmin
def test_int_float_dict():
rng = np.random.RandomState(0)
keys = np.unique(rng.randint(100, size=10).astype(np.intp))
values = rng.rand(len(keys))
d = IntFloatDict(keys, values)
for key, value in zip(keys, values):
assert d[key] == value
assert len(d) == len(keys)
d.append(120, 3.)
assert d[120] == 3.0
assert len(d) == len(keys) + 1
for i in range(2000):
d.append(i + 1000, 4.0)
assert d[1100] == 4.0
def test_int_float_dict_argmin():
# Test the argmin implementation on the IntFloatDict
keys = np.arange(100, dtype=np.intp)
values = np.arange(100, dtype=np.float64)
d = IntFloatDict(keys, values)
assert argmin(d) == (0, 0)

View file

@@ -0,0 +1,91 @@
# Authors: Gael Varoquaux <gael.varoquaux@normalesup.org>
# Justin Vincent
# Lars Buitinck
# License: BSD 3 clause
import math
import numpy as np
import pytest
import scipy.stats
from sklearn.utils._testing import assert_array_equal
from sklearn.utils.fixes import _joblib_parallel_args
from sklearn.utils.fixes import _object_dtype_isnan
from sklearn.utils.fixes import loguniform
from sklearn.utils.fixes import MaskedArray
@pytest.mark.parametrize('joblib_version', ('0.11', '0.12.0'))
def test_joblib_parallel_args(monkeypatch, joblib_version):
import joblib
monkeypatch.setattr(joblib, '__version__', joblib_version)
if joblib_version == '0.12.0':
# arguments are simply passed through
assert _joblib_parallel_args(prefer='threads') == {'prefer': 'threads'}
assert _joblib_parallel_args(prefer='processes', require=None) == {
'prefer': 'processes', 'require': None}
assert _joblib_parallel_args(non_existing=1) == {'non_existing': 1}
elif joblib_version == '0.11':
# arguments are mapped to the corresponding backend
assert _joblib_parallel_args(prefer='threads') == {
'backend': 'threading'}
assert _joblib_parallel_args(prefer='processes') == {
'backend': 'multiprocessing'}
with pytest.raises(ValueError):
_joblib_parallel_args(prefer='invalid')
assert _joblib_parallel_args(
prefer='processes', require='sharedmem') == {
'backend': 'threading'}
with pytest.raises(ValueError):
_joblib_parallel_args(require='invalid')
with pytest.raises(NotImplementedError):
_joblib_parallel_args(verbose=True)
else:
raise ValueError
@pytest.mark.parametrize("dtype, val", ([object, 1],
[object, "a"],
[float, 1]))
def test_object_dtype_isnan(dtype, val):
X = np.array([[val, np.nan],
[np.nan, val]], dtype=dtype)
expected_mask = np.array([[False, True],
[True, False]])
mask = _object_dtype_isnan(X)
assert_array_equal(mask, expected_mask)
@pytest.mark.parametrize("low,high,base",
[(-1, 0, 10), (0, 2, np.exp(1)), (-1, 1, 2)])
def test_loguniform(low, high, base):
rv = loguniform(base ** low, base ** high)
assert isinstance(rv, scipy.stats._distn_infrastructure.rv_frozen)
rvs = rv.rvs(size=2000, random_state=0)
# Test the basics; right bounds, right size
assert (base ** low <= rvs).all() and (rvs <= base ** high).all()
assert len(rvs) == 2000
# Test that it's actually (fairly) uniform
log_rvs = np.array([math.log(x, base) for x in rvs])
counts, _ = np.histogram(log_rvs)
assert counts.mean() == 200
assert np.abs(counts - counts.mean()).max() <= 40
# Test that random_state works
assert (
loguniform(base ** low, base ** high).rvs(random_state=0)
== loguniform(base ** low, base ** high).rvs(random_state=0)
)
def test_masked_array_deprecated(): # TODO: remove in 0.25
with pytest.warns(FutureWarning, match='is deprecated'):
MaskedArray()

View file

@@ -0,0 +1,77 @@
from sklearn.utils.metaestimators import if_delegate_has_method
class Prefix:
def func(self):
pass
class MockMetaEstimator:
"""This is a mock meta estimator"""
a_prefix = Prefix()
@if_delegate_has_method(delegate="a_prefix")
def func(self):
"""This is a mock delegated function"""
pass
def test_delegated_docstring():
assert "This is a mock delegated function" \
in str(MockMetaEstimator.__dict__['func'].__doc__)
assert "This is a mock delegated function" \
in str(MockMetaEstimator.func.__doc__)
assert "This is a mock delegated function" \
in str(MockMetaEstimator().func.__doc__)
class MetaEst:
"""A mock meta estimator"""
def __init__(self, sub_est, better_sub_est=None):
self.sub_est = sub_est
self.better_sub_est = better_sub_est
@if_delegate_has_method(delegate='sub_est')
def predict(self):
pass
class MetaEstTestTuple(MetaEst):
"""A mock meta estimator to test passing a tuple of delegates"""
@if_delegate_has_method(delegate=('sub_est', 'better_sub_est'))
def predict(self):
pass
class MetaEstTestList(MetaEst):
"""A mock meta estimator to test passing a list of delegates"""
@if_delegate_has_method(delegate=['sub_est', 'better_sub_est'])
def predict(self):
pass
class HasPredict:
"""A mock sub-estimator with predict method"""
def predict(self):
pass
class HasNoPredict:
"""A mock sub-estimator with no predict method"""
pass
def test_if_delegate_has_method():
assert hasattr(MetaEst(HasPredict()), 'predict')
assert not hasattr(MetaEst(HasNoPredict()), 'predict')
assert not hasattr(MetaEstTestTuple(HasNoPredict(), HasNoPredict()),
'predict')
assert hasattr(MetaEstTestTuple(HasPredict(), HasNoPredict()), 'predict')
assert not hasattr(MetaEstTestTuple(HasNoPredict(), HasPredict()),
'predict')
assert not hasattr(MetaEstTestList(HasNoPredict(), HasPredict()),
'predict')
assert hasattr(MetaEstTestList(HasPredict(), HasPredict()), 'predict')
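
# The mechanism under test, in miniature: a descriptor that looks up the
# method on the delegate at attribute-access time, so a missing method
# raises AttributeError and hasattr() returns False on the meta-estimator
# (illustrative sketch, not the actual sklearn implementation):
class _if_delegate_has_method_sketch:
    def __init__(self, delegate_name, method_name):
        self.delegate_name = delegate_name
        self.method_name = method_name

    def __get__(self, obj, objtype=None):
        if obj is not None:
            delegate = getattr(obj, self.delegate_name)
            # raises AttributeError when the delegate lacks the method
            getattr(delegate, self.method_name)
        # a real implementation would return a bound wrapper here
        return lambda *args, **kwargs: None
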

View file

@@ -0,0 +1,439 @@
import numpy as np
import scipy.sparse as sp
from itertools import product
import pytest
from scipy.sparse import issparse
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
from scipy.sparse import dok_matrix
from scipy.sparse import lil_matrix
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_allclose
from sklearn.utils.estimator_checks import _NotAnArray
from sklearn.utils.fixes import parse_version
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.multiclass import is_multilabel
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.multiclass import class_distribution
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.multiclass import _ovr_decision_function
from sklearn.utils.metaestimators import _safe_split
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVC
from sklearn import datasets
EXAMPLES = {
'multilabel-indicator': [
        # valid whether the data is sparse or dense; sparse examples are
        # identified by their CSR format when the testing takes place
csr_matrix(np.random.RandomState(42).randint(2, size=(10, 10))),
[[0, 1], [1, 0]],
[[0, 1]],
csr_matrix(np.array([[0, 1], [1, 0]])),
csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.bool)),
csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.int8)),
csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.uint8)),
csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.float)),
csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.float32)),
csr_matrix(np.array([[0, 0], [0, 0]])),
csr_matrix(np.array([[0, 1]])),
# Only valid when data is dense
[[-1, 1], [1, -1]],
np.array([[-1, 1], [1, -1]]),
np.array([[-3, 3], [3, -3]]),
_NotAnArray(np.array([[-3, 3], [3, -3]])),
],
'multiclass': [
[1, 0, 2, 2, 1, 4, 2, 4, 4, 4],
np.array([1, 0, 2]),
np.array([1, 0, 2], dtype=np.int8),
np.array([1, 0, 2], dtype=np.uint8),
np.array([1, 0, 2], dtype=np.float),
np.array([1, 0, 2], dtype=np.float32),
np.array([[1], [0], [2]]),
_NotAnArray(np.array([1, 0, 2])),
[0, 1, 2],
['a', 'b', 'c'],
np.array(['a', 'b', 'c']),
np.array(['a', 'b', 'c'], dtype=object),
np.array(['a', 'b', 'c'], dtype=object),
],
'multiclass-multioutput': [
[[1, 0, 2, 2], [1, 4, 2, 4]],
[['a', 'b'], ['c', 'd']],
np.array([[1, 0, 2, 2], [1, 4, 2, 4]]),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32),
np.array([['a', 'b'], ['c', 'd']]),
np.array([['a', 'b'], ['c', 'd']]),
np.array([['a', 'b'], ['c', 'd']], dtype=object),
np.array([[1, 0, 2]]),
_NotAnArray(np.array([[1, 0, 2]])),
],
'binary': [
[0, 1],
[1, 1],
[],
[0],
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.bool),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32),
np.array([[0], [1]]),
_NotAnArray(np.array([[0], [1]])),
[1, -1],
[3, 5],
['a'],
['a', 'b'],
['abc', 'def'],
np.array(['abc', 'def']),
['a', 'b'],
np.array(['abc', 'def'], dtype=object),
],
'continuous': [
[1e-5],
[0, .5],
np.array([[0], [.5]]),
np.array([[0], [.5]], dtype=np.float32),
],
'continuous-multioutput': [
np.array([[0, .5], [.5, 0]]),
np.array([[0, .5], [.5, 0]], dtype=np.float32),
np.array([[0, .5]]),
],
'unknown': [
[[]],
[()],
# sequence of sequences that weren't supported even before deprecation
np.array([np.array([]), np.array([1, 2, 3])], dtype=object),
[np.array([]), np.array([1, 2, 3])],
[{1, 2, 3}, {1, 2}],
[frozenset([1, 2, 3]), frozenset([1, 2])],
# and also confusable as sequences of sequences
[{0: 'a', 1: 'b'}, {0: 'a'}],
# empty second dimension
np.array([[], []]),
# 3d
np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]),
]
}
NON_ARRAY_LIKE_EXAMPLES = [
{1, 2, 3},
{0: 'a', 1: 'b'},
{0: [5], 1: [5]},
'abc',
frozenset([1, 2, 3]),
None,
]
MULTILABEL_SEQUENCES = [
[[1], [2], [0, 1]],
[(), (2), (0, 1)],
np.array([[], [1, 2]], dtype='object'),
_NotAnArray(np.array([[], [1, 2]], dtype='object'))
]
def test_unique_labels():
# Empty iterable
with pytest.raises(ValueError):
unique_labels()
# Multiclass problem
assert_array_equal(unique_labels(range(10)), np.arange(10))
assert_array_equal(unique_labels(np.arange(10)), np.arange(10))
assert_array_equal(unique_labels([4, 0, 2]), np.array([0, 2, 4]))
# Multilabel indicator
assert_array_equal(unique_labels(np.array([[0, 0, 1],
[1, 0, 1],
[0, 0, 0]])),
np.arange(3))
assert_array_equal(unique_labels(np.array([[0, 0, 1],
[0, 0, 0]])),
np.arange(3))
# Several arrays passed
assert_array_equal(unique_labels([4, 0, 2], range(5)),
np.arange(5))
assert_array_equal(unique_labels((0, 1, 2), (0,), (2, 1)),
np.arange(3))
# Border line case with binary indicator matrix
with pytest.raises(ValueError):
unique_labels([4, 0, 2], np.ones((5, 5)))
with pytest.raises(ValueError):
unique_labels(np.ones((5, 4)), np.ones((5, 5)))
assert_array_equal(unique_labels(np.ones((4, 5)), np.ones((5, 5))),
np.arange(5))
def test_unique_labels_non_specific():
# Test unique_labels with a variety of collected examples
    # Smoke test for all supported formats
for format in ["binary", "multiclass", "multilabel-indicator"]:
for y in EXAMPLES[format]:
unique_labels(y)
    # We don't support those formats at the moment
for example in NON_ARRAY_LIKE_EXAMPLES:
with pytest.raises(ValueError):
unique_labels(example)
for y_type in ["unknown", "continuous", 'continuous-multioutput',
'multiclass-multioutput']:
for example in EXAMPLES[y_type]:
with pytest.raises(ValueError):
unique_labels(example)
def test_unique_labels_mixed_types():
# Mix with binary or multiclass and multilabel
mix_clf_format = product(EXAMPLES["multilabel-indicator"],
EXAMPLES["multiclass"] +
EXAMPLES["binary"])
for y_multilabel, y_multiclass in mix_clf_format:
with pytest.raises(ValueError):
unique_labels(y_multiclass, y_multilabel)
with pytest.raises(ValueError):
unique_labels(y_multilabel, y_multiclass)
with pytest.raises(ValueError):
unique_labels([[1, 2]], [["a", "d"]])
with pytest.raises(ValueError):
unique_labels(["1", 2])
with pytest.raises(ValueError):
unique_labels([["1", 2], [1, 3]])
with pytest.raises(ValueError):
unique_labels([["1", "2"], [2, 3]])
def test_is_multilabel():
for group, group_examples in EXAMPLES.items():
if group in ['multilabel-indicator']:
dense_exp = True
else:
dense_exp = False
for example in group_examples:
# Only mark explicitly defined sparse examples as valid sparse
# multilabel-indicators
if group == 'multilabel-indicator' and issparse(example):
sparse_exp = True
else:
sparse_exp = False
if (issparse(example) or
(hasattr(example, '__array__') and
np.asarray(example).ndim == 2 and
np.asarray(example).dtype.kind in 'biuf' and
np.asarray(example).shape[1] > 0)):
examples_sparse = [sparse_matrix(example)
for sparse_matrix in [coo_matrix,
csc_matrix,
csr_matrix,
dok_matrix,
lil_matrix]]
for exmpl_sparse in examples_sparse:
assert sparse_exp == is_multilabel(exmpl_sparse), (
'is_multilabel(%r) should be %s'
% (exmpl_sparse, sparse_exp))
# Densify sparse examples before testing
if issparse(example):
example = example.toarray()
assert dense_exp == is_multilabel(example), (
'is_multilabel(%r) should be %s'
% (example, dense_exp))
def test_check_classification_targets():
for y_type in EXAMPLES.keys():
if y_type in ["unknown", "continuous", 'continuous-multioutput']:
for example in EXAMPLES[y_type]:
msg = 'Unknown label type: '
with pytest.raises(ValueError, match=msg):
check_classification_targets(example)
else:
for example in EXAMPLES[y_type]:
check_classification_targets(example)
# @ignore_warnings
def test_type_of_target():
for group, group_examples in EXAMPLES.items():
for example in group_examples:
assert type_of_target(example) == group, (
'type_of_target(%r) should be %r, got %r'
% (example, group, type_of_target(example)))
for example in NON_ARRAY_LIKE_EXAMPLES:
msg_regex = r'Expected array-like \(array or non-string sequence\).*'
with pytest.raises(ValueError, match=msg_regex):
type_of_target(example)
for example in MULTILABEL_SEQUENCES:
msg = ('You appear to be using a legacy multi-label data '
'representation. Sequence of sequences are no longer supported;'
' use a binary array or sparse matrix instead.')
with pytest.raises(ValueError, match=msg):
type_of_target(example)
def test_type_of_target_pandas_sparse():
pd = pytest.importorskip("pandas")
if parse_version(pd.__version__) >= parse_version('0.25'):
pd_sparse_array = pd.arrays.SparseArray
else:
pd_sparse_array = pd.SparseArray
y = pd_sparse_array([1, np.nan, np.nan, 1, np.nan])
msg = "y cannot be class 'SparseSeries' or 'SparseArray'"
with pytest.raises(ValueError, match=msg):
type_of_target(y)
def test_class_distribution():
y = np.array([[1, 0, 0, 1],
[2, 2, 0, 1],
[1, 3, 0, 1],
[4, 2, 0, 1],
[2, 0, 0, 1],
[1, 3, 0, 1]])
# Define the sparse matrix with a mix of implicit and explicit zeros
data = np.array([1, 2, 1, 4, 2, 1, 0, 2, 3, 2, 3, 1, 1, 1, 1, 1, 1])
indices = np.array([0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 4, 5])
indptr = np.array([0, 6, 11, 11, 17])
y_sp = sp.csc_matrix((data, indices, indptr), shape=(6, 4))
classes, n_classes, class_prior = class_distribution(y)
classes_sp, n_classes_sp, class_prior_sp = class_distribution(y_sp)
classes_expected = [[1, 2, 4],
[0, 2, 3],
[0],
[1]]
n_classes_expected = [3, 3, 1, 1]
class_prior_expected = [[3/6, 2/6, 1/6],
[1/3, 1/3, 1/3],
[1.0],
[1.0]]
for k in range(y.shape[1]):
assert_array_almost_equal(classes[k], classes_expected[k])
assert_array_almost_equal(n_classes[k], n_classes_expected[k])
assert_array_almost_equal(class_prior[k], class_prior_expected[k])
assert_array_almost_equal(classes_sp[k], classes_expected[k])
assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])
# Test again with explicit sample weights
(classes,
n_classes,
class_prior) = class_distribution(y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0])
(classes_sp,
n_classes_sp,
class_prior_sp) = class_distribution(y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0])
class_prior_expected = [[4/9, 3/9, 2/9],
[2/9, 4/9, 3/9],
[1.0],
[1.0]]
for k in range(y.shape[1]):
assert_array_almost_equal(classes[k], classes_expected[k])
assert_array_almost_equal(n_classes[k], n_classes_expected[k])
assert_array_almost_equal(class_prior[k], class_prior_expected[k])
assert_array_almost_equal(classes_sp[k], classes_expected[k])
assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])
def test_safe_split_with_precomputed_kernel():
clf = SVC()
clfp = SVC(kernel="precomputed")
iris = datasets.load_iris()
X, y = iris.data, iris.target
K = np.dot(X, X.T)
cv = ShuffleSplit(test_size=0.25, random_state=0)
train, test = list(cv.split(X))[0]
X_train, y_train = _safe_split(clf, X, y, train)
K_train, y_train2 = _safe_split(clfp, K, y, train)
assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))
assert_array_almost_equal(y_train, y_train2)
X_test, y_test = _safe_split(clf, X, y, test, train)
K_test, y_test2 = _safe_split(clfp, K, y, test, train)
assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
assert_array_almost_equal(y_test, y_test2)
def test_ovr_decision_function():
# test properties for ovr decision function
predictions = np.array([[0, 1, 1],
[0, 1, 0],
[0, 1, 1],
[0, 1, 1]])
confidences = np.array([[-1e16, 0, -1e16],
[1., 2., -3.],
[-5., 2., 5.],
[-0.5, 0.2, 0.5]])
n_classes = 3
dec_values = _ovr_decision_function(predictions, confidences, n_classes)
# check that the decision values are within 0.5 range of the votes
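    # (pairwise columns are ordered (0 vs 1), (0 vs 2), (1 vs 2); a
    # prediction of 0 votes for the first class of the pair and 1 for the
    # second, so the first sample [0, 1, 1] yields one vote for class 0
    # and two for class 2)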
votes = np.array([[1, 0, 2],
[1, 1, 1],
[1, 0, 2],
[1, 0, 2]])
assert_allclose(votes, dec_values, atol=0.5)
    # check that the predictions are what we expect: the class with the
    # highest vote, or with the highest confidence in case of a tie.
    # For the second sample there is a tie (it should be won by class 1).
expected_prediction = np.array([2, 1, 2, 2])
assert_array_equal(np.argmax(dec_values, axis=1), expected_prediction)
    # the third and fourth samples have the same votes, but the third sample
    # has higher confidence; this should be reflected in the decision values
assert (dec_values[2, 2] > dec_values[3, 2])
# assert subset invariance.
dec_values_one = [_ovr_decision_function(np.array([predictions[i]]),
np.array([confidences[i]]),
n_classes)[0] for i in range(4)]
assert_allclose(dec_values, dec_values_one, atol=1e-6)

View file

@@ -0,0 +1,78 @@
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#
# License: BSD 3 clause
import numpy as np
from sklearn.utils.murmurhash import murmurhash3_32
from numpy.testing import assert_array_almost_equal
from numpy.testing import assert_array_equal
def test_mmhash3_int():
assert murmurhash3_32(3) == 847579505
assert murmurhash3_32(3, seed=0) == 847579505
assert murmurhash3_32(3, seed=42) == -1823081949
assert murmurhash3_32(3, positive=False) == 847579505
assert murmurhash3_32(3, seed=0, positive=False) == 847579505
assert murmurhash3_32(3, seed=42, positive=False) == -1823081949
assert murmurhash3_32(3, positive=True) == 847579505
assert murmurhash3_32(3, seed=0, positive=True) == 847579505
assert murmurhash3_32(3, seed=42, positive=True) == 2471885347
def test_mmhash3_int_array():
rng = np.random.RandomState(42)
keys = rng.randint(-5342534, 345345, size=3 * 2 * 1).astype(np.int32)
keys = keys.reshape((3, 2, 1))
for seed in [0, 42]:
expected = np.array([murmurhash3_32(int(k), seed)
for k in keys.flat])
expected = expected.reshape(keys.shape)
assert_array_equal(murmurhash3_32(keys, seed), expected)
for seed in [0, 42]:
expected = np.array([murmurhash3_32(k, seed, positive=True)
for k in keys.flat])
expected = expected.reshape(keys.shape)
assert_array_equal(murmurhash3_32(keys, seed, positive=True),
expected)
def test_mmhash3_bytes():
assert murmurhash3_32(b'foo', 0) == -156908512
assert murmurhash3_32(b'foo', 42) == -1322301282
assert murmurhash3_32(b'foo', 0, positive=True) == 4138058784
assert murmurhash3_32(b'foo', 42, positive=True) == 2972666014
def test_mmhash3_unicode():
assert murmurhash3_32('foo', 0) == -156908512
assert murmurhash3_32('foo', 42) == -1322301282
assert murmurhash3_32('foo', 0, positive=True) == 4138058784
assert murmurhash3_32('foo', 42, positive=True) == 2972666014
def test_no_collision_on_byte_range():
previous_hashes = set()
    for i in range(100):
        h = murmurhash3_32(' ' * i, 0)
        assert h not in previous_hashes, \
            "Found collision on growing string of spaces"
        previous_hashes.add(h)
def test_uniform_distribution():
n_bins, n_samples = 10, 100000
bins = np.zeros(n_bins, dtype=np.float64)
for i in range(n_samples):
bins[murmurhash3_32(i, positive=True) % n_bins] += 1
means = bins / n_samples
expected = np.full(n_bins, 1. / n_bins)
assert_array_almost_equal(means / expected, np.ones(n_bins), 2)
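
# A typical application of the positive variant checked above: stable,
# implementation-independent bucketing of arbitrary keys (illustrative
# sketch; n_buckets is an arbitrary example value):
def _hash_bucket_sketch(key, n_buckets=1024, seed=0):
    return murmurhash3_32(key, seed=seed, positive=True) % n_buckets
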

View file

@@ -0,0 +1,32 @@
import numpy as np
from sklearn.utils.optimize import _newton_cg
from scipy.optimize import fmin_ncg
from sklearn.utils._testing import assert_array_almost_equal
def test_newton_cg():
# Test that newton_cg gives same result as scipy's fmin_ncg
rng = np.random.RandomState(0)
A = rng.normal(size=(10, 10))
x0 = np.ones(10)
def func(x):
Ax = A.dot(x)
return .5 * (Ax).dot(Ax)
def grad(x):
return A.T.dot(A.dot(x))
    def hess(x, p):
        # the Hessian of 0.5 * ||Ax||^2 is A.T @ A; return its product with p
        return A.T.dot(A.dot(p))
def grad_hess(x):
return grad(x), lambda x: A.T.dot(A.dot(x))
assert_array_almost_equal(
_newton_cg(grad_hess, func, grad, x0, tol=1e-10)[0],
fmin_ncg(f=func, x0=x0, fprime=grad, fhess_p=hess)
)

View file

@@ -0,0 +1,576 @@
import re
from pprint import PrettyPrinter
import numpy as np
from sklearn.utils._pprint import _EstimatorPrettyPrinter
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import set_config, config_context
# Ignore flake8 (lots of line too long issues)
# flake8: noqa
# Constructors excerpted to test pprinting
class LogisticRegression(BaseEstimator):
def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0,
fit_intercept=True, intercept_scaling=1, class_weight=None,
random_state=None, solver='warn', max_iter=100,
multi_class='warn', verbose=0, warm_start=False, n_jobs=None,
l1_ratio=None):
self.penalty = penalty
self.dual = dual
self.tol = tol
self.C = C
self.fit_intercept = fit_intercept
self.intercept_scaling = intercept_scaling
self.class_weight = class_weight
self.random_state = random_state
self.solver = solver
self.max_iter = max_iter
self.multi_class = multi_class
self.verbose = verbose
self.warm_start = warm_start
self.n_jobs = n_jobs
self.l1_ratio = l1_ratio
def fit(self, X, y):
return self
class StandardScaler(TransformerMixin, BaseEstimator):
def __init__(self, copy=True, with_mean=True, with_std=True):
self.with_mean = with_mean
self.with_std = with_std
self.copy = copy
def transform(self, X, copy=None):
return self
class RFE(BaseEstimator):
def __init__(self, estimator, n_features_to_select=None, step=1,
verbose=0):
self.estimator = estimator
self.n_features_to_select = n_features_to_select
self.step = step
self.verbose = verbose
class GridSearchCV(BaseEstimator):
def __init__(self, estimator, param_grid, scoring=None,
n_jobs=None, iid='warn', refit=True, cv='warn', verbose=0,
pre_dispatch='2*n_jobs', error_score='raise-deprecating',
return_train_score=False):
self.estimator = estimator
self.param_grid = param_grid
self.scoring = scoring
self.n_jobs = n_jobs
self.iid = iid
self.refit = refit
self.cv = cv
self.verbose = verbose
self.pre_dispatch = pre_dispatch
self.error_score = error_score
self.return_train_score = return_train_score
class CountVectorizer(BaseEstimator):
def __init__(self, input='content', encoding='utf-8',
decode_error='strict', strip_accents=None,
lowercase=True, preprocessor=None, tokenizer=None,
stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
ngram_range=(1, 1), analyzer='word',
max_df=1.0, min_df=1, max_features=None,
vocabulary=None, binary=False, dtype=np.int64):
self.input = input
self.encoding = encoding
self.decode_error = decode_error
self.strip_accents = strip_accents
self.preprocessor = preprocessor
self.tokenizer = tokenizer
self.analyzer = analyzer
self.lowercase = lowercase
self.token_pattern = token_pattern
self.stop_words = stop_words
self.max_df = max_df
self.min_df = min_df
self.max_features = max_features
self.ngram_range = ngram_range
self.vocabulary = vocabulary
self.binary = binary
self.dtype = dtype
class Pipeline(BaseEstimator):
def __init__(self, steps, memory=None):
self.steps = steps
self.memory = memory
class SVC(BaseEstimator):
def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='auto_deprecated',
coef0=0.0, shrinking=True, probability=False,
tol=1e-3, cache_size=200, class_weight=None,
verbose=False, max_iter=-1, decision_function_shape='ovr',
random_state=None):
self.kernel = kernel
self.degree = degree
self.gamma = gamma
self.coef0 = coef0
self.tol = tol
self.C = C
self.shrinking = shrinking
self.probability = probability
self.cache_size = cache_size
self.class_weight = class_weight
self.verbose = verbose
self.max_iter = max_iter
self.decision_function_shape = decision_function_shape
self.random_state = random_state
class PCA(BaseEstimator):
def __init__(self, n_components=None, copy=True, whiten=False,
svd_solver='auto', tol=0.0, iterated_power='auto',
random_state=None):
self.n_components = n_components
self.copy = copy
self.whiten = whiten
self.svd_solver = svd_solver
self.tol = tol
self.iterated_power = iterated_power
self.random_state = random_state
class NMF(BaseEstimator):
def __init__(self, n_components=None, init=None, solver='cd',
beta_loss='frobenius', tol=1e-4, max_iter=200,
random_state=None, alpha=0., l1_ratio=0., verbose=0,
shuffle=False):
self.n_components = n_components
self.init = init
self.solver = solver
self.beta_loss = beta_loss
self.tol = tol
self.max_iter = max_iter
self.random_state = random_state
self.alpha = alpha
self.l1_ratio = l1_ratio
self.verbose = verbose
self.shuffle = shuffle
class SimpleImputer(BaseEstimator):
def __init__(self, missing_values=np.nan, strategy="mean",
fill_value=None, verbose=0, copy=True):
self.missing_values = missing_values
self.strategy = strategy
self.fill_value = fill_value
self.verbose = verbose
self.copy = copy
def test_basic(print_changed_only_false):
# Basic pprint test
lr = LogisticRegression()
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)"""
expected = expected[1:] # remove first \n
assert lr.__repr__() == expected
def test_changed_only():
# Make sure the changed_only param is correctly used when True (default)
lr = LogisticRegression(C=99)
expected = """LogisticRegression(C=99)"""
assert lr.__repr__() == expected
# Check with a repr that doesn't fit on a single line
lr = LogisticRegression(C=99, class_weight=.4, fit_intercept=False,
tol=1234, verbose=True)
expected = """
LogisticRegression(C=99, class_weight=0.4, fit_intercept=False, tol=1234,
verbose=True)"""
expected = expected[1:] # remove first \n
assert lr.__repr__() == expected
imputer = SimpleImputer(missing_values=0)
expected = """SimpleImputer(missing_values=0)"""
assert imputer.__repr__() == expected
# Defaults to np.NaN, trying with float('NaN')
imputer = SimpleImputer(missing_values=float('NaN'))
expected = """SimpleImputer()"""
assert imputer.__repr__() == expected
# make sure array parameters don't throw error (see #13583)
repr(LogisticRegressionCV(Cs=np.array([0.1, 1])))
def test_pipeline(print_changed_only_false):
# Render a pipeline object
pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=999))
expected = """
Pipeline(memory=None,
steps=[('standardscaler',
StandardScaler(copy=True, with_mean=True, with_std=True)),
('logisticregression',
LogisticRegression(C=999, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1,
l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None,
penalty='l2', random_state=None,
solver='warn', tol=0.0001, verbose=0,
warm_start=False))],
verbose=False)"""
expected = expected[1:] # remove first \n
assert pipeline.__repr__() == expected
def test_deeply_nested(print_changed_only_false):
# Render a deeply nested estimator
rfe = RFE(RFE(RFE(RFE(RFE(RFE(RFE(LogisticRegression())))))))
expected = """
RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=LogisticRegression(C=1.0,
class_weight=None,
dual=False,
fit_intercept=True,
intercept_scaling=1,
l1_ratio=None,
max_iter=100,
multi_class='warn',
n_jobs=None,
penalty='l2',
random_state=None,
solver='warn',
tol=0.0001,
verbose=0,
warm_start=False),
n_features_to_select=None,
step=1,
verbose=0),
n_features_to_select=None,
step=1,
verbose=0),
n_features_to_select=None,
step=1, verbose=0),
n_features_to_select=None, step=1,
verbose=0),
n_features_to_select=None, step=1, verbose=0),
n_features_to_select=None, step=1, verbose=0),
n_features_to_select=None, step=1, verbose=0)"""
expected = expected[1:] # remove first \n
assert rfe.__repr__() == expected
def test_gridsearch(print_changed_only_false):
# render a gridsearch
param_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
'C': [1, 10, 100, 1000]},
{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
gs = GridSearchCV(SVC(), param_grid, cv=5)
expected = """
GridSearchCV(cv=5, error_score='raise-deprecating',
estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3,
gamma='auto_deprecated', kernel='rbf', max_iter=-1,
probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False),
iid='warn', n_jobs=None,
param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
'kernel': ['rbf']},
{'C': [1, 10, 100, 1000], 'kernel': ['linear']}],
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)"""
expected = expected[1:] # remove first \n
assert gs.__repr__() == expected
def test_gridsearch_pipeline(print_changed_only_false):
# render a pipeline inside a gridsearch
pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True)
pipeline = Pipeline([
('reduce_dim', PCA()),
('classify', SVC())
])
N_FEATURES_OPTIONS = [2, 4, 8]
C_OPTIONS = [1, 10, 100, 1000]
param_grid = [
{
'reduce_dim': [PCA(iterated_power=7), NMF()],
'reduce_dim__n_components': N_FEATURES_OPTIONS,
'classify__C': C_OPTIONS
},
{
'reduce_dim': [SelectKBest(chi2)],
'reduce_dim__k': N_FEATURES_OPTIONS,
'classify__C': C_OPTIONS
}
]
    gs_pipeline = GridSearchCV(pipeline, cv=3, n_jobs=1, param_grid=param_grid)
expected = """
GridSearchCV(cv=3, error_score='raise-deprecating',
estimator=Pipeline(memory=None,
steps=[('reduce_dim',
PCA(copy=True, iterated_power='auto',
n_components=None,
random_state=None,
svd_solver='auto', tol=0.0,
whiten=False)),
('classify',
SVC(C=1.0, cache_size=200,
class_weight=None, coef0=0.0,
decision_function_shape='ovr',
degree=3, gamma='auto_deprecated',
kernel='rbf', max_iter=-1,
probability=False,
random_state=None, shrinking=True,
tol=0.001, verbose=False))]),
iid='warn', n_jobs=1,
param_grid=[{'classify__C': [1, 10, 100, 1000],
'reduce_dim': [PCA(copy=True, iterated_power=7,
n_components=None,
random_state=None,
svd_solver='auto', tol=0.0,
whiten=False),
NMF(alpha=0.0, beta_loss='frobenius',
init=None, l1_ratio=0.0,
max_iter=200, n_components=None,
random_state=None, shuffle=False,
solver='cd', tol=0.0001,
verbose=0)],
'reduce_dim__n_components': [2, 4, 8]},
{'classify__C': [1, 10, 100, 1000],
'reduce_dim': [SelectKBest(k=10,
score_func=<function chi2 at some_address>)],
'reduce_dim__k': [2, 4, 8]}],
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)"""
expected = expected[1:] # remove first \n
    repr_ = pp.pformat(gs_pipeline)
# Remove address of '<function chi2 at 0x.....>' for reproducibility
repr_ = re.sub('function chi2 at 0x.*>',
'function chi2 at some_address>', repr_)
assert repr_ == expected
def test_n_max_elements_to_show(print_changed_only_false):
n_max_elements_to_show = 30
pp = _EstimatorPrettyPrinter(
compact=True, indent=1, indent_at_name=True,
n_max_elements_to_show=n_max_elements_to_show
)
# No ellipsis
vocabulary = {i: i for i in range(n_max_elements_to_show)}
vectorizer = CountVectorizer(vocabulary=vocabulary)
expected = r"""
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None,
vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7,
8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14,
15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20,
21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26,
27: 27, 28: 28, 29: 29})"""
expected = expected[1:] # remove first \n
assert pp.pformat(vectorizer) == expected
# Now with ellipsis
vocabulary = {i: i for i in range(n_max_elements_to_show + 1)}
vectorizer = CountVectorizer(vocabulary=vocabulary)
expected = r"""
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None,
vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7,
8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14,
15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20,
21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26,
27: 27, 28: 28, 29: 29, ...})"""
expected = expected[1:] # remove first \n
assert pp.pformat(vectorizer) == expected
# Also test with lists
param_grid = {'C': list(range(n_max_elements_to_show))}
gs = GridSearchCV(SVC(), param_grid)
expected = """
GridSearchCV(cv='warn', error_score='raise-deprecating',
estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3,
gamma='auto_deprecated', kernel='rbf', max_iter=-1,
probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False),
iid='warn', n_jobs=None,
param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)"""
expected = expected[1:] # remove first \n
assert pp.pformat(gs) == expected
# Now with ellipsis
param_grid = {'C': list(range(n_max_elements_to_show + 1))}
gs = GridSearchCV(SVC(), param_grid)
expected = """
GridSearchCV(cv='warn', error_score='raise-deprecating',
estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3,
gamma='auto_deprecated', kernel='rbf', max_iter=-1,
probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False),
iid='warn', n_jobs=None,
param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, ...]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)"""
expected = expected[1:] # remove first \n
assert pp.pformat(gs) == expected
def test_bruteforce_ellipsis(print_changed_only_false):
# Check that the bruteforce ellipsis (used when the number of non-blank
# characters exceeds N_CHAR_MAX) renders correctly.
lr = LogisticRegression()
# test when the left and right side of the ellipsis aren't on the same
# line.
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
in...
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)"""
expected = expected[1:] # remove first \n
assert expected == lr.__repr__(N_CHAR_MAX=150)
# test with very small N_CHAR_MAX
    # Note that N_CHAR_MAX is not strictly enforced, which is expected: to
    # avoid weird reprs we still keep the whole line of the right part
    # (after the ellipsis).
expected = """
Lo...
warm_start=False)"""
expected = expected[1:] # remove first \n
assert expected == lr.__repr__(N_CHAR_MAX=4)
# test with N_CHAR_MAX == number of non-blank characters: In this case we
# don't want ellipsis
full_repr = lr.__repr__(N_CHAR_MAX=float('inf'))
n_nonblank = len(''.join(full_repr.split()))
assert lr.__repr__(N_CHAR_MAX=n_nonblank) == full_repr
assert '...' not in full_repr
    # test with N_CHAR_MAX == number of non-blank characters - 10: the left
    # and right side of the ellipsis are on different lines. In this case we
    # want to expand the whole line of the right side
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_i...
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)"""
expected = expected[1:] # remove first \n
assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 10)
    # test with N_CHAR_MAX == number of non-blank characters - 4: the left
    # and right side of the ellipsis are on the same line. In this case we
    # don't want to expand the whole line of the right side, just add the
    # ellipsis between the 2 sides.
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter...,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)"""
expected = expected[1:] # remove first \n
assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 4)
# test with N_CHAR_MAX == number of non-blank characters - 2: the left and
    # right side of the ellipsis are on the same line, but adding the ellipsis
# would actually make the repr longer. So we don't add the ellipsis.
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)"""
expected = expected[1:] # remove first \n
assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 2)
def test_builtin_prettyprinter():
    # Non-regression test that ensures we can still use the builtin
    # PrettyPrinter class for estimators (as done e.g. by joblib).
    # This used to be a bug.
PrettyPrinter().pprint(LogisticRegression())
def test_kwargs_in_init():
# Make sure the changed_only=True mode is OK when an argument is passed as
# kwargs.
# Non-regression test for
# https://github.com/scikit-learn/scikit-learn/issues/17206
class WithKWargs(BaseEstimator):
# Estimator with a kwargs argument. These need to hack around
# set_params and get_params. Here we mimic what LightGBM does.
def __init__(self, a='willchange', b='unchanged', **kwargs):
self.a = a
self.b = b
self._other_params = {}
self.set_params(**kwargs)
def get_params(self, deep=True):
params = super().get_params(deep=deep)
params.update(self._other_params)
return params
def set_params(self, **params):
for key, value in params.items():
setattr(self, key, value)
self._other_params[key] = value
return self
est = WithKWargs(a='something', c='abcd', d=None)
expected = "WithKWargs(a='something', c='abcd', d=None)"
assert expected == est.__repr__()
with config_context(print_changed_only=False):
expected = "WithKWargs(a='something', b='unchanged', c='abcd', d=None)"
assert expected == est.__repr__()

View file

@@ -0,0 +1,187 @@
import numpy as np
import pytest
import scipy.sparse as sp
from scipy.special import comb
from numpy.testing import assert_array_almost_equal
from sklearn.utils.random import _random_choice_csc, sample_without_replacement
from sklearn.utils._random import _our_rand_r_py
###############################################################################
# test custom sampling without replacement algorithm
###############################################################################
def test_invalid_sample_without_replacement_algorithm():
with pytest.raises(ValueError):
sample_without_replacement(5, 4, "unknown")
def test_sample_without_replacement_algorithms():
methods = ("auto", "tracking_selection", "reservoir_sampling", "pool")
for m in methods:
def sample_without_replacement_method(n_population, n_samples,
random_state=None):
return sample_without_replacement(n_population, n_samples,
method=m,
random_state=random_state)
check_edge_case_of_sample_int(sample_without_replacement_method)
check_sample_int(sample_without_replacement_method)
check_sample_int_distribution(sample_without_replacement_method)
def check_edge_case_of_sample_int(sample_without_replacement):
# n_population < n_sample
with pytest.raises(ValueError):
sample_without_replacement(0, 1)
with pytest.raises(ValueError):
sample_without_replacement(1, 2)
# n_population == n_samples
assert sample_without_replacement(0, 0).shape == (0, )
assert sample_without_replacement(1, 1).shape == (1, )
# n_population >= n_samples
assert sample_without_replacement(5, 0).shape == (0, )
assert sample_without_replacement(5, 1).shape == (1, )
# n_population < 0 or n_samples < 0
with pytest.raises(ValueError):
sample_without_replacement(-1, 5)
with pytest.raises(ValueError):
sample_without_replacement(5, -1)
def check_sample_int(sample_without_replacement):
    # This test is heavily inspired by test_random.py of python-core.
#
# For the entire allowable range of 0 <= k <= N, validate that
# the sample is of the correct length and contains only unique items
n_population = 100
for n_samples in range(n_population + 1):
s = sample_without_replacement(n_population, n_samples)
assert len(s) == n_samples
unique = np.unique(s)
assert np.size(unique) == n_samples
assert np.all(unique < n_population)
# test edge case n_population == n_samples == 0
assert np.size(sample_without_replacement(0, 0)) == 0
def check_sample_int_distribution(sample_without_replacement):
    # This test is heavily inspired by test_random.py of python-core.
#
# For the entire allowable range of 0 <= k <= N, validate that
# sample generates all possible permutations
n_population = 10
    # a large number of trials prevents false negatives without slowing down
    # the normal case
n_trials = 10000
for n_samples in range(n_population):
        # Counting the number of combinations is not as good as counting
        # the number of permutations. However, it works with sampling
        # algorithms that do not provide a random permutation of the
        # subset of integers.
n_expected = comb(n_population, n_samples, exact=True)
output = {}
for i in range(n_trials):
output[frozenset(sample_without_replacement(n_population,
n_samples))] = None
if len(output) == n_expected:
break
else:
raise AssertionError(
"number of combinations != number of expected (%s != %s)" %
(len(output), n_expected))
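
# A small worked check of the stopping condition above (hypothetical
# `_demo_` helper): the loop counts distinct subsets, i.e. combinations
# rather than permutations.
def _demo_expected_combination_count():
    # choosing 3 items out of 10 with order ignored: 10! / (3! * 7!) == 120
    assert comb(10, 3, exact=True) == 120
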
def test_random_choice_csc(n_samples=10000, random_state=24):
# Explicit class probabilities
classes = [np.array([0, 1]), np.array([0, 1, 2])]
class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
got = _random_choice_csc(n_samples, classes, class_probabilities,
random_state)
assert sp.issparse(got)
for k in range(len(classes)):
p = np.bincount(got.getcol(k).toarray().ravel()) / float(n_samples)
assert_array_almost_equal(class_probabilities[k], p, decimal=1)
# Implicit class probabilities
classes = [[0, 1], [1, 2]] # test for array-like support
class_probabilities = [np.array([0.5, 0.5]), np.array([0, 1/2, 1/2])]
got = _random_choice_csc(n_samples=n_samples,
classes=classes,
random_state=random_state)
assert sp.issparse(got)
for k in range(len(classes)):
p = np.bincount(got.getcol(k).toarray().ravel()) / float(n_samples)
assert_array_almost_equal(class_probabilities[k], p, decimal=1)
# Edge case probabilities 1.0 and 0.0
classes = [np.array([0, 1]), np.array([0, 1, 2])]
class_probabilities = [np.array([1.0, 0.0]), np.array([0.0, 1.0, 0.0])]
got = _random_choice_csc(n_samples, classes, class_probabilities,
random_state)
assert sp.issparse(got)
for k in range(len(classes)):
p = np.bincount(got.getcol(k).toarray().ravel(),
minlength=len(class_probabilities[k])) / n_samples
assert_array_almost_equal(class_probabilities[k], p, decimal=1)
# One class target data
classes = [[1], [0]] # test for array-like support
class_probabilities = [np.array([0.0, 1.0]), np.array([1.0])]
got = _random_choice_csc(n_samples=n_samples,
classes=classes,
random_state=random_state)
assert sp.issparse(got)
for k in range(len(classes)):
p = np.bincount(got.getcol(k).toarray().ravel()) / n_samples
assert_array_almost_equal(class_probabilities[k], p, decimal=1)
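
# A minimal sketch (hypothetical `_demo_` helper) of the private API
# exercised above: each output column is sampled independently from its
# own (classes, probabilities) pair and packed into one sparse matrix.
def _demo_random_choice_csc():
    classes = [np.array([0, 1]), np.array([0, 1, 2])]
    probs = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
    got = _random_choice_csc(4, classes, probs, random_state=0)
    return got.toarray()  # dense (4, 2) array of sampled class labels
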
def test_random_choice_csc_errors():
# the length of an array in classes and class_probabilities is mismatched
classes = [np.array([0, 1]), np.array([0, 1, 2, 3])]
class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
with pytest.raises(ValueError):
_random_choice_csc(4, classes, class_probabilities, 1)
    # a string class dtype is not supported
classes = [np.array(["a", "1"]), np.array(["z", "1", "2"])]
class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
with pytest.raises(ValueError):
_random_choice_csc(4, classes, class_probabilities, 1)
    # a float class dtype is not supported either
classes = [np.array([4.2, 0.1]), np.array([0.1, 0.2, 9.4])]
class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
with pytest.raises(ValueError):
_random_choice_csc(4, classes, class_probabilities, 1)
# Given probabilities don't sum to 1
classes = [np.array([0, 1]), np.array([0, 1, 2])]
class_probabilities = [np.array([0.5, 0.6]), np.array([0.6, 0.1, 0.3])]
with pytest.raises(ValueError):
_random_choice_csc(4, classes, class_probabilities, 1)
def test_our_rand_r():
assert 131541053 == _our_rand_r_py(1273642419)
assert 270369 == _our_rand_r_py(0)
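
# A pure-Python sketch of the generator under test, assuming it is the
# 32-bit xorshift (shifts 13, 17, 5) behind scikit-learn's our_rand_r,
# reduced modulo RAND_R_MAX + 1 == 0x80000000; the function name below is
# hypothetical.
def _xorshift_rand_r(seed):
    if seed == 0:
        seed = 1  # the generator must never be seeded with 0
    seed ^= (seed << 13) & 0xFFFFFFFF
    seed ^= seed >> 17
    seed ^= (seed << 5) & 0xFFFFFFFF
    return seed % 0x80000000
# Under that assumption, _xorshift_rand_r(0) == 270369, matching the
# second expected value asserted above.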

View file

@@ -0,0 +1,153 @@
# Author: Tom Dupre la Tour
# Joan Massich <mailsik@gmail.com>
#
# License: BSD 3 clause
import numpy as np
import pytest
import scipy.sparse as sp
from numpy.testing import assert_array_equal
from sklearn.utils._seq_dataset import (
ArrayDataset32, ArrayDataset64, CSRDataset32, CSRDataset64)
from sklearn.datasets import load_iris
from sklearn.utils._testing import assert_allclose
iris = load_iris()
X64 = iris.data.astype(np.float64)
y64 = iris.target.astype(np.float64)
X_csr64 = sp.csr_matrix(X64)
sample_weight64 = np.arange(y64.size, dtype=np.float64)
X32 = iris.data.astype(np.float32)
y32 = iris.target.astype(np.float32)
X_csr32 = sp.csr_matrix(X32)
sample_weight32 = np.arange(y32.size, dtype=np.float32)
def assert_csr_equal_values(current, expected):
current.eliminate_zeros()
expected.eliminate_zeros()
expected = expected.astype(current.dtype)
assert current.shape[0] == expected.shape[0]
assert current.shape[1] == expected.shape[1]
assert_array_equal(current.data, expected.data)
assert_array_equal(current.indices, expected.indices)
assert_array_equal(current.indptr, expected.indptr)
def make_dense_dataset_32():
return ArrayDataset32(X32, y32, sample_weight32, seed=42)
def make_dense_dataset_64():
return ArrayDataset64(X64, y64, sample_weight64, seed=42)
def make_sparse_dataset_32():
return CSRDataset32(X_csr32.data, X_csr32.indptr, X_csr32.indices, y32,
sample_weight32, seed=42)
def make_sparse_dataset_64():
return CSRDataset64(X_csr64.data, X_csr64.indptr, X_csr64.indices, y64,
sample_weight64, seed=42)
@pytest.mark.parametrize('dataset_constructor', [
make_dense_dataset_32,
make_dense_dataset_64,
make_sparse_dataset_32,
make_sparse_dataset_64,
])
def test_seq_dataset_basic_iteration(dataset_constructor):
NUMBER_OF_RUNS = 5
dataset = dataset_constructor()
for _ in range(NUMBER_OF_RUNS):
# next sample
xi_, yi, swi, idx = dataset._next_py()
        xi = sp.csr_matrix(xi_, shape=(1, X64.shape[1]))
assert_csr_equal_values(xi, X_csr64[idx])
assert yi == y64[idx]
assert swi == sample_weight64[idx]
# random sample
xi_, yi, swi, idx = dataset._random_py()
        xi = sp.csr_matrix(xi_, shape=(1, X64.shape[1]))
assert_csr_equal_values(xi, X_csr64[idx])
assert yi == y64[idx]
assert swi == sample_weight64[idx]
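
# A minimal sketch (hypothetical `_demo_` helper) of the sample layout the
# assertions above rely on: _next_py returns
# ((data, indices, indptr), y, sample_weight, idx), and the triple rebuilds
# one CSR row of X.
def _demo_next_sample_layout():
    dataset = make_dense_dataset_64()
    (data, indices, indptr), yi, swi, idx = dataset._next_py()
    row = sp.csr_matrix((data, indices, indptr), shape=(1, X64.shape[1]))
    return row, yi, swi, idx
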
@pytest.mark.parametrize('make_dense_dataset,make_sparse_dataset', [
(make_dense_dataset_32, make_sparse_dataset_32),
(make_dense_dataset_64, make_sparse_dataset_64),
])
def test_seq_dataset_shuffle(make_dense_dataset, make_sparse_dataset):
dense_dataset, sparse_dataset = make_dense_dataset(), make_sparse_dataset()
# not shuffled
for i in range(5):
_, _, _, idx1 = dense_dataset._next_py()
_, _, _, idx2 = sparse_dataset._next_py()
assert idx1 == i
assert idx2 == i
for i in [132, 50, 9, 18, 58]:
_, _, _, idx1 = dense_dataset._random_py()
_, _, _, idx2 = sparse_dataset._random_py()
assert idx1 == i
assert idx2 == i
seed = 77
dense_dataset._shuffle_py(seed)
sparse_dataset._shuffle_py(seed)
idx_next = [63, 91, 148, 87, 29]
idx_shuffle = [137, 125, 56, 121, 127]
for i, j in zip(idx_next, idx_shuffle):
_, _, _, idx1 = dense_dataset._next_py()
_, _, _, idx2 = sparse_dataset._next_py()
assert idx1 == i
assert idx2 == i
_, _, _, idx1 = dense_dataset._random_py()
_, _, _, idx2 = sparse_dataset._random_py()
assert idx1 == j
assert idx2 == j
@pytest.mark.parametrize('make_dataset_32,make_dataset_64', [
(make_dense_dataset_32, make_dense_dataset_64),
(make_sparse_dataset_32, make_sparse_dataset_64),
])
def test_fused_types_consistency(make_dataset_32, make_dataset_64):
dataset_32, dataset_64 = make_dataset_32(), make_dataset_64()
NUMBER_OF_RUNS = 5
for _ in range(NUMBER_OF_RUNS):
# next sample
(xi_data32, _, _), yi32, _, _ = dataset_32._next_py()
(xi_data64, _, _), yi64, _, _ = dataset_64._next_py()
assert xi_data32.dtype == np.float32
assert xi_data64.dtype == np.float64
assert_allclose(xi_data64, xi_data32, rtol=1e-5)
assert_allclose(yi64, yi32, rtol=1e-5)
def test_buffer_dtype_mismatch_error():
    with pytest.raises(ValueError, match='Buffer dtype mismatch'):
        ArrayDataset64(X32, y32, sample_weight32, seed=42)
    with pytest.raises(ValueError, match='Buffer dtype mismatch'):
        ArrayDataset32(X64, y64, sample_weight64, seed=42)
    with pytest.raises(ValueError, match='Buffer dtype mismatch'):
        CSRDataset64(X_csr32.data, X_csr32.indptr, X_csr32.indices, y32,
                     sample_weight32, seed=42)
    with pytest.raises(ValueError, match='Buffer dtype mismatch'):
        CSRDataset32(X_csr64.data, X_csr64.indptr, X_csr64.indices, y64,
                     sample_weight64, seed=42)

View file

@@ -0,0 +1,95 @@
from collections import defaultdict
import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn.utils.graph import (graph_shortest_path,
single_source_shortest_path_length)
def floyd_warshall_slow(graph, directed=False):
N = graph.shape[0]
    # set missing edges (zero entries) to infinity
    graph[np.where(graph == 0)] = np.inf
    # set diagonal to zero
    graph.flat[::N + 1] = 0
if not directed:
graph = np.minimum(graph, graph.T)
for k in range(N):
for i in range(N):
for j in range(N):
graph[i, j] = min(graph[i, j], graph[i, k] + graph[k, j])
graph[np.where(np.isinf(graph))] = 0
return graph
def generate_graph(N=20):
    # sparse grid of distances
    rng = np.random.RandomState(0)
    dist_matrix = rng.random_sample((N, N))
    # make symmetric: distances are not direction-dependent
    dist_matrix = dist_matrix + dist_matrix.T
    # make the graph sparse
    i = (rng.randint(N, size=N * N // 2), rng.randint(N, size=N * N // 2))
    dist_matrix[i] = 0
    # set diagonal to zero
    dist_matrix.flat[::N + 1] = 0
    return dist_matrix
def test_floyd_warshall():
dist_matrix = generate_graph(20)
for directed in (True, False):
graph_FW = graph_shortest_path(dist_matrix, directed, 'FW')
graph_py = floyd_warshall_slow(dist_matrix.copy(), directed)
assert_array_almost_equal(graph_FW, graph_py)
def test_dijkstra():
dist_matrix = generate_graph(20)
for directed in (True, False):
graph_D = graph_shortest_path(dist_matrix, directed, 'D')
graph_py = floyd_warshall_slow(dist_matrix.copy(), directed)
assert_array_almost_equal(graph_D, graph_py)
def test_shortest_path():
dist_matrix = generate_graph(20)
# We compare path length and not costs (-> set distances to 0 or 1)
dist_matrix[dist_matrix != 0] = 1
for directed in (True, False):
if not directed:
dist_matrix = np.minimum(dist_matrix, dist_matrix.T)
graph_py = floyd_warshall_slow(dist_matrix.copy(), directed)
for i in range(dist_matrix.shape[0]):
# Non-reachable nodes have distance 0 in graph_py
dist_dict = defaultdict(int)
dist_dict.update(single_source_shortest_path_length(dist_matrix,
i))
for j in range(graph_py[i].shape[0]):
assert_array_almost_equal(dist_dict[j], graph_py[i, j])
def test_dijkstra_bug_fix():
X = np.array([[0., 0., 4.],
[1., 0., 2.],
[0., 5., 0.]])
dist_FW = graph_shortest_path(X, directed=False, method='FW')
dist_D = graph_shortest_path(X, directed=False, method='D')
assert_array_almost_equal(dist_D, dist_FW)
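
# A minimal sketch (hypothetical `_demo_` helper): both solvers should
# agree on a tiny undirected 3-node path graph, where a 0 entry means
# "no direct edge".
def _demo_shortest_path_methods():
    X = np.array([[0., 1., 0.],
                  [1., 0., 2.],
                  [0., 2., 0.]])
    dist_fw = graph_shortest_path(X, directed=False, method='FW')
    dist_d = graph_shortest_path(X, directed=False, method='D')
    return dist_fw, dist_d  # distance 0 -> 2 is 3.0, going through node 1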

View file

@@ -0,0 +1,37 @@
from sklearn.utils._show_versions import _get_sys_info
from sklearn.utils._show_versions import _get_deps_info
from sklearn.utils._show_versions import show_versions
from sklearn.utils._testing import ignore_warnings
def test_get_sys_info():
sys_info = _get_sys_info()
assert 'python' in sys_info
assert 'executable' in sys_info
assert 'machine' in sys_info
def test_get_deps_info():
with ignore_warnings():
deps_info = _get_deps_info()
assert 'pip' in deps_info
assert 'setuptools' in deps_info
assert 'sklearn' in deps_info
assert 'numpy' in deps_info
assert 'scipy' in deps_info
assert 'Cython' in deps_info
assert 'pandas' in deps_info
assert 'matplotlib' in deps_info
assert 'joblib' in deps_info
def test_show_versions(capsys):
with ignore_warnings():
show_versions()
out, err = capsys.readouterr()
assert 'python' in out
assert 'numpy' in out

View file

@@ -0,0 +1,617 @@
import pytest
import numpy as np
import scipy.sparse as sp
from scipy import linalg
from numpy.testing import assert_array_almost_equal, assert_array_equal
from numpy.random import RandomState
from sklearn.datasets import make_classification
from sklearn.utils.sparsefuncs import (mean_variance_axis,
incr_mean_variance_axis,
inplace_column_scale,
inplace_row_scale,
inplace_swap_row, inplace_swap_column,
min_max_axis,
count_nonzero, csc_median_axis_0)
from sklearn.utils.sparsefuncs_fast import (assign_rows_csr,
inplace_csr_row_normalize_l1,
inplace_csr_row_normalize_l2,
csr_row_norms)
from sklearn.utils._testing import assert_allclose
def test_mean_variance_axis0():
X, _ = make_classification(5, 4, random_state=0)
# Sparsify the array a little bit
X[0, 0] = 0
X[2, 1] = 0
X[4, 3] = 0
X_lil = sp.lil_matrix(X)
X_lil[1, 0] = 0
X[1, 0] = 0
with pytest.raises(TypeError):
mean_variance_axis(X_lil, axis=0)
X_csr = sp.csr_matrix(X_lil)
X_csc = sp.csc_matrix(X_lil)
expected_dtypes = [(np.float32, np.float32),
(np.float64, np.float64),
(np.int32, np.float64),
(np.int64, np.float64)]
for input_dtype, output_dtype in expected_dtypes:
X_test = X.astype(input_dtype)
for X_sparse in (X_csr, X_csc):
X_sparse = X_sparse.astype(input_dtype)
X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
assert X_means.dtype == output_dtype
assert X_vars.dtype == output_dtype
assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
def test_mean_variance_axis1():
X, _ = make_classification(5, 4, random_state=0)
# Sparsify the array a little bit
X[0, 0] = 0
X[2, 1] = 0
X[4, 3] = 0
X_lil = sp.lil_matrix(X)
X_lil[1, 0] = 0
X[1, 0] = 0
with pytest.raises(TypeError):
mean_variance_axis(X_lil, axis=1)
X_csr = sp.csr_matrix(X_lil)
X_csc = sp.csc_matrix(X_lil)
expected_dtypes = [(np.float32, np.float32),
(np.float64, np.float64),
(np.int32, np.float64),
(np.int64, np.float64)]
for input_dtype, output_dtype in expected_dtypes:
X_test = X.astype(input_dtype)
for X_sparse in (X_csr, X_csc):
X_sparse = X_sparse.astype(input_dtype)
            X_means, X_vars = mean_variance_axis(X_sparse, axis=1)
            assert X_means.dtype == output_dtype
            assert X_vars.dtype == output_dtype
            assert_array_almost_equal(X_means, np.mean(X_test, axis=1))
            assert_array_almost_equal(X_vars, np.var(X_test, axis=1))
def test_incr_mean_variance_axis():
for axis in [0, 1]:
rng = np.random.RandomState(0)
n_features = 50
n_samples = 10
data_chunks = [rng.randint(0, 2, size=n_features)
for i in range(n_samples)]
# default params for incr_mean_variance
last_mean = np.zeros(n_features)
last_var = np.zeros_like(last_mean)
last_n = np.zeros_like(last_mean, dtype=np.int64)
# Test errors
X = np.array(data_chunks[0])
X = np.atleast_2d(X)
X_lil = sp.lil_matrix(X)
X_csr = sp.csr_matrix(X_lil)
with pytest.raises(TypeError):
incr_mean_variance_axis(X=axis, axis=last_mean, last_mean=last_var,
last_var=last_n)
with pytest.raises(TypeError):
incr_mean_variance_axis(X_lil, axis=axis, last_mean=last_mean,
last_var=last_var, last_n=last_n)
# Test _incr_mean_and_var with a 1 row input
X_means, X_vars = mean_variance_axis(X_csr, axis)
X_means_incr, X_vars_incr, n_incr = \
incr_mean_variance_axis(X_csr, axis=axis, last_mean=last_mean,
last_var=last_var, last_n=last_n)
assert_array_almost_equal(X_means, X_means_incr)
assert_array_almost_equal(X_vars, X_vars_incr)
        # X.shape[axis] gives the number of samples seen
assert_array_equal(X.shape[axis], n_incr)
X_csc = sp.csc_matrix(X_lil)
X_means, X_vars = mean_variance_axis(X_csc, axis)
assert_array_almost_equal(X_means, X_means_incr)
assert_array_almost_equal(X_vars, X_vars_incr)
assert_array_equal(X.shape[axis], n_incr)
# Test _incremental_mean_and_var with whole data
X = np.vstack(data_chunks)
X_lil = sp.lil_matrix(X)
X_csr = sp.csr_matrix(X_lil)
X_csc = sp.csc_matrix(X_lil)
expected_dtypes = [(np.float32, np.float32),
(np.float64, np.float64),
(np.int32, np.float64),
(np.int64, np.float64)]
for input_dtype, output_dtype in expected_dtypes:
for X_sparse in (X_csr, X_csc):
X_sparse = X_sparse.astype(input_dtype)
last_mean = last_mean.astype(output_dtype)
last_var = last_var.astype(output_dtype)
X_means, X_vars = mean_variance_axis(X_sparse, axis)
X_means_incr, X_vars_incr, n_incr = \
incr_mean_variance_axis(X_sparse, axis=axis,
last_mean=last_mean,
last_var=last_var,
last_n=last_n)
assert X_means_incr.dtype == output_dtype
assert X_vars_incr.dtype == output_dtype
assert_array_almost_equal(X_means, X_means_incr)
assert_array_almost_equal(X_vars, X_vars_incr)
assert_array_equal(X.shape[axis], n_incr)
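
# A pure-NumPy sketch of the streaming mean/variance update (Chan et al.)
# that the incremental helper above is expected to match; the `_demo_`
# helper is hypothetical and `batch` is a dense (n_new, n_features) array.
def _demo_streaming_mean_var(last_mean, last_var, last_n, batch):
    n_new = batch.shape[0]
    new_n = last_n + n_new
    delta = batch.mean(axis=0) - last_mean
    new_mean = last_mean + delta * n_new / new_n
    # combine the two sums of squared deviations, then renormalize
    m_old = last_var * last_n
    m_new = batch.var(axis=0) * n_new
    new_var = (m_old + m_new + delta ** 2 * last_n * n_new / new_n) / new_n
    return new_mean, new_var, new_n
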
@pytest.mark.parametrize(
"X1, X2",
[
(sp.random(5, 2, density=0.8, format='csr', random_state=0),
sp.random(13, 2, density=0.8, format='csr', random_state=0)),
(sp.random(5, 2, density=0.8, format='csr', random_state=0),
sp.hstack([sp.csr_matrix(np.full((13, 1), fill_value=np.nan)),
sp.random(13, 1, density=0.8, random_state=42)],
format="csr"))
]
)
def test_incr_mean_variance_axis_equivalence_mean_variance(X1, X2):
# non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/16448
# check that computing the incremental mean and variance is equivalent to
# computing the mean and variance on the stacked dataset.
axis = 0
last_mean, last_var = np.zeros(X1.shape[1]), np.zeros(X1.shape[1])
last_n = np.zeros(X1.shape[1], dtype=np.int64)
updated_mean, updated_var, updated_n = incr_mean_variance_axis(
X1, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
)
updated_mean, updated_var, updated_n = incr_mean_variance_axis(
X2, axis=axis, last_mean=updated_mean, last_var=updated_var,
last_n=updated_n
)
X = sp.vstack([X1, X2])
assert_allclose(updated_mean, np.nanmean(X.A, axis=axis))
assert_allclose(updated_var, np.nanvar(X.A, axis=axis))
assert_allclose(updated_n, np.count_nonzero(~np.isnan(X.A), axis=0))
def test_incr_mean_variance_no_new_n():
# check the behaviour when we update the variance with an empty matrix
axis = 0
X1 = sp.random(5, 1, density=0.8, random_state=0).tocsr()
X2 = sp.random(0, 1, density=0.8, random_state=0).tocsr()
last_mean, last_var = np.zeros(X1.shape[1]), np.zeros(X1.shape[1])
last_n = np.zeros(X1.shape[1], dtype=np.int64)
last_mean, last_var, last_n = incr_mean_variance_axis(
X1, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
)
    # update the statistics with an empty matrix, which should be ignored
updated_mean, updated_var, updated_n = incr_mean_variance_axis(
X2, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
)
assert_allclose(updated_mean, last_mean)
assert_allclose(updated_var, last_var)
assert_allclose(updated_n, last_n)
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("sparse_constructor", [sp.csc_matrix, sp.csr_matrix])
def test_incr_mean_variance_axis_ignore_nan(axis, sparse_constructor):
old_means = np.array([535., 535., 535., 535.])
old_variances = np.array([4225., 4225., 4225., 4225.])
old_sample_count = np.array([2, 2, 2, 2], dtype=np.int64)
X = sparse_constructor(
np.array([[170, 170, 170, 170],
[430, 430, 430, 430],
[300, 300, 300, 300]]))
X_nan = sparse_constructor(
np.array([[170, np.nan, 170, 170],
[np.nan, 170, 430, 430],
[430, 430, np.nan, 300],
[300, 300, 300, np.nan]]))
    # we avoid creating specific data for axis 0 and 1: transposing the data
    # is enough.
if axis:
X = X.T
X_nan = X_nan.T
# take a copy of the old statistics since they are modified in place.
X_means, X_vars, X_sample_count = incr_mean_variance_axis(
X, axis=axis, last_mean=old_means.copy(),
last_var=old_variances.copy(), last_n=old_sample_count.copy())
X_nan_means, X_nan_vars, X_nan_sample_count = incr_mean_variance_axis(
X_nan, axis=axis, last_mean=old_means.copy(),
last_var=old_variances.copy(), last_n=old_sample_count.copy())
assert_allclose(X_nan_means, X_means)
assert_allclose(X_nan_vars, X_vars)
assert_allclose(X_nan_sample_count, X_sample_count)
def test_mean_variance_illegal_axis():
X, _ = make_classification(5, 4, random_state=0)
# Sparsify the array a little bit
X[0, 0] = 0
X[2, 1] = 0
X[4, 3] = 0
X_csr = sp.csr_matrix(X)
with pytest.raises(ValueError):
mean_variance_axis(X_csr, axis=-3)
with pytest.raises(ValueError):
mean_variance_axis(X_csr, axis=2)
with pytest.raises(ValueError):
mean_variance_axis(X_csr, axis=-1)
with pytest.raises(ValueError):
incr_mean_variance_axis(X_csr, axis=-3, last_mean=None, last_var=None,
last_n=None)
with pytest.raises(ValueError):
incr_mean_variance_axis(X_csr, axis=2, last_mean=None, last_var=None,
last_n=None)
with pytest.raises(ValueError):
incr_mean_variance_axis(X_csr, axis=-1, last_mean=None, last_var=None,
last_n=None)
def test_densify_rows():
for dtype in (np.float32, np.float64):
X = sp.csr_matrix([[0, 3, 0],
[2, 4, 0],
[0, 0, 0],
[9, 8, 7],
[4, 0, 5]], dtype=dtype)
X_rows = np.array([0, 2, 3], dtype=np.intp)
out = np.ones((6, X.shape[1]), dtype=dtype)
out_rows = np.array([1, 3, 4], dtype=np.intp)
expect = np.ones_like(out)
expect[out_rows] = X[X_rows, :].toarray()
assign_rows_csr(X, X_rows, out_rows, out)
assert_array_equal(out, expect)
def test_inplace_column_scale():
rng = np.random.RandomState(0)
X = sp.rand(100, 200, 0.05)
Xr = X.tocsr()
Xc = X.tocsc()
XA = X.toarray()
scale = rng.rand(200)
XA *= scale
inplace_column_scale(Xc, scale)
inplace_column_scale(Xr, scale)
assert_array_almost_equal(Xr.toarray(), Xc.toarray())
assert_array_almost_equal(XA, Xc.toarray())
assert_array_almost_equal(XA, Xr.toarray())
with pytest.raises(TypeError):
inplace_column_scale(X.tolil(), scale)
X = X.astype(np.float32)
scale = scale.astype(np.float32)
Xr = X.tocsr()
Xc = X.tocsc()
XA = X.toarray()
XA *= scale
inplace_column_scale(Xc, scale)
inplace_column_scale(Xr, scale)
assert_array_almost_equal(Xr.toarray(), Xc.toarray())
assert_array_almost_equal(XA, Xc.toarray())
assert_array_almost_equal(XA, Xr.toarray())
with pytest.raises(TypeError):
inplace_column_scale(X.tolil(), scale)
def test_inplace_row_scale():
rng = np.random.RandomState(0)
X = sp.rand(100, 200, 0.05)
Xr = X.tocsr()
Xc = X.tocsc()
XA = X.toarray()
scale = rng.rand(100)
XA *= scale.reshape(-1, 1)
inplace_row_scale(Xc, scale)
inplace_row_scale(Xr, scale)
assert_array_almost_equal(Xr.toarray(), Xc.toarray())
assert_array_almost_equal(XA, Xc.toarray())
assert_array_almost_equal(XA, Xr.toarray())
    with pytest.raises(TypeError):
        inplace_row_scale(X.tolil(), scale)
X = X.astype(np.float32)
scale = scale.astype(np.float32)
Xr = X.tocsr()
Xc = X.tocsc()
XA = X.toarray()
XA *= scale.reshape(-1, 1)
inplace_row_scale(Xc, scale)
inplace_row_scale(Xr, scale)
assert_array_almost_equal(Xr.toarray(), Xc.toarray())
assert_array_almost_equal(XA, Xc.toarray())
assert_array_almost_equal(XA, Xr.toarray())
    with pytest.raises(TypeError):
        inplace_row_scale(X.tolil(), scale)
def test_inplace_swap_row():
X = np.array([[0, 3, 0],
[2, 4, 0],
[0, 0, 0],
[9, 8, 7],
[4, 0, 5]], dtype=np.float64)
X_csr = sp.csr_matrix(X)
X_csc = sp.csc_matrix(X)
swap = linalg.get_blas_funcs(('swap',), (X,))
swap = swap[0]
X[0], X[-1] = swap(X[0], X[-1])
inplace_swap_row(X_csr, 0, -1)
inplace_swap_row(X_csc, 0, -1)
assert_array_equal(X_csr.toarray(), X_csc.toarray())
assert_array_equal(X, X_csc.toarray())
assert_array_equal(X, X_csr.toarray())
X[2], X[3] = swap(X[2], X[3])
inplace_swap_row(X_csr, 2, 3)
inplace_swap_row(X_csc, 2, 3)
assert_array_equal(X_csr.toarray(), X_csc.toarray())
assert_array_equal(X, X_csc.toarray())
assert_array_equal(X, X_csr.toarray())
with pytest.raises(TypeError):
        inplace_swap_row(X_csr.tolil(), 0, -1)
X = np.array([[0, 3, 0],
[2, 4, 0],
[0, 0, 0],
[9, 8, 7],
[4, 0, 5]], dtype=np.float32)
X_csr = sp.csr_matrix(X)
X_csc = sp.csc_matrix(X)
swap = linalg.get_blas_funcs(('swap',), (X,))
swap = swap[0]
X[0], X[-1] = swap(X[0], X[-1])
inplace_swap_row(X_csr, 0, -1)
inplace_swap_row(X_csc, 0, -1)
assert_array_equal(X_csr.toarray(), X_csc.toarray())
assert_array_equal(X, X_csc.toarray())
assert_array_equal(X, X_csr.toarray())
X[2], X[3] = swap(X[2], X[3])
inplace_swap_row(X_csr, 2, 3)
inplace_swap_row(X_csc, 2, 3)
assert_array_equal(X_csr.toarray(), X_csc.toarray())
assert_array_equal(X, X_csc.toarray())
assert_array_equal(X, X_csr.toarray())
with pytest.raises(TypeError):
        inplace_swap_row(X_csr.tolil(), 0, -1)
def test_inplace_swap_column():
X = np.array([[0, 3, 0],
[2, 4, 0],
[0, 0, 0],
[9, 8, 7],
[4, 0, 5]], dtype=np.float64)
X_csr = sp.csr_matrix(X)
X_csc = sp.csc_matrix(X)
swap = linalg.get_blas_funcs(('swap',), (X,))
swap = swap[0]
X[:, 0], X[:, -1] = swap(X[:, 0], X[:, -1])
inplace_swap_column(X_csr, 0, -1)
inplace_swap_column(X_csc, 0, -1)
assert_array_equal(X_csr.toarray(), X_csc.toarray())
assert_array_equal(X, X_csc.toarray())
assert_array_equal(X, X_csr.toarray())
X[:, 0], X[:, 1] = swap(X[:, 0], X[:, 1])
inplace_swap_column(X_csr, 0, 1)
inplace_swap_column(X_csc, 0, 1)
assert_array_equal(X_csr.toarray(), X_csc.toarray())
assert_array_equal(X, X_csc.toarray())
assert_array_equal(X, X_csr.toarray())
with pytest.raises(TypeError):
        inplace_swap_column(X_csr.tolil(), 0, -1)
X = np.array([[0, 3, 0],
[2, 4, 0],
[0, 0, 0],
[9, 8, 7],
[4, 0, 5]], dtype=np.float32)
X_csr = sp.csr_matrix(X)
X_csc = sp.csc_matrix(X)
swap = linalg.get_blas_funcs(('swap',), (X,))
swap = swap[0]
X[:, 0], X[:, -1] = swap(X[:, 0], X[:, -1])
inplace_swap_column(X_csr, 0, -1)
inplace_swap_column(X_csc, 0, -1)
assert_array_equal(X_csr.toarray(), X_csc.toarray())
assert_array_equal(X, X_csc.toarray())
assert_array_equal(X, X_csr.toarray())
X[:, 0], X[:, 1] = swap(X[:, 0], X[:, 1])
inplace_swap_column(X_csr, 0, 1)
inplace_swap_column(X_csc, 0, 1)
assert_array_equal(X_csr.toarray(), X_csc.toarray())
assert_array_equal(X, X_csc.toarray())
assert_array_equal(X, X_csr.toarray())
with pytest.raises(TypeError):
        inplace_swap_column(X_csr.tolil(), 0, -1)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("axis", [0, 1, None])
@pytest.mark.parametrize("sparse_format", [sp.csr_matrix, sp.csc_matrix])
@pytest.mark.parametrize(
"missing_values, min_func, max_func, ignore_nan",
[(0, np.min, np.max, False),
(np.nan, np.nanmin, np.nanmax, True)]
)
@pytest.mark.parametrize("large_indices", [True, False])
def test_min_max(dtype, axis, sparse_format, missing_values, min_func,
max_func, ignore_nan, large_indices):
X = np.array([[0, 3, 0],
[2, -1, missing_values],
[0, 0, 0],
[9, missing_values, 7],
[4, 0, 5]], dtype=dtype)
X_sparse = sparse_format(X)
if large_indices:
X_sparse.indices = X_sparse.indices.astype('int64')
X_sparse.indptr = X_sparse.indptr.astype('int64')
mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis,
ignore_nan=ignore_nan)
assert_array_equal(mins_sparse, min_func(X, axis=axis))
assert_array_equal(maxs_sparse, max_func(X, axis=axis))
def test_min_max_axis_errors():
X = np.array([[0, 3, 0],
[2, -1, 0],
[0, 0, 0],
[9, 8, 7],
[4, 0, 5]], dtype=np.float64)
X_csr = sp.csr_matrix(X)
X_csc = sp.csc_matrix(X)
with pytest.raises(TypeError):
min_max_axis(X_csr.tolil(), axis=0)
with pytest.raises(ValueError):
min_max_axis(X_csr, axis=2)
with pytest.raises(ValueError):
min_max_axis(X_csc, axis=-3)
def test_count_nonzero():
X = np.array([[0, 3, 0],
[2, -1, 0],
[0, 0, 0],
[9, 8, 7],
[4, 0, 5]], dtype=np.float64)
X_csr = sp.csr_matrix(X)
X_csc = sp.csc_matrix(X)
X_nonzero = X != 0
sample_weight = [.5, .2, .3, .1, .1]
X_nonzero_weighted = X_nonzero * np.array(sample_weight)[:, None]
for axis in [0, 1, -1, -2, None]:
assert_array_almost_equal(count_nonzero(X_csr, axis=axis),
X_nonzero.sum(axis=axis))
assert_array_almost_equal(count_nonzero(X_csr, axis=axis,
sample_weight=sample_weight),
X_nonzero_weighted.sum(axis=axis))
with pytest.raises(TypeError):
count_nonzero(X_csc)
with pytest.raises(ValueError):
count_nonzero(X_csr, axis=2)
assert (count_nonzero(X_csr, axis=0).dtype ==
count_nonzero(X_csr, axis=1).dtype)
assert (count_nonzero(X_csr, axis=0, sample_weight=sample_weight).dtype ==
count_nonzero(X_csr, axis=1, sample_weight=sample_weight).dtype)
# Check dtypes with large sparse matrices too
# XXX: test fails on 32bit (Windows/Linux)
try:
X_csr.indices = X_csr.indices.astype(np.int64)
X_csr.indptr = X_csr.indptr.astype(np.int64)
assert (count_nonzero(X_csr, axis=0).dtype ==
count_nonzero(X_csr, axis=1).dtype)
assert (count_nonzero(X_csr, axis=0,
sample_weight=sample_weight).dtype ==
count_nonzero(X_csr, axis=1,
sample_weight=sample_weight).dtype)
except TypeError as e:
assert ("according to the rule 'safe'" in e.args[0]
and np.intp().nbytes < 8), e
def test_csc_row_median():
# Test csc_row_median actually calculates the median.
# Test that it gives the same output when X is dense.
rng = np.random.RandomState(0)
X = rng.rand(100, 50)
dense_median = np.median(X, axis=0)
csc = sp.csc_matrix(X)
sparse_median = csc_median_axis_0(csc)
assert_array_equal(sparse_median, dense_median)
# Test that it gives the same output when X is sparse
X = rng.rand(51, 100)
X[X < 0.7] = 0.0
ind = rng.randint(0, 50, 10)
X[ind] = -X[ind]
csc = sp.csc_matrix(X)
dense_median = np.median(X, axis=0)
sparse_median = csc_median_axis_0(csc)
assert_array_equal(sparse_median, dense_median)
# Test for toy data.
X = [[0, -2], [-1, -1], [1, 0], [2, 1]]
csc = sp.csc_matrix(X)
assert_array_equal(csc_median_axis_0(csc), np.array([0.5, -0.5]))
X = [[0, -2], [-1, -5], [1, -3]]
csc = sp.csc_matrix(X)
assert_array_equal(csc_median_axis_0(csc), np.array([0., -3]))
# Test that it raises an Error for non-csc matrices.
with pytest.raises(TypeError):
csc_median_axis_0(sp.csr_matrix(X))
def test_inplace_normalize():
ones = np.ones((10, 1))
rs = RandomState(10)
for inplace_csr_row_normalize in (inplace_csr_row_normalize_l1,
inplace_csr_row_normalize_l2):
for dtype in (np.float64, np.float32):
X = rs.randn(10, 5).astype(dtype)
X_csr = sp.csr_matrix(X)
for index_dtype in [np.int32, np.int64]:
# csr_matrix will use int32 indices by default,
# up-casting those to int64 when necessary
if index_dtype is np.int64:
X_csr.indptr = X_csr.indptr.astype(index_dtype)
X_csr.indices = X_csr.indices.astype(index_dtype)
assert X_csr.indices.dtype == index_dtype
assert X_csr.indptr.dtype == index_dtype
inplace_csr_row_normalize(X_csr)
assert X_csr.dtype == dtype
if inplace_csr_row_normalize is inplace_csr_row_normalize_l2:
X_csr.data **= 2
assert_array_almost_equal(np.abs(X_csr).sum(axis=1), ones)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_csr_row_norms(dtype):
    # checks that csr_row_norms returns the same output as
    # scipy.sparse.linalg.norm, and that the dtype is the same as X.dtype.
X = sp.random(100, 10, format='csr', dtype=dtype, random_state=42)
scipy_norms = sp.linalg.norm(X, axis=1)**2
norms = csr_row_norms(X)
assert norms.dtype == dtype
rtol = 1e-6 if dtype == np.float32 else 1e-7
assert_allclose(norms, scipy_norms, rtol=rtol)
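
# A minimal sketch (hypothetical `_demo_` helper): the squared L2 row norms
# can be read straight off the CSR buffers, which is the quantity
# csr_row_norms is compared against above.
def _demo_csr_row_norms(X_csr):
    out = np.zeros(X_csr.shape[0], dtype=X_csr.dtype)
    for i in range(X_csr.shape[0]):
        start, end = X_csr.indptr[i], X_csr.indptr[i + 1]
        out[i] = np.sum(X_csr.data[start:end] ** 2)
    return out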

View file

@@ -0,0 +1,696 @@
import warnings
import unittest
import sys
import os
import atexit
import numpy as np
from scipy import sparse
import pytest
from sklearn.utils.deprecation import deprecated
from sklearn.utils.metaestimators import if_delegate_has_method
from sklearn.utils._testing import (
assert_raises,
assert_less,
assert_greater,
assert_less_equal,
assert_greater_equal,
assert_warns,
assert_no_warnings,
assert_equal,
assert_not_equal,
assert_in,
assert_not_in,
set_random_state,
assert_raise_message,
ignore_warnings,
check_docstring_parameters,
assert_allclose_dense_sparse,
assert_raises_regex,
TempMemmap,
create_memmap_backed_data,
_delete_folder,
_convert_container)
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
@pytest.mark.filterwarnings("ignore",
category=FutureWarning) # 0.24
def test_assert_less():
assert 0 < 1
with pytest.raises(AssertionError):
assert_less(1, 0)
@pytest.mark.filterwarnings("ignore",
category=FutureWarning) # 0.24
def test_assert_greater():
assert 1 > 0
with pytest.raises(AssertionError):
assert_greater(0, 1)
@pytest.mark.filterwarnings("ignore",
category=FutureWarning) # 0.24
def test_assert_less_equal():
assert 0 <= 1
assert 1 <= 1
with pytest.raises(AssertionError):
assert_less_equal(1, 0)
@pytest.mark.filterwarnings("ignore",
category=FutureWarning) # 0.24
def test_assert_greater_equal():
assert 1 >= 0
assert 1 >= 1
with pytest.raises(AssertionError):
assert_greater_equal(0, 1)
def test_set_random_state():
lda = LinearDiscriminantAnalysis()
tree = DecisionTreeClassifier()
# Linear Discriminant Analysis doesn't have random state: smoke test
set_random_state(lda, 3)
set_random_state(tree, 3)
assert tree.random_state == 3
def test_assert_allclose_dense_sparse():
x = np.arange(9).reshape(3, 3)
msg = "Not equal to tolerance "
y = sparse.csc_matrix(x)
for X in [x, y]:
# basic compare
with pytest.raises(AssertionError, match=msg):
assert_allclose_dense_sparse(X, X*2)
assert_allclose_dense_sparse(X, X)
with pytest.raises(ValueError, match="Can only compare two sparse"):
assert_allclose_dense_sparse(x, y)
A = sparse.diags(np.ones(5), offsets=0).tocsr()
B = sparse.csr_matrix(np.ones((1, 5)))
with pytest.raises(AssertionError, match="Arrays are not equal"):
assert_allclose_dense_sparse(B, A)
def test_assert_raises_msg():
with assert_raises_regex(AssertionError, 'Hello world'):
with assert_raises(ValueError, msg='Hello world'):
pass
def test_assert_raise_message():
def _raise_ValueError(message):
raise ValueError(message)
def _no_raise():
pass
assert_raise_message(ValueError, "test",
_raise_ValueError, "test")
assert_raises(AssertionError,
assert_raise_message, ValueError, "something else",
_raise_ValueError, "test")
assert_raises(ValueError,
assert_raise_message, TypeError, "something else",
_raise_ValueError, "test")
assert_raises(AssertionError,
assert_raise_message, ValueError, "test",
_no_raise)
# multiple exceptions in a tuple
assert_raises(AssertionError,
assert_raise_message, (ValueError, AttributeError),
"test", _no_raise)
def test_ignore_warning():
    # This checks that the ignore_warnings decorator and context manager
    # work as expected
def _warning_function():
warnings.warn("deprecation warning", DeprecationWarning)
def _multiple_warning_function():
warnings.warn("deprecation warning", DeprecationWarning)
warnings.warn("deprecation warning")
# Check the function directly
assert_no_warnings(ignore_warnings(_warning_function))
assert_no_warnings(ignore_warnings(_warning_function,
category=DeprecationWarning))
assert_warns(DeprecationWarning, ignore_warnings(_warning_function,
category=UserWarning))
assert_warns(UserWarning,
ignore_warnings(_multiple_warning_function,
category=FutureWarning))
assert_warns(DeprecationWarning,
ignore_warnings(_multiple_warning_function,
category=UserWarning))
assert_no_warnings(ignore_warnings(_warning_function,
category=(DeprecationWarning,
UserWarning)))
# Check the decorator
@ignore_warnings
def decorator_no_warning():
_warning_function()
_multiple_warning_function()
@ignore_warnings(category=(DeprecationWarning, UserWarning))
def decorator_no_warning_multiple():
_multiple_warning_function()
@ignore_warnings(category=DeprecationWarning)
def decorator_no_deprecation_warning():
_warning_function()
@ignore_warnings(category=UserWarning)
def decorator_no_user_warning():
_warning_function()
@ignore_warnings(category=DeprecationWarning)
def decorator_no_deprecation_multiple_warning():
_multiple_warning_function()
@ignore_warnings(category=UserWarning)
def decorator_no_user_multiple_warning():
_multiple_warning_function()
assert_no_warnings(decorator_no_warning)
assert_no_warnings(decorator_no_warning_multiple)
assert_no_warnings(decorator_no_deprecation_warning)
assert_warns(DeprecationWarning, decorator_no_user_warning)
assert_warns(UserWarning, decorator_no_deprecation_multiple_warning)
assert_warns(DeprecationWarning, decorator_no_user_multiple_warning)
# Check the context manager
def context_manager_no_warning():
with ignore_warnings():
_warning_function()
def context_manager_no_warning_multiple():
with ignore_warnings(category=(DeprecationWarning, UserWarning)):
_multiple_warning_function()
def context_manager_no_deprecation_warning():
with ignore_warnings(category=DeprecationWarning):
_warning_function()
def context_manager_no_user_warning():
with ignore_warnings(category=UserWarning):
_warning_function()
def context_manager_no_deprecation_multiple_warning():
with ignore_warnings(category=DeprecationWarning):
_multiple_warning_function()
def context_manager_no_user_multiple_warning():
with ignore_warnings(category=UserWarning):
_multiple_warning_function()
assert_no_warnings(context_manager_no_warning)
assert_no_warnings(context_manager_no_warning_multiple)
assert_no_warnings(context_manager_no_deprecation_warning)
assert_warns(DeprecationWarning, context_manager_no_user_warning)
assert_warns(UserWarning, context_manager_no_deprecation_multiple_warning)
assert_warns(DeprecationWarning, context_manager_no_user_multiple_warning)
    # Check that passing a warning class as the first positional argument
    # raises a ValueError
warning_class = UserWarning
match = "'obj' should be a callable.+you should use 'category=UserWarning'"
with pytest.raises(ValueError, match=match):
silence_warnings_func = ignore_warnings(warning_class)(
_warning_function)
silence_warnings_func()
with pytest.raises(ValueError, match=match):
@ignore_warnings(warning_class)
def test():
pass
class TestWarns(unittest.TestCase):
def test_warn(self):
def f():
warnings.warn("yo")
return 3
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
filters_orig = warnings.filters[:]
assert assert_warns(UserWarning, f) == 3
# test that assert_warns doesn't have side effects on warnings
# filters
assert warnings.filters == filters_orig
with pytest.raises(AssertionError):
assert_no_warnings(f)
assert assert_no_warnings(lambda x: x, 1) == 1
def test_warn_wrong_warning(self):
def f():
warnings.warn("yo", FutureWarning)
failed = False
filters = sys.modules['warnings'].filters[:]
try:
try:
# Should raise an AssertionError
# assert_warns has a special handling of "FutureWarning" that
# pytest.warns does not have
assert_warns(UserWarning, f)
failed = True
except AssertionError:
pass
finally:
sys.modules['warnings'].filters = filters
if failed:
raise AssertionError("wrong warning caught by assert_warn")
# Tests for docstrings:
def f_ok(a, b):
"""Function f
Parameters
----------
a : int
Parameter a
b : float
Parameter b
Returns
-------
c : list
Parameter c
"""
c = a + b
return c
def f_bad_sections(a, b):
"""Function f
Parameters
----------
a : int
Parameter a
b : float
Parameter b
Results
-------
c : list
Parameter c
"""
c = a + b
return c
def f_bad_order(b, a):
"""Function f
Parameters
----------
a : int
Parameter a
b : float
Parameter b
Returns
-------
c : list
Parameter c
"""
c = a + b
return c
def f_too_many_param_docstring(a, b):
"""Function f
Parameters
----------
a : int
Parameter a
b : int
Parameter b
c : int
Parameter c
Returns
-------
d : list
Parameter c
"""
d = a + b
return d
def f_missing(a, b):
"""Function f
Parameters
----------
a : int
Parameter a
Returns
-------
c : list
Parameter c
"""
c = a + b
return c
def f_check_param_definition(a, b, c, d, e):
"""Function f
Parameters
----------
a: int
Parameter a
b:
Parameter b
c :
Parameter c
d:int
Parameter d
e
No typespec is allowed without colon
"""
return a + b + c + d
class Klass:
def f_missing(self, X, y):
pass
def f_bad_sections(self, X, y):
"""Function f
Parameter
----------
a : int
Parameter a
b : float
Parameter b
Results
-------
c : list
Parameter c
"""
pass
class MockEst:
def __init__(self):
"""MockEstimator"""
def fit(self, X, y):
return X
def predict(self, X):
return X
def predict_proba(self, X):
return X
def score(self, X):
return 1.
class MockMetaEstimator:
def __init__(self, delegate):
"""MetaEstimator to check if doctest on delegated methods work.
Parameters
---------
delegate : estimator
Delegated estimator.
"""
self.delegate = delegate
@if_delegate_has_method(delegate=('delegate'))
def predict(self, X):
"""This is available only if delegate has predict.
Parameters
----------
y : ndarray
Parameter y
"""
return self.delegate.predict(X)
@if_delegate_has_method(delegate=('delegate'))
@deprecated("Testing a deprecated delegated method")
def score(self, X):
"""This is available only if delegate has score.
Parameters
---------
y : ndarray
Parameter y
"""
@if_delegate_has_method(delegate=('delegate'))
def predict_proba(self, X):
"""This is available only if delegate has predict_proba.
Parameters
---------
X : ndarray
Parameter X
"""
return X
@deprecated('Testing deprecated function with wrong params')
def fit(self, X, y):
"""Incorrect docstring but should not be tested"""
def test_check_docstring_parameters():
pytest.importorskip('numpydoc',
reason="numpydoc is required to test the docstrings")
incorrect = check_docstring_parameters(f_ok)
assert incorrect == []
incorrect = check_docstring_parameters(f_ok, ignore=['b'])
assert incorrect == []
incorrect = check_docstring_parameters(f_missing, ignore=['b'])
assert incorrect == []
with pytest.raises(RuntimeError, match="Unknown section Results"):
check_docstring_parameters(f_bad_sections)
with pytest.raises(RuntimeError, match="Unknown section Parameter"):
check_docstring_parameters(Klass.f_bad_sections)
incorrect = check_docstring_parameters(f_check_param_definition)
assert (
incorrect == [
"sklearn.utils.tests.test_testing.f_check_param_definition There "
"was no space between the param name and colon ('a: int')",
"sklearn.utils.tests.test_testing.f_check_param_definition There "
"was no space between the param name and colon ('b:')",
"sklearn.utils.tests.test_testing.f_check_param_definition "
"Parameter 'c :' has an empty type spec. Remove the colon",
"sklearn.utils.tests.test_testing.f_check_param_definition There "
"was no space between the param name and colon ('d:int')",
])
messages = [
["In function: sklearn.utils.tests.test_testing.f_bad_order",
"There's a parameter name mismatch in function docstring w.r.t."
" function signature, at index 0 diff: 'b' != 'a'",
"Full diff:",
"- ['b', 'a']",
"+ ['a', 'b']"],
["In function: " +
"sklearn.utils.tests.test_testing.f_too_many_param_docstring",
"Parameters in function docstring have more items w.r.t. function"
" signature, first extra item: c",
"Full diff:",
"- ['a', 'b']",
"+ ['a', 'b', 'c']",
"? +++++"],
["In function: sklearn.utils.tests.test_testing.f_missing",
"Parameters in function docstring have less items w.r.t. function"
" signature, first missing item: b",
"Full diff:",
"- ['a', 'b']",
"+ ['a']"],
["In function: sklearn.utils.tests.test_testing.Klass.f_missing",
"Parameters in function docstring have less items w.r.t. function"
" signature, first missing item: X",
"Full diff:",
"- ['X', 'y']",
"+ []"],
["In function: " +
"sklearn.utils.tests.test_testing.MockMetaEstimator.predict",
"There's a parameter name mismatch in function docstring w.r.t."
" function signature, at index 0 diff: 'X' != 'y'",
"Full diff:",
"- ['X']",
"? ^",
"+ ['y']",
"? ^"],
["In function: " +
"sklearn.utils.tests.test_testing.MockMetaEstimator."
+ "predict_proba",
"Parameters in function docstring have less items w.r.t. function"
" signature, first missing item: X",
"Full diff:",
"- ['X']",
"+ []"],
["In function: " +
"sklearn.utils.tests.test_testing.MockMetaEstimator.score",
"Parameters in function docstring have less items w.r.t. function"
" signature, first missing item: X",
"Full diff:",
"- ['X']",
"+ []"],
["In function: " +
"sklearn.utils.tests.test_testing.MockMetaEstimator.fit",
"Parameters in function docstring have less items w.r.t. function"
" signature, first missing item: X",
"Full diff:",
"- ['X', 'y']",
"+ []"],
]
mock_meta = MockMetaEstimator(delegate=MockEst())
for msg, f in zip(messages,
[f_bad_order,
f_too_many_param_docstring,
f_missing,
Klass.f_missing,
mock_meta.predict,
mock_meta.predict_proba,
mock_meta.score,
mock_meta.fit]):
incorrect = check_docstring_parameters(f)
assert msg == incorrect, ('\n"%s"\n not in \n"%s"' % (msg, incorrect))
class RegistrationCounter:
def __init__(self):
self.nb_calls = 0
def __call__(self, to_register_func):
self.nb_calls += 1
assert to_register_func.func is _delete_folder
def check_memmap(input_array, mmap_data, mmap_mode='r'):
assert isinstance(mmap_data, np.memmap)
writeable = mmap_mode != 'r'
assert mmap_data.flags.writeable is writeable
np.testing.assert_array_equal(input_array, mmap_data)
def test_tempmemmap(monkeypatch):
registration_counter = RegistrationCounter()
monkeypatch.setattr(atexit, 'register', registration_counter)
input_array = np.ones(3)
with TempMemmap(input_array) as data:
check_memmap(input_array, data)
temp_folder = os.path.dirname(data.filename)
if os.name != 'nt':
assert not os.path.exists(temp_folder)
assert registration_counter.nb_calls == 1
mmap_mode = 'r+'
with TempMemmap(input_array, mmap_mode=mmap_mode) as data:
check_memmap(input_array, data, mmap_mode=mmap_mode)
temp_folder = os.path.dirname(data.filename)
if os.name != 'nt':
assert not os.path.exists(temp_folder)
assert registration_counter.nb_calls == 2
def test_create_memmap_backed_data(monkeypatch):
registration_counter = RegistrationCounter()
monkeypatch.setattr(atexit, 'register', registration_counter)
input_array = np.ones(3)
data = create_memmap_backed_data(input_array)
check_memmap(input_array, data)
assert registration_counter.nb_calls == 1
data, folder = create_memmap_backed_data(input_array,
return_folder=True)
check_memmap(input_array, data)
assert folder == os.path.dirname(data.filename)
assert registration_counter.nb_calls == 2
mmap_mode = 'r+'
data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode)
check_memmap(input_array, data, mmap_mode)
assert registration_counter.nb_calls == 3
input_list = [input_array, input_array + 1, input_array + 2]
mmap_data_list = create_memmap_backed_data(input_list)
for input_array, data in zip(input_list, mmap_data_list):
check_memmap(input_array, data)
assert registration_counter.nb_calls == 4
# 0.24
@pytest.mark.parametrize('callable, args', [
(assert_equal, (0, 0)),
(assert_not_equal, (0, 1)),
(assert_greater, (1, 0)),
(assert_greater_equal, (1, 0)),
(assert_less, (0, 1)),
(assert_less_equal, (0, 1)),
(assert_in, (0, [0])),
(assert_not_in, (0, [1]))])
def test_deprecated_helpers(callable, args):
msg = ('is deprecated in version 0.22 and will be removed in version '
'0.24. Please use "assert" instead')
with pytest.warns(FutureWarning, match=msg):
callable(*args)
@pytest.mark.parametrize(
"constructor_name, container_type",
[('list', list),
('tuple', tuple),
('array', np.ndarray),
('sparse', sparse.csr_matrix),
('dataframe', pytest.importorskip('pandas').DataFrame),
('series', pytest.importorskip('pandas').Series),
('index', pytest.importorskip('pandas').Index),
('slice', slice)]
)
def test_convert_container(constructor_name, container_type):
container = [0, 1]
assert isinstance(_convert_container(container, constructor_name),
container_type)
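
# A minimal sketch (hypothetical `_demo_` helper): _convert_container wraps
# plain Python data in the requested container type, which is how the
# parametrized tests above fabricate their inputs.
def _demo_convert_container():
    as_array = _convert_container([0, 1], 'array')      # np.ndarray
    as_sparse = _convert_container([[0, 1]], 'sparse')  # CSR matrix
    return as_array, as_sparse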

View file

@@ -0,0 +1,697 @@
from copy import copy
from itertools import chain
import warnings
import string
import timeit
import pytest
import numpy as np
import scipy.sparse as sp
from sklearn.utils._testing import (assert_array_equal,
assert_allclose_dense_sparse,
assert_warns_message,
assert_no_warnings,
_convert_container)
from sklearn.utils import check_random_state
from sklearn.utils import _determine_key_type
from sklearn.utils import deprecated
from sklearn.utils import gen_batches
from sklearn.utils import _get_column_indices
from sklearn.utils import resample
from sklearn.utils import safe_mask
from sklearn.utils import column_or_1d
from sklearn.utils import _safe_indexing
from sklearn.utils import shuffle
from sklearn.utils import gen_even_slices
from sklearn.utils import _message_with_time, _print_elapsed_time
from sklearn.utils import get_chunk_n_rows
from sklearn.utils import is_scalar_nan
from sklearn.utils import _to_object_array
from sklearn.utils._mocking import MockDataFrame
from sklearn import config_context
# toy array
X_toy = np.arange(9).reshape((3, 3))
def test_make_rng():
# Check the check_random_state utility function behavior
assert check_random_state(None) is np.random.mtrand._rand
assert check_random_state(np.random) is np.random.mtrand._rand
rng_42 = np.random.RandomState(42)
assert check_random_state(42).randint(100) == rng_42.randint(100)
rng_42 = np.random.RandomState(42)
assert check_random_state(rng_42) is rng_42
rng_42 = np.random.RandomState(42)
assert check_random_state(43).randint(100) != rng_42.randint(100)
with pytest.raises(ValueError):
check_random_state("some invalid seed")
def test_gen_batches():
    # Check gen_batches output and make sure it errors on invalid batch_size
assert_array_equal(
list(gen_batches(4, 2)),
[slice(0, 2, None), slice(2, 4, None)]
)
msg_zero = "gen_batches got batch_size=0, must be positive"
with pytest.raises(ValueError, match=msg_zero):
next(gen_batches(4, 0))
msg_float = "gen_batches got batch_size=0.5, must be an integer"
with pytest.raises(TypeError, match=msg_float):
next(gen_batches(4, 0.5))
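
# A minimal sketch (hypothetical `_demo_` helper): the last slice is
# shorter when batch_size does not divide n evenly.
def _demo_gen_batches():
    # -> [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
    return list(gen_batches(7, 3))
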
def test_deprecated():
# Test whether the deprecated decorator issues appropriate warnings
# Copied almost verbatim from https://docs.python.org/library/warnings.html
# First a function...
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
@deprecated()
def ham():
return "spam"
spam = ham()
assert spam == "spam" # function must remain usable
assert len(w) == 1
assert issubclass(w[0].category, FutureWarning)
assert "deprecated" in str(w[0].message).lower()
# ... then a class.
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
@deprecated("don't use this")
class Ham:
SPAM = 1
ham = Ham()
assert hasattr(ham, "SPAM")
assert len(w) == 1
assert issubclass(w[0].category, FutureWarning)
assert "deprecated" in str(w[0].message).lower()
def test_resample():
# Border case not worth mentioning in doctests
assert resample() is None
# Check that invalid arguments yield ValueError
with pytest.raises(ValueError):
resample([0], [0, 1])
with pytest.raises(ValueError):
resample([0, 1], [0, 1], replace=False, n_samples=3)
with pytest.raises(ValueError):
resample([0, 1], [0, 1], meaning_of_life=42)
    # Issue #6581: n_samples can exceed the input length when replace is
    # True (the default).
assert len(resample([1, 2], n_samples=5)) == 5
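
# A minimal sketch (hypothetical `_demo_` helper): arrays passed together
# are resampled with the same indices, so rows stay aligned.
def _demo_resample_alignment():
    X = np.arange(5).reshape(-1, 1)
    y = np.arange(5)
    X_r, y_r = resample(X, y, n_samples=3, random_state=0)
    assert_array_equal(X_r.ravel(), y_r)
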
def test_resample_stratified():
# Make sure resample can stratify
rng = np.random.RandomState(0)
n_samples = 100
p = .9
X = rng.normal(size=(n_samples, 1))
y = rng.binomial(1, p, size=n_samples)
_, y_not_stratified = resample(X, y, n_samples=10, random_state=0,
stratify=None)
assert np.all(y_not_stratified == 1)
_, y_stratified = resample(X, y, n_samples=10, random_state=0, stratify=y)
assert not np.all(y_stratified == 1)
    assert np.sum(y_stratified) == 9  # nine 1s, one 0
def test_resample_stratified_replace():
# Make sure stratified resampling supports the replace parameter
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 1))
y = rng.randint(0, 2, size=n_samples)
X_replace, _ = resample(X, y, replace=True, n_samples=50,
random_state=rng, stratify=y)
X_no_replace, _ = resample(X, y, replace=False, n_samples=50,
random_state=rng, stratify=y)
assert np.unique(X_replace).shape[0] < 50
assert np.unique(X_no_replace).shape[0] == 50
# make sure n_samples can be greater than X.shape[0] if we sample with
# replacement
X_replace, _ = resample(X, y, replace=True, n_samples=1000,
random_state=rng, stratify=y)
assert X_replace.shape[0] == 1000
assert np.unique(X_replace).shape[0] == 100
def test_resample_stratify_2dy():
# Make sure y can be 2d when stratifying
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 1))
y = rng.randint(0, 2, size=(n_samples, 2))
X, y = resample(X, y, n_samples=50, random_state=rng, stratify=y)
assert y.ndim == 2
def test_resample_stratify_sparse_error():
    # the stratify array passed to resample must be array-like, not sparse
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 2))
y = rng.randint(0, 2, size=n_samples)
stratify = sp.csr_matrix(y)
with pytest.raises(TypeError, match='A sparse matrix was passed'):
X, y = resample(X, y, n_samples=50, random_state=rng,
stratify=stratify)
def test_safe_mask():
random_state = check_random_state(0)
X = random_state.rand(5, 4)
X_csr = sp.csr_matrix(X)
mask = [False, False, True, True, True]
mask = safe_mask(X, mask)
assert X[mask].shape[0] == 3
mask = safe_mask(X_csr, mask)
assert X_csr[mask].shape[0] == 3
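
# A minimal sketch (hypothetical `_demo_` helper): for sparse input,
# safe_mask converts a boolean mask into integer indices, since scipy
# sparse matrices have historically not supported boolean-mask indexing.
def _demo_safe_mask_sparse():
    X_csr = sp.csr_matrix(np.eye(3))
    mask = safe_mask(X_csr, [True, False, True])
    return X_csr[mask]  # first and last rows, still sparse
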
def test_column_or_1d():
EXAMPLES = [
("binary", ["spam", "egg", "spam"]),
("binary", [0, 1, 0, 1]),
("continuous", np.arange(10) / 20.),
("multiclass", [1, 2, 3]),
("multiclass", [0, 1, 2, 2, 0]),
("multiclass", [[1], [2], [3]]),
("multilabel-indicator", [[0, 1, 0], [0, 0, 1]]),
("multiclass-multioutput", [[1, 2, 3]]),
("multiclass-multioutput", [[1, 1], [2, 2], [3, 1]]),
("multiclass-multioutput", [[5, 1], [4, 2], [3, 1]]),
("multiclass-multioutput", [[1, 2, 3]]),
("continuous-multioutput", np.arange(30).reshape((-1, 3))),
]
for y_type, y in EXAMPLES:
if y_type in ["binary", 'multiclass', "continuous"]:
assert_array_equal(column_or_1d(y), np.ravel(y))
else:
with pytest.raises(ValueError):
column_or_1d(y)
@pytest.mark.parametrize(
"key, dtype",
[(0, 'int'),
('0', 'str'),
(True, 'bool'),
(np.bool_(True), 'bool'),
([0, 1, 2], 'int'),
(['0', '1', '2'], 'str'),
((0, 1, 2), 'int'),
(('0', '1', '2'), 'str'),
(slice(None, None), None),
(slice(0, 2), 'int'),
(np.array([0, 1, 2], dtype=np.int32), 'int'),
(np.array([0, 1, 2], dtype=np.int64), 'int'),
(np.array([0, 1, 2], dtype=np.uint8), 'int'),
([True, False], 'bool'),
((True, False), 'bool'),
(np.array([True, False]), 'bool'),
('col_0', 'str'),
(['col_0', 'col_1', 'col_2'], 'str'),
(('col_0', 'col_1', 'col_2'), 'str'),
(slice('begin', 'end'), 'str'),
(np.array(['col_0', 'col_1', 'col_2']), 'str'),
(np.array(['col_0', 'col_1', 'col_2'], dtype=object), 'str')]
)
def test_determine_key_type(key, dtype):
assert _determine_key_type(key) == dtype
def test_determine_key_type_error():
with pytest.raises(ValueError, match="No valid specification of the"):
_determine_key_type(1.0)
def test_determine_key_type_slice_error():
with pytest.raises(TypeError, match="Only array-like or scalar are"):
_determine_key_type(slice(0, 2, 1), accept_slice=False)
@pytest.mark.parametrize(
"array_type", ["list", "array", "sparse", "dataframe"]
)
@pytest.mark.parametrize(
"indices_type", ["list", "tuple", "array", "series", "slice"]
)
def test_safe_indexing_2d_container_axis_0(array_type, indices_type):
indices = [1, 2]
if indices_type == 'slice' and isinstance(indices[1], int):
indices[1] += 1
array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=0)
assert_allclose_dense_sparse(
subset, _convert_container([[4, 5, 6], [7, 8, 9]], array_type)
)
@pytest.mark.parametrize("array_type", ["list", "array", "series"])
@pytest.mark.parametrize(
"indices_type", ["list", "tuple", "array", "series", "slice"]
)
def test_safe_indexing_1d_container(array_type, indices_type):
indices = [1, 2]
if indices_type == 'slice' and isinstance(indices[1], int):
indices[1] += 1
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=0)
assert_allclose_dense_sparse(
subset, _convert_container([2, 3], array_type)
)
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"])
@pytest.mark.parametrize(
"indices_type", ["list", "tuple", "array", "series", "slice"]
)
@pytest.mark.parametrize("indices", [[1, 2], ["col_1", "col_2"]])
def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
# validation of the indices
# we make a copy because indices is mutable and shared between tests
indices_converted = copy(indices)
if indices_type == 'slice' and isinstance(indices[1], int):
indices_converted[1] += 1
columns_name = ['col_0', 'col_1', 'col_2']
array = _convert_container(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
)
indices_converted = _convert_container(indices_converted, indices_type)
if isinstance(indices[0], str) and array_type != 'dataframe':
err_msg = ("Specifying the columns using strings is only supported "
"for pandas DataFrames")
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(array, indices_converted, axis=1)
else:
subset = _safe_indexing(array, indices_converted, axis=1)
assert_allclose_dense_sparse(
subset, _convert_container([[2, 3], [5, 6], [8, 9]], array_type)
)
@pytest.mark.parametrize("array_read_only", [True, False])
@pytest.mark.parametrize("indices_read_only", [True, False])
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"])
@pytest.mark.parametrize("indices_type", ["array", "series"])
@pytest.mark.parametrize(
"axis, expected_array",
[(0, [[4, 5, 6], [7, 8, 9]]), (1, [[2, 3], [5, 6], [8, 9]])]
)
def test_safe_indexing_2d_read_only_axis_1(array_read_only, indices_read_only,
array_type, indices_type, axis,
expected_array):
array = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
if array_read_only:
array.setflags(write=False)
array = _convert_container(array, array_type)
indices = np.array([1, 2])
if indices_read_only:
indices.setflags(write=False)
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=axis)
assert_allclose_dense_sparse(
subset, _convert_container(expected_array, array_type)
)
@pytest.mark.parametrize("array_type", ["list", "array", "series"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
def test_safe_indexing_1d_container_mask(array_type, indices_type):
indices = [False] + [True] * 2 + [False] * 6
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=0)
assert_allclose_dense_sparse(
subset, _convert_container([2, 3], array_type)
)
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
@pytest.mark.parametrize(
"axis, expected_subset",
[(0, [[4, 5, 6], [7, 8, 9]]),
(1, [[2, 3], [5, 6], [8, 9]])]
)
def test_safe_indexing_2d_mask(array_type, indices_type, axis,
expected_subset):
columns_name = ['col_0', 'col_1', 'col_2']
array = _convert_container(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
)
indices = [False, True, True]
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=axis)
assert_allclose_dense_sparse(
subset, _convert_container(expected_subset, array_type)
    )


@pytest.mark.parametrize(
"array_type, expected_output_type",
[("list", "list"), ("array", "array"),
("sparse", "sparse"), ("dataframe", "series")]
)
def test_safe_indexing_2d_scalar_axis_0(array_type, expected_output_type):
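    # a scalar index along axis=0 selects a single row; for a dataframe the
    # result comes back as a pandas Series, hence the output type mapping above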
array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
indices = 2
subset = _safe_indexing(array, indices, axis=0)
expected_array = _convert_container([7, 8, 9], expected_output_type)
    assert_allclose_dense_sparse(subset, expected_array)


@pytest.mark.parametrize("array_type", ["list", "array", "series"])
def test_safe_indexing_1d_scalar(array_type):
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
indices = 2
subset = _safe_indexing(array, indices, axis=0)
    assert subset == 3


@pytest.mark.parametrize(
"array_type, expected_output_type",
[("array", "array"), ("sparse", "sparse"), ("dataframe", "series")]
)
@pytest.mark.parametrize("indices", [2, "col_2"])
def test_safe_indexing_2d_scalar_axis_1(array_type, expected_output_type,
indices):
columns_name = ['col_0', 'col_1', 'col_2']
array = _convert_container(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
)
if isinstance(indices, str) and array_type != 'dataframe':
err_msg = ("Specifying the columns using strings is only supported "
"for pandas DataFrames")
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(array, indices, axis=1)
else:
subset = _safe_indexing(array, indices, axis=1)
expected_output = [3, 6, 9]
if expected_output_type == 'sparse':
            # sparse matrices keep the 2D shape
expected_output = [[3], [6], [9]]
expected_array = _convert_container(
expected_output, expected_output_type
)
    assert_allclose_dense_sparse(subset, expected_array)


@pytest.mark.parametrize("array_type", ["list", "array", "sparse"])
def test_safe_indexing_None_axis_0(array_type):
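    # indices=None is a no-op: the full container should be returned unchanged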
X = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
X_subset = _safe_indexing(X, None, axis=0)
    assert_allclose_dense_sparse(X_subset, X)


def test_safe_indexing_pandas_no_matching_cols_error():
pd = pytest.importorskip('pandas')
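    # a float key is not a valid column specification for a dataframe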
err_msg = "No valid specification of the columns."
X = pd.DataFrame(X_toy)
with pytest.raises(ValueError, match=err_msg):
        _safe_indexing(X, [1.0], axis=1)


@pytest.mark.parametrize("axis", [None, 3])
def test_safe_indexing_error_axis(axis):
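    # only axis=0 and axis=1 are supported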
with pytest.raises(ValueError, match="'axis' should be either 0"):
        _safe_indexing(X_toy, [0, 1], axis=axis)


@pytest.mark.parametrize("X_constructor", ['array', 'series'])
def test_safe_indexing_1d_array_error(X_constructor):
    # check that an error is raised when the array-like passed in is 1D and
    # we try to index along the 2nd dimension
X = list(range(5))
if X_constructor == 'array':
X_constructor = np.asarray(X)
elif X_constructor == 'series':
pd = pytest.importorskip("pandas")
X_constructor = pd.Series(X)
err_msg = "'X' should be a 2D NumPy array, 2D sparse matrix or pandas"
with pytest.raises(ValueError, match=err_msg):
        _safe_indexing(X_constructor, [0, 1], axis=1)


def test_safe_indexing_container_axis_0_unsupported_type():
indices = ["col_1", "col_2"]
array = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
err_msg = "String indexing is not supported with 'axis=0'"
with pytest.raises(ValueError, match=err_msg):
        _safe_indexing(array, indices, axis=0)


@pytest.mark.parametrize(
"key, err_msg",
[(10, r"all features must be in \[0, 2\]"),
('whatever', 'A given column is not a column of the dataframe')]
)
def test_get_column_indices_error(key, err_msg):
pd = pytest.importorskip("pandas")
X_df = pd.DataFrame(X_toy, columns=['col_0', 'col_1', 'col_2'])
with pytest.raises(ValueError, match=err_msg):
        _get_column_indices(X_df, key)


@pytest.mark.parametrize(
"key",
[['col1'], ['col2'], ['col1', 'col2'], ['col1', 'col3'], ['col2', 'col3']]
)
def test_get_column_indices_pandas_nonunique_columns_error(key):
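    # duplicated column names make string-based selection ambiguous, so any key
    # touching a duplicated label should raise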
pd = pytest.importorskip('pandas')
toy = np.zeros((1, 5), dtype=int)
columns = ['col1', 'col1', 'col2', 'col3', 'col2']
X = pd.DataFrame(toy, columns=columns)
err_msg = "Selected columns, {}, are not unique in dataframe".format(key)
with pytest.raises(ValueError) as exc_info:
_get_column_indices(X, key)
    assert str(exc_info.value) == err_msg


def test_shuffle_on_ndim_equals_three():
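    # shuffling a 3d array should permute along the first axis only, leaving
    # the set of 2d blocks unchanged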
def to_tuple(A): # to make the inner arrays hashable
return tuple(tuple(tuple(C) for C in B) for B in A)
A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) # A.shape = (2,2,2)
S = set(to_tuple(A))
shuffle(A) # shouldn't raise a ValueError for dim = 3
    assert set(to_tuple(A)) == S


def test_shuffle_dont_convert_to_array():
    # Check that shuffle does not try to convert to numpy arrays with float
    # dtypes and lets any indexable datastructure pass through.
a = ['a', 'b', 'c']
b = np.array(['a', 'b', 'c'], dtype=object)
c = [1, 2, 3]
d = MockDataFrame(np.array([['a', 0],
['b', 1],
['c', 2]],
dtype=object))
e = sp.csc_matrix(np.arange(6).reshape(3, 2))
a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0)
assert a_s == ['c', 'b', 'a']
assert type(a_s) == list
assert_array_equal(b_s, ['c', 'b', 'a'])
assert b_s.dtype == object
assert c_s == [3, 2, 1]
assert type(c_s) == list
assert_array_equal(d_s, np.array([['c', 2],
['b', 1],
['a', 0]],
dtype=object))
assert type(d_s) == MockDataFrame
    assert_array_equal(e_s.toarray(), np.array([[4, 5],
                                                [2, 3],
                                                [0, 1]]))


def test_gen_even_slices():
# check that gen_even_slices contains all samples
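    # e.g. gen_even_slices(10, 3) yields slice(0, 4), slice(4, 7), slice(7, 10)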
some_range = range(10)
    joined_range = list(chain(*[some_range[sl] for sl in
                                gen_even_slices(10, 3)]))
assert_array_equal(some_range, joined_range)
# check that passing negative n_chunks raises an error
slices = gen_even_slices(10, -1)
with pytest.raises(ValueError, match="gen_even_slices got n_packs=-1,"
" must be >=1"):
        next(slices)


@pytest.mark.parametrize(
('row_bytes', 'max_n_rows', 'working_memory', 'expected', 'warning'),
[(1024, None, 1, 1024, None),
(1024, None, 0.99999999, 1023, None),
(1023, None, 1, 1025, None),
(1025, None, 1, 1023, None),
(1024, None, 2, 2048, None),
(1024, 7, 1, 7, None),
(1024 * 1024, None, 1, 1, None),
(1024 * 1024 + 1, None, 1, 1,
'Could not adhere to working_memory config. '
'Currently 1MiB, 2MiB required.'),
])
def test_get_chunk_n_rows(row_bytes, max_n_rows, working_memory,
expected, warning):
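    # working_memory is expressed in MiB: the chunk size is
    # floor(working_memory * 2**20 / row_bytes), capped at max_n_rows, and
    # forced to 1 (with a warning) when even a single row exceeds the budget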
if warning is not None:
def check_warning(*args, **kw):
return assert_warns_message(UserWarning, warning, *args, **kw)
else:
check_warning = assert_no_warnings
actual = check_warning(get_chunk_n_rows,
row_bytes=row_bytes,
max_n_rows=max_n_rows,
working_memory=working_memory)
assert actual == expected
assert type(actual) is type(expected)
with config_context(working_memory=working_memory):
actual = check_warning(get_chunk_n_rows,
row_bytes=row_bytes,
max_n_rows=max_n_rows)
assert actual == expected
        assert type(actual) is type(expected)


@pytest.mark.parametrize(
['source', 'message', 'is_long'],
[
('ABC', string.ascii_lowercase, False),
('ABCDEF', string.ascii_lowercase, False),
('ABC', string.ascii_lowercase * 3, True),
('ABC' * 10, string.ascii_lowercase, True),
('ABC', string.ascii_lowercase + u'\u1048', False),
])
@pytest.mark.parametrize(
['time', 'time_str'],
[
(0.2, ' 0.2s'),
(20, ' 20.0s'),
(2000, '33.3min'),
(20000, '333.3min'),
])
def test_message_with_time(source, message, is_long, time, time_str):
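    # short messages are padded with dots to a fixed width of 70 characters;
    # longer ones are left unpadded and exceed it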
out = _message_with_time(source, message, time)
if is_long:
assert len(out) > 70
else:
assert len(out) == 70
assert out.startswith('[' + source + '] ')
out = out[len(source) + 3:]
assert out.endswith(time_str)
out = out[:-len(time_str)]
assert out.endswith(', total=')
out = out[:-len(', total=')]
assert out.endswith(message)
out = out[:-len(message)]
assert out.endswith(' ')
out = out[:-1]
if is_long:
assert not out
else:
        assert list(set(out)) == ['.']


@pytest.mark.parametrize(
['message', 'expected'],
[
('hello', _message_with_time('ABC', 'hello', 0.1) + '\n'),
('', _message_with_time('ABC', '', 0.1) + '\n'),
(None, ''),
])
def test_print_elapsed_time(message, expected, capsys, monkeypatch):
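    # pin the timer to 0 on entry and 0.1 on exit so the reported elapsed time
    # is a deterministic 0.1s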
monkeypatch.setattr(timeit, 'default_timer', lambda: 0)
with _print_elapsed_time('ABC', message):
monkeypatch.setattr(timeit, 'default_timer', lambda: 0.1)
    assert capsys.readouterr().out == expected


@pytest.mark.parametrize("value, result", [(float("nan"), True),
(np.nan, True),
(np.float32("nan"), True),
(np.float64("nan"), True),
(0, False),
(0., False),
(None, False),
("", False),
("nan", False),
([np.nan], False)])
def test_is_scalar_nan(value, result):
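    # only scalar float NaNs count: None, strings and containers holding NaN
    # are all expected to be False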
    assert is_scalar_nan(value) is result


def dummy_func():
    pass


def test_deprecation_joblib_api(tmpdir):
# Only parallel_backend and register_parallel_backend are not deprecated in
# sklearn.utils
from sklearn.utils import parallel_backend, register_parallel_backend
assert_no_warnings(parallel_backend, 'loky', None)
assert_no_warnings(register_parallel_backend, 'failing', None)
from sklearn.utils._joblib import joblib
    del joblib.parallel.BACKENDS['failing']


@pytest.mark.parametrize(
"sequence",
[[np.array(1), np.array(2)], [[1, 2], [3, 4]]]
)
def test_to_object_array(sequence):
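    # each element of the sequence should become one entry of a 1d object
    # array, even when the elements could be broadcast into a 2d array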
out = _to_object_array(sequence)
assert isinstance(out, np.ndarray)
assert out.dtype.kind == 'O'
assert out.ndim == 1
