3021 lines
113 KiB
Python
3021 lines
113 KiB
Python
import types
|
|
import warnings
|
|
import sys
|
|
import traceback
|
|
import pickle
|
|
import re
|
|
from copy import deepcopy
|
|
from functools import partial
|
|
from itertools import chain
|
|
from inspect import signature
|
|
|
|
import numpy as np
|
|
from scipy import sparse
|
|
from scipy.stats import rankdata
|
|
import joblib
|
|
|
|
from . import IS_PYPY
|
|
from .. import config_context
|
|
from ._testing import assert_raises, _get_args
|
|
from ._testing import assert_raises_regex
|
|
from ._testing import assert_raise_message
|
|
from ._testing import assert_array_equal
|
|
from ._testing import assert_array_almost_equal
|
|
from ._testing import assert_allclose
|
|
from ._testing import assert_allclose_dense_sparse
|
|
from ._testing import assert_warns_message
|
|
from ._testing import set_random_state
|
|
from ._testing import SkipTest
|
|
from ._testing import ignore_warnings
|
|
from ._testing import create_memmap_backed_data
|
|
from . import is_scalar_nan
|
|
from ..discriminant_analysis import LinearDiscriminantAnalysis
|
|
from ..linear_model import Ridge
|
|
|
|
from ..base import (clone, ClusterMixin, is_classifier, is_regressor,
|
|
RegressorMixin, is_outlier_detector, BaseEstimator)
|
|
|
|
from ..metrics import accuracy_score, adjusted_rand_score, f1_score
|
|
from ..random_projection import BaseRandomProjection
|
|
from ..feature_selection import SelectKBest
|
|
from ..pipeline import make_pipeline
|
|
from ..exceptions import DataConversionWarning
|
|
from ..exceptions import NotFittedError
|
|
from ..exceptions import SkipTestWarning
|
|
from ..model_selection import train_test_split
|
|
from ..model_selection import ShuffleSplit
|
|
from ..model_selection._validation import _safe_split
|
|
from ..metrics.pairwise import (rbf_kernel, linear_kernel, pairwise_distances)
|
|
|
|
from .import shuffle
|
|
from .import deprecated
|
|
from .validation import has_fit_parameter, _num_samples
|
|
from ..preprocessing import StandardScaler
|
|
from ..datasets import (load_iris, load_boston, make_blobs,
|
|
make_multilabel_classification, make_regression)
|
|
|
|
|
|
BOSTON = None
|
|
CROSS_DECOMPOSITION = ['PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD']
|
|
|
|
|
|
def _yield_checks(name, estimator):
    """Yield the checks that are not specific to one estimator type.

    Parameters
    ----------
    name : str
        Estimator name, compared against ``CROSS_DECOMPOSITION``.
    estimator : estimator object
        Instance whose tags and attributes select the optional checks.

    Yields
    ------
    check : callable
        A check function, or a ``functools.partial`` wrapping one.
    """
    tags = estimator._get_tags()

    # unconditional checks
    yield from (
        check_no_attributes_set_in_init,
        check_estimators_dtypes,
        check_fit_score_takes_y,
        check_sample_weights_pandas_series,
        check_sample_weights_not_an_array,
        check_sample_weights_list,
        check_sample_weights_shape,
        check_sample_weights_invariance,
        check_estimators_fit_returns_self,
        partial(check_estimators_fit_returns_self, readonly_memmap=True),
    )

    # Estimators should yield informative messages when trained on
    # unsupported or empty data; skipped when validation is disabled.
    if not tags["no_validation"]:
        yield from (
            check_complex_data,
            check_dtype_object,
            check_estimators_empty_data_messages,
        )

    # cross-decomposition's "transform" returns X and Y
    if name not in CROSS_DECOMPOSITION:
        yield check_pipeline_consistency

    # Estimators must check their input for NaN's and infs unless they
    # declare NaN support or do no validation at all.
    if not (tags["allow_nan"] or tags["no_validation"]):
        yield check_estimators_nan_inf

    # pairwise estimators must throw an error on non-square input
    if _is_pairwise(estimator):
        yield check_nonsquare_error

    yield check_estimators_overwrite_params

    if hasattr(estimator, 'sparsify'):
        yield check_sparsify_coefficients

    # The pickle check verifies that estimators can be pickled and, once
    # pickled, give the same answer as before.
    yield from (check_estimator_sparse_data, check_estimators_pickle)
|
|
|
|
|
|
def _yield_classifier_checks(name, classifier):
    """Yield the checks that apply to classifiers.

    Parameters
    ----------
    name : str
        Estimator name (not used to select checks here).
    classifier : estimator object
        Classifier instance; its tags and parameters select the optional
        checks.

    Yields
    ------
    check : callable
        A check function, or a ``functools.partial`` wrapping one.
    """
    tags = classifier._get_tags()

    yield from (
        # classifiers can handle non-array data and pandas objects
        check_classifier_data_not_an_array,
        # classifiers trained on a single label always return this label
        check_classifiers_one_label,
        check_classifiers_classes,
        check_estimators_partial_fit_n_features,
    )

    if tags["multioutput"]:
        yield check_classifier_multioutput

    # basic consistency testing
    yield from (
        check_classifiers_train,
        partial(check_classifiers_train, readonly_memmap=True),
        partial(check_classifiers_train, readonly_memmap=True,
                X_dtype='float32'),
        check_classifiers_regression_target,
    )

    if tags["multilabel"]:
        yield check_classifiers_multilabel_representation_invariance

    if not tags["no_validation"]:
        yield from (check_supervised_y_no_nan, check_supervised_y_2d)

    if tags["requires_fit"]:
        yield check_estimators_unfitted

    param_names = classifier.get_params().keys()
    if 'class_weight' in param_names:
        yield check_class_weight_classifiers

    yield from (
        check_non_transformer_estimators_n_iter,
        # predict_proba is a monotonic transformation of decision_function
        check_decision_proba_consistency,
    )
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_supervised_y_no_nan(name, estimator_orig):
    """Check that fitting on a non-finite target raises the standard error.

    The estimator is fitted on random ``X`` and a target filled with
    ``np.inf``; a ``ValueError`` with the exact expected message must be
    raised.

    Parameters
    ----------
    name : str
        Estimator name, used in the failure messages.
    estimator_orig : estimator object
        Estimator to check; a clone is fitted.
    """
    estimator = clone(estimator_orig)
    random_state = np.random.RandomState(888)
    X = random_state.randn(10, 5)
    y = _enforce_estimator_tags_y(estimator, np.full(10, np.inf))

    expected_message = ("Input contains NaN, infinity or a value too large "
                        "for dtype('float64').")
    try:
        estimator.fit(X, y)
    except ValueError as exc:
        if str(exc) == expected_message:
            # raised the right error with the right message
            return
        raise ValueError("Estimator {0} raised error as expected, but "
                         "does not match expected error message"
                         .format(name))

    # fit completed without complaining about the invalid target
    raise ValueError("Estimator {0} should have raised error on fitting "
                     "array y with NaN value.".format(name))
|
|
|
|
|
|
def _yield_regressor_checks(name, regressor):
    """Yield the checks that apply to regressors.

    Parameters
    ----------
    name : str
        Estimator name; ``'CCA'`` is excluded from the int-input check.
    regressor : estimator object
        Regressor instance whose tags select the optional checks.

    Yields
    ------
    check : callable
        A check function, or a ``functools.partial`` wrapping one.
    """
    tags = regressor._get_tags()
    # TODO: test with intercept
    # TODO: test with multiple responses

    # basic testing
    yield from (
        check_regressors_train,
        partial(check_regressors_train, readonly_memmap=True),
        partial(check_regressors_train, readonly_memmap=True,
                X_dtype='float32'),
        check_regressor_data_not_an_array,
        check_estimators_partial_fit_n_features,
    )

    if tags["multioutput"]:
        yield check_regressor_multioutput

    yield check_regressors_no_decision_function

    if not tags["no_validation"]:
        yield check_supervised_y_2d

    yield check_supervised_y_no_nan

    # check that the regressor handles int input
    if name != 'CCA':
        yield check_regressors_int

    if tags["requires_fit"]:
        yield check_estimators_unfitted

    yield check_non_transformer_estimators_n_iter
|
|
|
|
|
|
def _yield_transformer_checks(name, transformer):
    """Yield the checks that apply to objects exposing ``transform``.

    Parameters
    ----------
    name : str
        Estimator name; some names are excluded from the ``n_iter`` check.
    transformer : estimator object
        Transformer instance whose tags select the optional checks.

    Yields
    ------
    check : callable
        A check function, or a ``functools.partial`` wrapping one.
    """
    # Estimators for which accessing the iter param is non-trivial because
    # they depend on external solvers.
    external_solver = ['Isomap', 'KernelPCA', 'LocallyLinearEmbedding',
                       'RandomizedLasso', 'LogisticRegressionCV']

    # All transformers should either deal with sparse data or raise an
    # exception with type TypeError and an intelligible error message
    if not transformer._get_tags()["no_validation"]:
        yield check_transformer_data_not_an_array

    # these don't actually fit the data, so don't raise errors
    yield from (
        check_transformer_general,
        partial(check_transformer_general, readonly_memmap=True),
    )

    if not transformer._get_tags()["stateless"]:
        yield check_transformers_unfitted

    if name not in external_solver:
        yield check_transformer_n_iter
|
|
|
|
|
|
def _yield_clustering_checks(name, clusterer):
    """Yield the checks that apply to clusterers.

    Parameters
    ----------
    name : str
        Estimator name; feature-agglomeration estimators skip the
        sample-clustering checks.
    clusterer : estimator object
        Clusterer instance (not inspected here).

    Yields
    ------
    check : callable
        A check function, or a ``functools.partial`` wrapping one.
    """
    clusters_features = name in ('WardAgglomeration', "FeatureAgglomeration")

    yield check_clusterer_compute_labels_predict

    if not clusters_features:
        # Estimators clustering on the features are not tested here.
        yield from (
            check_clustering,
            partial(check_clustering, readonly_memmap=True),
            check_estimators_partial_fit_n_features,
        )

    yield check_non_transformer_estimators_n_iter
|
|
|
|
|
|
def _yield_outliers_checks(name, estimator):
    """Yield the checks that apply to outlier detectors.

    Parameters
    ----------
    name : str
        Estimator name (not used to select checks here).
    estimator : estimator object
        Outlier detector; the methods it exposes select the checks.

    Yields
    ------
    check : callable
        A check function, or a ``functools.partial`` wrapping one.
    """
    # checks for outlier detectors that have a fit_predict method
    if hasattr(estimator, 'fit_predict'):
        yield check_outliers_fit_predict

    # everything below needs an estimator usable on a test set
    if not hasattr(estimator, 'predict'):
        return

    yield from (
        check_outliers_train,
        partial(check_outliers_train, readonly_memmap=True),
        # outlier detectors can handle non-array data
        check_classifier_data_not_an_array,
    )

    # test if NotFittedError is raised
    if estimator._get_tags()["requires_fit"]:
        yield check_estimators_unfitted
|
|
|
|
|
|
def _yield_all_checks(name, estimator):
    """Yield every check applicable to ``estimator``.

    Nothing is yielded, and a ``SkipTestWarning`` is emitted, when the
    estimator does not accept ``"2darray"`` input or carries a truthy
    ``_skip_test`` tag.

    Parameters
    ----------
    name : str
        Estimator name, forwarded to the specialised generators.
    estimator : estimator object
        Instance whose type and tags select the checks.

    Yields
    ------
    check : callable
        A check function, or a ``functools.partial`` wrapping one.
    """
    tags = estimator._get_tags()

    if "2darray" not in tags["X_types"]:
        warnings.warn(
            "Can't test estimator {} which requires input  of type {}"
            .format(name, tags["X_types"]),
            SkipTestWarning)
        return

    if tags["_skip_test"]:
        warnings.warn(
            "Explicit SKIP via _skip_test tag for estimator {}.".format(name),
            SkipTestWarning)
        return

    yield from _yield_checks(name, estimator)

    if is_classifier(estimator):
        yield from _yield_classifier_checks(name, estimator)

    if is_regressor(estimator):
        yield from _yield_regressor_checks(name, estimator)

    if hasattr(estimator, 'transform'):
        yield from _yield_transformer_checks(name, estimator)

    if isinstance(estimator, ClusterMixin):
        yield from _yield_clustering_checks(name, estimator)

    if is_outlier_detector(estimator):
        yield from _yield_outliers_checks(name, estimator)

    yield from (
        check_fit2d_predict1d,
        check_methods_subset_invariance,
        check_fit2d_1sample,
        check_fit2d_1feature,
        check_fit1d,
        check_get_params_invariance,
        check_set_params,
        check_dict_unchanged,
        check_dont_overwrite_parameters,
        check_fit_idempotent,
    )

    if not tags["no_validation"]:
        yield check_n_features_in
        if tags["requires_y"]:
            yield check_requires_y_none

    if tags["requires_positive_X"]:
        yield check_fit_non_negative
|
|
|
|
|
|
def _set_check_estimator_ids(obj):
|
|
"""Create pytest ids for checks.
|
|
|
|
When `obj` is an estimator, this returns the pprint version of the
|
|
estimator (with `print_changed_only=True`). When `obj` is a function, the
|
|
name of the function is returned with its keyworld arguments.
|
|
|
|
`_set_check_estimator_ids` is designed to be used as the `id` in
|
|
`pytest.mark.parametrize` where `check_estimator(..., generate_only=True)`
|
|
is yielding estimators and checks.
|
|
|
|
Parameters
|
|
----------
|
|
obj : estimator or function
|
|
Items generated by `check_estimator`
|
|
|
|
Returns
|
|
-------
|
|
id : string or None
|
|
|
|
See also
|
|
--------
|
|
check_estimator
|
|
"""
|
|
if callable(obj):
|
|
if not isinstance(obj, partial):
|
|
return obj.__name__
|
|
|
|
if not obj.keywords:
|
|
return obj.func.__name__
|
|
|
|
kwstring = ",".join(["{}={}".format(k, v)
|
|
for k, v in obj.keywords.items()])
|
|
return "{}({})".format(obj.func.__name__, kwstring)
|
|
if hasattr(obj, "get_params"):
|
|
with config_context(print_changed_only=True):
|
|
return re.sub(r"\s", "", str(obj))
|
|
|
|
|
|
def _construct_instance(Estimator):
|
|
"""Construct Estimator instance if possible"""
|
|
required_parameters = getattr(Estimator, "_required_parameters", [])
|
|
if len(required_parameters):
|
|
if required_parameters in (["estimator"], ["base_estimator"]):
|
|
if issubclass(Estimator, RegressorMixin):
|
|
estimator = Estimator(Ridge())
|
|
else:
|
|
estimator = Estimator(LinearDiscriminantAnalysis())
|
|
else:
|
|
raise SkipTest("Can't instantiate estimator {} which requires "
|
|
"parameters {}".format(Estimator.__name__,
|
|
required_parameters))
|
|
else:
|
|
estimator = Estimator()
|
|
return estimator
|
|
|
|
|
|
# TODO: probably not needed anymore in 0.24 since _generate_class_checks should
|
|
# be removed too. Just put this in check_estimator()
|
|
def _generate_instance_checks(name, estimator):
    """Generate instance checks.

    Yields ``(estimator, check)`` pairs where each check from
    ``_yield_all_checks`` has ``name`` bound as its first argument.
    """
    for check in _yield_all_checks(name, estimator):
        yield estimator, partial(check, name)
|
|
|
|
|
|
# TODO: remove this in 0.24
|
|
def _generate_class_checks(Estimator):
    """Generate class checks.

    Yields the default-constructibility check paired with the class itself,
    then the instance checks for an instance built by ``_construct_instance``.
    """
    class_name = Estimator.__name__
    constructible_check = partial(check_parameters_default_constructible,
                                  class_name)
    yield Estimator, constructible_check

    instance = _construct_instance(Estimator)
    for pair in _generate_instance_checks(class_name, instance):
        yield pair
|
|
|
|
|
|
def _mark_xfail_checks(estimator, check, pytest):
    """Mark (estimator, check) pairs with xfail according to the
    ``_xfail_checks`` tag.

    Parameters
    ----------
    estimator : estimator object or class
        When a class is given, an instance is constructed from it first.
    check : callable
        The check paired with the estimator.
    pytest : module
        The imported ``pytest`` module, passed in by the caller.

    Returns
    -------
    pair : tuple or ``pytest.param``
        ``(estimator, check)`` unchanged when the check id is not listed in
        the tag, otherwise a ``pytest.param`` carrying an xfail mark.
    """
    if isinstance(estimator, type):
        # Try to construct an estimator instance; if that is not possible,
        # return the estimator class and ignore the tag.
        # TODO: remove this if block in 0.24 since passing classes isn't
        # supported anymore
        try:
            estimator = _construct_instance(estimator)
        except Exception:
            return estimator, check

    expected_failures = estimator._get_tags()['_xfail_checks'] or {}
    check_id = _set_check_estimator_ids(check)

    if check_id in expected_failures:
        # check is in the tag, mark it as xfail for pytest
        xfail_mark = pytest.mark.xfail(reason=expected_failures[check_id])
        return pytest.param(estimator, check, marks=xfail_mark)

    # check isn't part of the xfail_checks tag, just return it
    return estimator, check
|
|
|
|
|
|
def parametrize_with_checks(estimators):
    """Pytest specific decorator for parametrizing estimator checks.

    The `id` of each check is set to be a pprint version of the estimator
    and the name of the check with its keyword arguments.
    This allows to use `pytest -k` to specify which tests to run::

        pytest test_check_estimators.py -k check_estimators_fit_returns_self

    Parameters
    ----------
    estimators : list of estimators objects or classes
        Estimators to generate checks for.

        .. deprecated:: 0.23
           Passing a class is deprecated from version 0.23, and won't be
           supported in 0.24. Pass an instance instead.

    Returns
    -------
    decorator : `pytest.mark.parametrize`

    Examples
    --------
    >>> from sklearn.utils.estimator_checks import parametrize_with_checks
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.tree import DecisionTreeRegressor

    >>> @parametrize_with_checks([LogisticRegression(),
    ...                           DecisionTreeRegressor()])
    ... def test_sklearn_compatible_estimator(estimator, check):
    ...     check(estimator)

    """
    # imported lazily so that pytest is only needed when this is called
    import pytest

    if any(isinstance(est, type) for est in estimators):
        # TODO: remove class support in 0.24 and update docstrings
        # NOTE: the trailing space after "0.24." keeps the two sentences
        # apart once the adjacent literals are concatenated.
        msg = ("Passing a class is deprecated since version 0.23 "
               "and won't be supported in 0.24. "
               "Please pass an instance instead.")
        warnings.warn(msg, FutureWarning)

    # lazily chain the (estimator, check) pairs of every estimator
    checks_generator = chain.from_iterable(
        check_estimator(estimator, generate_only=True)
        for estimator in estimators)

    # wrap the pairs listed in the `_xfail_checks` tag in an xfail mark
    checks_with_marks = (
        _mark_xfail_checks(estimator, check, pytest)
        for estimator, check in checks_generator)

    return pytest.mark.parametrize("estimator, check", checks_with_marks,
                                   ids=_set_check_estimator_ids)
|
|
|
|
|
|
def check_estimator(Estimator, generate_only=False):
    """Check if estimator adheres to scikit-learn conventions.

    This function will run an extensive test-suite for input validation,
    shapes, etc, making sure that the estimator complies with `scikit-learn`
    conventions as detailed in :ref:`rolling_your_own_estimator`.
    Additional tests for classifiers, regressors, clustering or transformers
    will be run if the Estimator class inherits from the corresponding mixin
    from sklearn.base.

    This test can be applied to classes or instances.
    Classes currently have some additional tests that are related to
    construction, while passing instances allows the testing of multiple
    options. However, support for classes is deprecated since version 0.23
    and will be removed in version 0.24 (class checks will still be run on
    the instances).

    Setting `generate_only=True` returns a generator that yields (estimator,
    check) tuples where the check can be called independently from each
    other, i.e. `check(estimator)`. This allows all checks to be run
    independently and report the checks that are failing.

    scikit-learn provides a pytest specific decorator,
    :func:`~sklearn.utils.estimator_checks.parametrize_with_checks`, making
    it easier to test multiple estimators.

    Parameters
    ----------
    Estimator : estimator object or class
        Estimator to check. Estimator is a class object or instance.

        .. deprecated:: 0.23
           Passing a class is deprecated from version 0.23, and won't be
           supported in 0.24. Pass an instance instead.

    generate_only : bool, optional (default=False)
        When `False`, checks are evaluated when `check_estimator` is called.
        When `True`, `check_estimator` returns a generator that yields
        (estimator, check) tuples. The check is run by calling
        `check(estimator)`.

        .. versionadded:: 0.22

    Returns
    -------
    checks_generator : generator
        Generator that yields (estimator, check) tuples. Returned when
        `generate_only=True`.
    """
    # TODO: remove class support in 0.24 and update docstrings
    if isinstance(Estimator, type):
        # got a class
        # NOTE: the trailing space after "0.24." keeps the two sentences
        # apart once the adjacent literals are concatenated.
        msg = ("Passing a class is deprecated since version 0.23 "
               "and won't be supported in 0.24. "
               "Please pass an instance instead.")
        warnings.warn(msg, FutureWarning)

        checks_generator = _generate_class_checks(Estimator)
    else:
        # got an instance
        estimator = Estimator
        name = type(estimator).__name__
        checks_generator = _generate_instance_checks(name, estimator)

    if generate_only:
        return checks_generator

    for estimator, check in checks_generator:
        try:
            check(estimator)
        except SkipTest as exception:
            # the only SkipTest thrown currently results from not
            # being able to import pandas.
            warnings.warn(str(exception), SkipTestWarning)
|
|
|
|
|
|
def _boston_subset(n_samples=200):
    """Return a shuffled, standardised subset of the Boston dataset.

    The result is cached in the module-level ``BOSTON`` variable. The cache
    is only reused when it holds exactly ``n_samples`` rows, so a call with a
    different ``n_samples`` no longer returns a subset of the wrong size.

    Parameters
    ----------
    n_samples : int, default=200
        Number of leading rows kept after shuffling with ``random_state=0``.

    Returns
    -------
    X, y : tuple
        Features scaled with ``StandardScaler`` (fitted on the subset) and
        the matching targets.
    """
    global BOSTON
    # Recompute on a cold cache or when the cached subset has another size.
    if BOSTON is None or BOSTON[0].shape[0] != n_samples:
        X, y = load_boston(return_X_y=True)
        X, y = shuffle(X, y, random_state=0)
        X, y = X[:n_samples], y[:n_samples]
        X = StandardScaler().fit_transform(X)
        BOSTON = X, y
    return BOSTON
|
|
|
|
|
|
@deprecated("set_checking_parameters is deprecated in version "
            "0.22 and will be removed in version 0.24.")
def set_checking_parameters(estimator):
    # Deprecated public alias: forwards to the private implementation, which
    # adjusts ``estimator`` in place. Nothing is returned.
    _set_checking_parameters(estimator)
|
|
|
|
|
|
def _set_checking_parameters(estimator):
    """Adjust ``estimator`` in place for use in the checks.

    Parameters are changed through ``set_params`` or direct attribute
    assignment; nothing is returned.

    Parameters
    ----------
    estimator : estimator object
        Instance exposing ``get_params`` and ``set_params``.
    """
    # set parameters to speed up some estimators and
    # avoid deprecated behaviour
    params = estimator.get_params()
    name = estimator.__class__.__name__
    if ("n_iter" in params and name != "TSNE"):
        estimator.set_params(n_iter=5)
    if "max_iter" in params:
        if estimator.max_iter is not None:
            # cap at 5 but never raise a smaller user-supplied value
            estimator.set_params(max_iter=min(5, estimator.max_iter))
        # The name-specific values below run after the generic cap and
        # therefore override it.
        # LinearSVR, LinearSVC
        if estimator.__class__.__name__ in ['LinearSVR', 'LinearSVC']:
            estimator.set_params(max_iter=20)
        # NMF
        if estimator.__class__.__name__ == 'NMF':
            estimator.set_params(max_iter=100)
        # MLP
        if estimator.__class__.__name__ in ['MLPClassifier', 'MLPRegressor']:
            estimator.set_params(max_iter=100)
    if "n_resampling" in params:
        # randomized lasso
        estimator.set_params(n_resampling=5)
    if "n_estimators" in params:
        estimator.set_params(n_estimators=min(5, estimator.n_estimators))
    if "max_trials" in params:
        # RANSAC
        estimator.set_params(max_trials=10)
    if "n_init" in params:
        # K-Means
        estimator.set_params(n_init=2)

    if name == 'TruncatedSVD':
        # TruncatedSVD doesn't run with n_components = n_features
        # This is ugly :-/
        estimator.n_components = 1

    if hasattr(estimator, "n_clusters"):
        # at most two clusters
        estimator.n_clusters = min(estimator.n_clusters, 2)

    if hasattr(estimator, "n_best"):
        estimator.n_best = 1

    if name == "SelectFdr":
        # be tolerant of noisy datasets (not actually speed)
        estimator.set_params(alpha=.5)

    if name == "TheilSenRegressor":
        estimator.max_subpopulation = 100

    if isinstance(estimator, BaseRandomProjection):
        # Due to the jl lemma and often very few samples, the number
        # of components of the random matrix projection will be probably
        # greater than the number of features.
        # So we impose a smaller number (avoid "auto" mode)
        estimator.set_params(n_components=2)

    if isinstance(estimator, SelectKBest):
        # SelectKBest has a default of k=10
        # which is more feature than we have in most case.
        estimator.set_params(k=1)

    if name in ('HistGradientBoostingClassifier',
                'HistGradientBoostingRegressor'):
        # The default min_samples_leaf (20) isn't appropriate for small
        # datasets (only very shallow trees are built) that the checks use.
        estimator.set_params(min_samples_leaf=5)

    # Speed-up by reducing the number of CV or splits for CV estimators
    # (estimators named in ``loo_cv`` keep their own ``cv`` setting)
    loo_cv = ['RidgeCV']
    if name not in loo_cv and hasattr(estimator, 'cv'):
        estimator.set_params(cv=3)
    if hasattr(estimator, 'n_splits'):
        estimator.set_params(n_splits=3)

    if name == 'OneHotEncoder':
        estimator.set_params(handle_unknown='ignore')
|
|
|
|
|
|
class _NotAnArray:
|
|
"""An object that is convertible to an array
|
|
|
|
Parameters
|
|
----------
|
|
data : array_like
|
|
The data.
|
|
"""
|
|
|
|
def __init__(self, data):
|
|
self.data = np.asarray(data)
|
|
|
|
def __array__(self, dtype=None):
|
|
return self.data
|
|
|
|
def __array_function__(self, func, types, args, kwargs):
|
|
if func.__name__ == "may_share_memory":
|
|
return True
|
|
raise TypeError("Don't want to call array_function {}!".format(
|
|
func.__name__))
|
|
|
|
|
|
@deprecated("NotAnArray is deprecated in version "
            "0.22 and will be removed in version 0.24.")
class NotAnArray(_NotAnArray):
    # Deprecated public name for _NotAnArray; it adds no behavior of its own.
    # TODO: remove in 0.24
    pass
|
|
|
|
|
|
def _is_pairwise(estimator):
|
|
"""Returns True if estimator has a _pairwise attribute set to True.
|
|
|
|
Parameters
|
|
----------
|
|
estimator : object
|
|
Estimator object to test.
|
|
|
|
Returns
|
|
-------
|
|
out : bool
|
|
True if _pairwise is set to True and False otherwise.
|
|
"""
|
|
return bool(getattr(estimator, "_pairwise", False))
|
|
|
|
|
|
def _is_pairwise_metric(estimator):
|
|
"""Returns True if estimator accepts pairwise metric.
|
|
|
|
Parameters
|
|
----------
|
|
estimator : object
|
|
Estimator object to test.
|
|
|
|
Returns
|
|
-------
|
|
out : bool
|
|
True if _pairwise is set to True and False otherwise.
|
|
"""
|
|
metric = getattr(estimator, "metric", None)
|
|
|
|
return bool(metric == 'precomputed')
|
|
|
|
|
|
@deprecated("pairwise_estimator_convert_X is deprecated in version "
            "0.22 and will be removed in version 0.24.")
def pairwise_estimator_convert_X(X, estimator, kernel=linear_kernel):
    # Deprecated public alias: forwards all arguments to the private
    # implementation and returns its result.
    return _pairwise_estimator_convert_X(X, estimator, kernel)
|
|
|
|
|
|
def _pairwise_estimator_convert_X(X, estimator, kernel=linear_kernel):
    """Convert ``X`` into the pairwise form ``estimator`` expects, if any.

    Parameters
    ----------
    X : array-like
        Input data.
    estimator : object
        Estimator that will consume the data.
    kernel : callable, default=linear_kernel
        Called as ``kernel(X, X)`` for estimators with a truthy ``_pairwise``
        attribute.

    Returns
    -------
    X_converted
        Euclidean pairwise distances when the estimator's ``metric`` is
        ``'precomputed'``; otherwise ``kernel(X, X)`` for pairwise
        estimators; otherwise ``X`` itself.
    """
    # the precomputed-metric case takes precedence over the kernel case
    if _is_pairwise_metric(estimator):
        return pairwise_distances(X, metric='euclidean')
    return kernel(X, X) if _is_pairwise(estimator) else X
|
|
|
|
|
|
def _generate_sparse_matrix(X_csr):
|
|
"""Generate sparse matrices with {32,64}bit indices of diverse format
|
|
|
|
Parameters
|
|
----------
|
|
X_csr: CSR Matrix
|
|
Input matrix in CSR format
|
|
|
|
Returns
|
|
-------
|
|
out: iter(Matrices)
|
|
In format['dok', 'lil', 'dia', 'bsr', 'csr', 'csc', 'coo',
|
|
'coo_64', 'csc_64', 'csr_64']
|
|
"""
|
|
|
|
assert X_csr.format == 'csr'
|
|
yield 'csr', X_csr.copy()
|
|
for sparse_format in ['dok', 'lil', 'dia', 'bsr', 'csc', 'coo']:
|
|
yield sparse_format, X_csr.asformat(sparse_format)
|
|
|
|
# Generate large indices matrix only if its supported by scipy
|
|
X_coo = X_csr.asformat('coo')
|
|
X_coo.row = X_coo.row.astype('int64')
|
|
X_coo.col = X_coo.col.astype('int64')
|
|
yield "coo_64", X_coo
|
|
|
|
for sparse_format in ['csc', 'csr']:
|
|
X = X_csr.asformat(sparse_format)
|
|
X.indices = X.indices.astype('int64')
|
|
X.indptr = X.indptr.astype('int64')
|
|
yield sparse_format + "_64", X
|
|
|
|
|
|
def check_estimator_sparse_data(name, estimator_orig):
    """Check fitting and predicting on sparse input of several formats.

    For every matrix produced by ``_generate_sparse_matrix`` a fresh clone is
    fitted and, when available, ``predict`` and ``predict_proba`` output
    shapes are asserted. A ``TypeError``/``ValueError`` whose repr mentions
    "sparse" is accepted as a graceful refusal; other errors are re-raised.

    Parameters
    ----------
    name : str
        Estimator name, used in messages and to special-case scalers.
    estimator_orig : estimator object
        Estimator to check; only clones of it are fitted.
    """
    rng = np.random.RandomState(0)
    X = rng.rand(40, 10)
    # zero out every entry below 0.8 so the matrix is mostly zeros
    X[X < .8] = 0
    X = _pairwise_estimator_convert_X(X, estimator_orig)
    X_csr = sparse.csr_matrix(X)
    # integer targets in [0, 4)
    y = (4 * rng.rand(40)).astype(int)
    # catch deprecation warnings
    with ignore_warnings(category=FutureWarning):
        estimator = clone(estimator_orig)
    # NOTE(review): presumably adapts y to the estimator's tags -- the helper
    # is defined elsewhere in the file; confirm there.
    y = _enforce_estimator_tags_y(estimator, y)
    tags = estimator_orig._get_tags()
    for matrix_format, X in _generate_sparse_matrix(X_csr):
        # catch deprecation warnings
        with ignore_warnings(category=FutureWarning):
            estimator = clone(estimator_orig)
            if name in ['Scaler', 'StandardScaler']:
                estimator.set_params(with_mean=False)
        # fit and predict
        try:
            with ignore_warnings(category=FutureWarning):
                estimator.fit(X, y)
            if hasattr(estimator, "predict"):
                pred = estimator.predict(X)
                if tags['multioutput_only']:
                    assert pred.shape == (X.shape[0], 1)
                else:
                    assert pred.shape == (X.shape[0],)
            if hasattr(estimator, 'predict_proba'):
                probs = estimator.predict_proba(X)
                # one column per class: 2 for binary-only estimators,
                # otherwise 4 columns are expected
                if tags['binary_only']:
                    expected_probs_shape = (X.shape[0], 2)
                else:
                    expected_probs_shape = (X.shape[0], 4)
                assert probs.shape == expected_probs_shape
        except (TypeError, ValueError) as e:
            # an error mentioning "sparse" is an accepted, explicit refusal
            if 'sparse' not in repr(e).lower():
                if "64" in matrix_format:
                    msg = ("Estimator %s doesn't seem to support %s matrix, "
                           "and is not failing gracefully, e.g. by using "
                           "check_array(X, accept_large_sparse=False)")
                    raise AssertionError(msg % (name, matrix_format))
                else:
                    print("Estimator %s doesn't seem to fail gracefully on "
                          "sparse data: error message state explicitly that "
                          "sparse input is not supported if this is not"
                          " the case." % name)
                    raise
        except Exception:
            # any other exception type: print a hint, then re-raise
            print("Estimator %s doesn't seem to fail gracefully on "
                  "sparse data: it should raise a TypeError if sparse input "
                  "is explicitly not supported." % name)
            raise
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_sample_weights_pandas_series(name, estimator_orig):
    """Check that ``fit`` accepts ``sample_weight`` as a ``pandas.Series``.

    Estimators whose ``fit`` has no ``sample_weight`` parameter are not
    checked.

    Parameters
    ----------
    name : str
        Estimator name, used in the failure message.
    estimator_orig : estimator object
        Estimator to check; a clone is fitted.

    Raises
    ------
    SkipTest
        If pandas cannot be imported.
    ValueError
        If fitting with a ``pandas.Series`` of weights raises ``ValueError``.
    """
    estimator = clone(estimator_orig)
    if not has_fit_parameter(estimator, "sample_weight"):
        return

    # Guard only the import itself: an ImportError raised later, e.g. from
    # inside ``fit``, must not be reported as "pandas is not installed".
    try:
        import pandas as pd
    except ImportError:
        raise SkipTest("pandas is not installed: not testing for "
                       "input of type pandas.Series to class weight.")

    X = np.array([[1, 1], [1, 2], [1, 3], [1, 4],
                  [2, 1], [2, 2], [2, 3], [2, 4],
                  [3, 1], [3, 2], [3, 3], [3, 4]])
    X = pd.DataFrame(_pairwise_estimator_convert_X(X, estimator_orig))
    y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2])
    weights = pd.Series([1] * 12)
    if estimator._get_tags()["multioutput_only"]:
        # multioutput-only estimators get a 2-D target
        y = pd.DataFrame(y)
    try:
        estimator.fit(X, y, sample_weight=weights)
    except ValueError:
        raise ValueError("Estimator {0} raises error if "
                         "'sample_weight' parameter is of "
                         "type pandas.Series".format(name))
|
|
|
|
|
|
@ignore_warnings(category=(FutureWarning))
def check_sample_weights_not_an_array(name, estimator_orig):
    """Check that ``fit`` accepts ``_NotAnArray`` data and sample weights.

    Estimators whose ``fit`` has no ``sample_weight`` parameter are not
    checked.

    Parameters
    ----------
    name : str
        Estimator name (unused).
    estimator_orig : estimator object
        Estimator to check; a clone is fitted.
    """
    estimator = clone(estimator_orig)
    if has_fit_parameter(estimator, "sample_weight"):
        X = np.array([[1, 1], [1, 2], [1, 3], [1, 4],
                      [2, 1], [2, 2], [2, 3], [2, 4],
                      [3, 1], [3, 2], [3, 3], [3, 4]])
        # Use the private helper like the other checks do; the public
        # ``pairwise_estimator_convert_X`` is deprecated.
        X = _NotAnArray(_pairwise_estimator_convert_X(X, estimator_orig))
        y = _NotAnArray([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2])
        weights = _NotAnArray([1] * 12)
        if estimator._get_tags()["multioutput_only"]:
            # multioutput-only estimators get a 2-D target
            y = _NotAnArray(y.data.reshape(-1, 1))
        estimator.fit(X, y, sample_weight=weights)
|
|
|
|
|
|
@ignore_warnings(category=(FutureWarning))
def check_sample_weights_list(name, estimator_orig):
    """Check that ``fit`` accepts ``sample_weight`` given as a plain list.

    Estimators whose ``fit`` has no ``sample_weight`` parameter are not
    checked.

    Parameters
    ----------
    name : str
        Estimator name (unused).
    estimator_orig : estimator object
        Estimator to check; a clone is fitted.
    """
    if not has_fit_parameter(estimator_orig, "sample_weight"):
        return

    estimator = clone(estimator_orig)
    random_state = np.random.RandomState(0)
    n_samples = 30
    features = random_state.uniform(size=(n_samples, 3))
    X = _pairwise_estimator_convert_X(features, estimator_orig)
    y = _enforce_estimator_tags_y(estimator, np.arange(n_samples) % 3)
    weights_as_list = [3] * n_samples

    # Test that estimators don't raise any exception
    estimator.fit(X, y, sample_weight=weights_as_list)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_sample_weights_shape(name, estimator_orig):
    """Check that a ``sample_weight`` of the wrong shape raises an error.

    Fitting with one weight per sample must succeed; twice as many weights,
    or a two-column weight array, must raise ``ValueError``. Estimators
    without a ``sample_weight`` fit parameter and pairwise estimators are
    not checked.

    Parameters
    ----------
    name : str
        Estimator name (unused).
    estimator_orig : estimator object
        Estimator to check; a clone is fitted.
    """
    if not has_fit_parameter(estimator_orig, "sample_weight"):
        return
    if getattr(estimator_orig, "_pairwise", False):
        return

    estimator = clone(estimator_orig)
    X = np.array([
        [1, 3], [1, 3], [1, 3], [1, 3],
        [2, 1], [2, 1], [2, 1], [2, 1],
        [3, 3], [3, 3], [3, 3], [3, 3],
        [4, 1], [4, 1], [4, 1], [4, 1],
    ])
    y = np.array([1, 1, 1, 1, 2, 2, 2, 2,
                  1, 1, 1, 1, 2, 2, 2, 2])
    y = _enforce_estimator_tags_y(estimator, y)
    n_targets = len(y)

    # correctly shaped weights are accepted
    estimator.fit(X, y, sample_weight=np.ones(n_targets))

    # too many weights, then weights with an extra dimension
    mismatched_weights = (np.ones(2*n_targets), np.ones((n_targets, 2)))
    for bad_weights in mismatched_weights:
        assert_raises(ValueError, estimator.fit, X, y,
                      sample_weight=bad_weights)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_sample_weights_invariance(name, estimator_orig):
    """Check that unit sample weights give the same results as no weights.

    Two clones with the same random state are fitted, one with
    ``sample_weight`` of ones and one with ``sample_weight=None``; their
    ``predict`` and ``transform`` outputs (when available) must be close.
    Estimators without a ``sample_weight`` fit parameter and pairwise
    estimators are not checked.

    Parameters
    ----------
    name : str
        Estimator name, used in the failure message.
    estimator_orig : estimator object
        Estimator to check; only clones of it are fitted.
    """
    if not has_fit_parameter(estimator_orig, "sample_weight"):
        return
    if getattr(estimator_orig, "_pairwise", False):
        # We skip pairwise because the data is not pairwise
        return

    weighted = clone(estimator_orig)
    unweighted = clone(estimator_orig)
    set_random_state(weighted, random_state=0)
    set_random_state(unweighted, random_state=0)

    X = np.array([
        [1, 3], [1, 3], [1, 3], [1, 3],
        [2, 1], [2, 1], [2, 1], [2, 1],
        [3, 3], [3, 3], [3, 3], [3, 3],
        [4, 1], [4, 1], [4, 1], [4, 1],
    ], dtype=np.dtype('float'))
    y = np.array([1, 1, 1, 1, 2, 2, 2, 2,
                  1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype('int'))
    y = _enforce_estimator_tags_y(weighted, y)

    weighted.fit(X, y=y, sample_weight=np.ones(shape=len(y)))
    unweighted.fit(X, y=y, sample_weight=None)

    failure_message = ("For %s sample_weight=None is not"
                       " equivalent to sample_weight=ones" % name)
    for method in ("predict", "transform"):
        if not hasattr(estimator_orig, method):
            continue
        out_weighted = getattr(weighted, method)(X)
        out_unweighted = getattr(unweighted, method)(X)
        if sparse.issparse(out_weighted):
            # densify both outputs before comparing
            out_weighted = out_weighted.toarray()
            out_unweighted = out_unweighted.toarray()
        assert_allclose(out_weighted, out_unweighted,
                        err_msg=failure_message)
|
|
|
|
|
|
@ignore_warnings(category=(FutureWarning, UserWarning))
def check_dtype_object(name, estimator_orig):
    """Check that estimators treat dtype object as numeric if possible.

    The estimator is fitted on numeric data stored in an object array, then
    on an object-dtype target. Unless it declares ``'string'`` among its
    ``X_types``, fitting on data containing a dict must raise ``TypeError``.

    Parameters
    ----------
    name : str
        Estimator name (unused).
    estimator_orig : estimator object
        Estimator to check; a clone is fitted.
    """
    random_state = np.random.RandomState(0)
    X = _pairwise_estimator_convert_X(random_state.rand(40, 10),
                                      estimator_orig)
    X = X.astype(object)
    tags = estimator_orig._get_tags()
    y = (X[:, 0] * 4).astype(int)
    estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, y)

    estimator.fit(X, y)
    for method in ("predict", "transform"):
        if hasattr(estimator, method):
            getattr(estimator, method)(X)

    # an object-dtype target may only be rejected as an unknown label type
    try:
        estimator.fit(X, y.astype(object))
    except Exception as exc:
        if "Unknown label type" not in str(exc):
            raise

    if 'string' in tags['X_types']:
        # Estimators supporting string will not call np.asarray to convert the
        # data to numeric and therefore, the error will not be raised.
        # Checking for each element dtype in the input array will be costly.
        # Refer to #11401 for full discussion.
        estimator.fit(X, y)
    else:
        X[0, 0] = {'foo': 'bar'}
        expected_pattern = "argument must be a string.* number"
        assert_raises_regex(TypeError, expected_pattern, estimator.fit, X, y)
|
|
|
|
|
|
def check_complex_data(name, estimator_orig):
    """Check that estimators raise an exception on providing complex data.

    ``fit`` is called with complex-valued ``X`` and ``y`` and must raise a
    ``ValueError`` whose message matches "Complex data not supported".
    """
    # Use a seeded generator (like the other checks in this module) rather
    # than the global numpy random state, so the check is reproducible.
    rng = np.random.RandomState(42)
    X = rng.uniform(size=10) + 1j * rng.uniform(size=10)
    X = X.reshape(-1, 1)
    y = rng.uniform(size=10) + 1j * rng.uniform(size=10)
    estimator = clone(estimator_orig)
    assert_raises_regex(ValueError, "Complex data not supported",
                        estimator.fit, X, y)
|
|
|
|
|
|
@ignore_warnings
def check_dict_unchanged(name, estimator_orig):
    """Check that prediction-time methods do not modify ``__dict__``.

    After fitting, ``predict``, ``transform``, ``decision_function`` and
    ``predict_proba`` (when present) are each called once and the
    estimator's ``__dict__`` must compare equal to a copy taken just before
    the call.
    """
    # this estimator raises
    # ValueError: Found array with 0 feature(s) (shape=(23, 0))
    # while a minimum of 1 is required.
    # error
    if name in ['SpectralCoclustering']:
        return
    rnd = np.random.RandomState(0)
    if name in ['RANSACRegressor']:
        X = 3 * rnd.uniform(size=(20, 3))
    else:
        X = 2 * rnd.uniform(size=(20, 3))

    X = _pairwise_estimator_convert_X(X, estimator_orig)

    # builtin ``int`` instead of the ``np.int`` alias, which was removed
    # from NumPy and is what the other checks in this module already use
    y = X[:, 0].astype(int)
    estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, y)
    if hasattr(estimator, "n_components"):
        estimator.n_components = 1

    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    if hasattr(estimator, "n_best"):
        estimator.n_best = 1

    set_random_state(estimator, 1)

    estimator.fit(X, y)
    for method in ["predict", "transform", "decision_function",
                   "predict_proba"]:
        if hasattr(estimator, method):
            dict_before = estimator.__dict__.copy()
            getattr(estimator, method)(X)
            assert estimator.__dict__ == dict_before, (
                'Estimator changes __dict__ during %s' % method)
|
|
|
|
|
|
@deprecated("is_public_parameter is deprecated in version 0.22 and will be "
            "removed in version 0.24.")
def is_public_parameter(attr):
    # Deprecated public alias: simply forwards to _is_public_parameter.
    is_public = _is_public_parameter(attr)
    return is_public
|
|
|
|
|
|
def _is_public_parameter(attr):
    """Return True when ``attr`` neither starts nor ends with an underscore."""
    if attr.startswith('_'):
        return False
    return not attr.endswith('_')
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_dont_overwrite_parameters(name, estimator_orig):
    """Check that ``fit`` only changes or sets private attributes.

    A snapshot of ``__dict__`` is taken before ``fit``; afterwards no public
    attribute (one that neither starts nor ends with ``_``) may have been
    added, and every public attribute must still be the same object.
    """
    if hasattr(estimator_orig.__init__, "deprecated_original"):
        # to not check deprecated classes
        return

    estimator = clone(estimator_orig)
    rng = np.random.RandomState(0)
    X = _pairwise_estimator_convert_X(3 * rng.uniform(size=(20, 3)),
                                      estimator_orig)
    y = _enforce_estimator_tags_y(estimator, X[:, 0].astype(int))

    for param in ("n_components", "n_clusters"):
        if hasattr(estimator, param):
            setattr(estimator, param, 1)

    set_random_state(estimator, 1)
    state_before = estimator.__dict__.copy()
    estimator.fit(X, y)
    state_after = estimator.__dict__

    public_after = [attr for attr in state_after
                    if _is_public_parameter(attr)]

    # check that fit doesn't add any public attribute
    added = [attr for attr in public_after if attr not in state_before]
    assert not added, (
        'Estimator adds public attribute(s) during'
        ' the fit method.'
        ' Estimators are only allowed to add private attributes'
        ' either started with _ or ended'
        ' with _ but %s added'
        % ', '.join(added))

    # check that fit doesn't change any public attribute (identity check)
    changed = [attr for attr in public_after
               if state_before[attr] is not state_after[attr]]
    assert not changed, (
        'Estimator changes public attribute(s) during'
        ' the fit method. Estimators are only allowed'
        ' to change attributes started'
        ' or ended with _, but'
        ' %s changed'
        % ', '.join(changed))
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_fit2d_predict1d(name, estimator_orig):
    """Check by fitting a 2d array and predicting with a 1d array.

    After fitting on 2d data, ``predict``, ``transform``,
    ``decision_function`` and ``predict_proba`` (when present) must raise a
    ``ValueError`` containing "Reshape your data" for a 1d sample.
    Estimators tagged ``no_validation`` are only fitted.
    """
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(20, 3))
    X = _pairwise_estimator_convert_X(X, estimator_orig)
    # builtin ``int``: the ``np.int`` alias was removed from NumPy
    y = X[:, 0].astype(int)
    tags = estimator_orig._get_tags()
    estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, y)

    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    set_random_state(estimator, 1)
    estimator.fit(X, y)
    if tags["no_validation"]:
        # FIXME this is a bit loose
        return

    for method in ["predict", "transform", "decision_function",
                   "predict_proba"]:
        if hasattr(estimator, method):
            assert_raise_message(ValueError, "Reshape your data",
                                 getattr(estimator, method), X[0])
|
|
|
|
|
|
def _apply_on_subsets(func, X):
    """Apply ``func`` on the whole set and on one-row mini batches.

    Parameters
    ----------
    func : callable
        Called once with ``X`` and once per row with that row reshaped to
        ``(1, n_features)``.
    X : 2d array
        Input data; rows are iterated to build the mini batches.

    Returns
    -------
    result_full, result_by_batch : 1d ndarrays
        Flattened output on the full data and flattened concatenation of the
        per-row outputs.
    """
    result_full = func(X)
    n_features = X.shape[1]
    result_by_batch = [func(batch.reshape(1, n_features))
                       for batch in X]

    # func can output tuple (e.g. score_samples); only the first element of
    # each output is compared
    if isinstance(result_full, tuple):
        result_full = result_full[0]
        result_by_batch = [res[0] for res in result_by_batch]

    # densify sparse outputs; ``toarray()`` is used instead of the ``.A``
    # shortcut because it is available on both sparse matrices and arrays
    if sparse.issparse(result_full):
        result_full = result_full.toarray()
        result_by_batch = [res.toarray() for res in result_by_batch]

    return np.ravel(result_full), np.ravel(result_by_batch)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_methods_subset_invariance(name, estimator_orig):
    """Check that methods give invariant results on mini batches.

    Each of ``predict``, ``transform``, ``decision_function``,
    ``score_samples`` and ``predict_proba`` (when present) is applied to the
    whole set and row by row; both outputs must agree up to ``atol=1e-7``.
    """
    rng = np.random.RandomState(0)
    X = _pairwise_estimator_convert_X(3 * rng.uniform(size=(20, 3)),
                                      estimator_orig)
    estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, X[:, 0].astype(int))

    for param in ("n_components", "n_clusters"):
        if hasattr(estimator, param):
            setattr(estimator, param, 1)

    set_random_state(estimator, 1)
    estimator.fit(X, y)

    methods = ("predict", "transform", "decision_function",
               "score_samples", "predict_proba")
    for method in methods:
        if not hasattr(estimator, method):
            continue
        failure_msg = ("{method} of {name} is not invariant when applied "
                       "to a subset.").format(method=method, name=name)
        on_full, on_batches = _apply_on_subsets(
            getattr(estimator, method), X)
        assert_allclose(on_full, on_batches, atol=1e-7, err_msg=failure_msg)
|
|
|
|
|
|
@ignore_warnings
def check_fit2d_1sample(name, estimator_orig):
    """Check fitting a 2d array with only one sample.

    Fitting must either work or raise a ``ValueError`` with an informative
    message. The error message should either mention the number of samples
    or the number of classes.
    """
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(1, 10))
    X = _pairwise_estimator_convert_X(X, estimator_orig)

    # builtin ``int``: the ``np.int`` alias was removed from NumPy
    y = X[:, 0].astype(int)
    estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, y)

    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    set_random_state(estimator, 1)

    # min_cluster_size cannot be less than the data size for OPTICS.
    if name == 'OPTICS':
        estimator.set_params(min_samples=1)

    msgs = ["1 sample", "n_samples = 1", "n_samples=1", "one sample",
            "1 class", "one class"]

    try:
        estimator.fit(X, y)
    except ValueError as e:
        # only ValueErrors carrying one of the expected hints are tolerated
        if all(msg not in repr(e) for msg in msgs):
            raise e
|
|
|
|
|
|
@ignore_warnings
def check_fit2d_1feature(name, estimator_orig):
    """Check fitting a 2d array with only 1 feature.

    Fitting must either work or raise a ``ValueError`` whose message
    mentions the single feature.
    """
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(10, 1))
    X = _pairwise_estimator_convert_X(X, estimator_orig)
    # builtin ``int``: the ``np.int`` alias was removed from NumPy
    y = X[:, 0].astype(int)
    estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, y)

    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1
    # ensure two labels in subsample for RandomizedLogisticRegression
    if name == 'RandomizedLogisticRegression':
        estimator.sample_fraction = 1
    # ensure non skipped trials for RANSACRegressor
    if name == 'RANSACRegressor':
        estimator.residual_threshold = 0.5

    # NOTE(review): y was already passed through the tag enforcement above;
    # this second call is kept as in the original -- confirm whether it is
    # still needed.
    y = _enforce_estimator_tags_y(estimator, y)
    set_random_state(estimator, 1)

    msgs = ["1 feature(s)", "n_features = 1", "n_features=1"]

    try:
        estimator.fit(X, y)
    except ValueError as e:
        # only ValueErrors carrying one of the expected hints are tolerated
        if all(msg not in repr(e) for msg in msgs):
            raise e
|
|
|
|
|
|
@ignore_warnings
def check_fit1d(name, estimator_orig):
    """Check that fitting a 1d X array raises a ``ValueError``.

    Estimators tagged ``no_validation`` are skipped.
    """
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(20))
    # builtin ``int``: the ``np.int`` alias was removed from NumPy
    y = X.astype(int)
    estimator = clone(estimator_orig)
    tags = estimator._get_tags()
    if tags["no_validation"]:
        # FIXME this is a bit loose
        return
    y = _enforce_estimator_tags_y(estimator, y)

    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    set_random_state(estimator, 1)
    assert_raises(ValueError, estimator.fit, X, y)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_transformer_general(name, transformer, readonly_memmap=False):
    """Run the generic transformer checks on standardized blob data.

    When ``readonly_memmap`` is true, the data is replaced by the output of
    ``create_memmap_backed_data`` before the checks run.
    """
    blob_params = dict(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                       random_state=0, n_features=2, cluster_std=0.1)
    X, y = make_blobs(**blob_params)
    X = StandardScaler().fit_transform(X)
    # shift so that the smallest value is zero
    X -= X.min()
    X = _pairwise_estimator_convert_X(X, transformer)

    if readonly_memmap:
        X, y = create_memmap_backed_data([X, y])

    _check_transformer(name, transformer, X, y)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_transformer_data_not_an_array(name, transformer):
    """Run the transformer checks on ``_NotAnArray`` wrappers and on lists."""
    blob_params = dict(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                       random_state=0, n_features=2, cluster_std=0.1)
    X, y = make_blobs(**blob_params)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1
    X = _pairwise_estimator_convert_X(X, transformer)

    wrapped_X = _NotAnArray(X)
    wrapped_y = _NotAnArray(np.asarray(y))
    _check_transformer(name, transformer, wrapped_X, wrapped_y)

    # try the same with some list
    _check_transformer(name, transformer, X.tolist(), y.tolist())
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_transformers_unfitted(name, transformer):
    """Check that ``transform`` on an unfitted transformer raises.

    An ``AttributeError`` or ``ValueError`` is expected.
    """
    X, y = _boston_subset()

    unfitted = clone(transformer)
    failure_msg = ("The unfitted "
                   "transformer {} does not raise an error when "
                   "transform is called. Perhaps use "
                   "check_is_fitted in transform.".format(name))
    with assert_raises((AttributeError, ValueError), msg=failure_msg):
        unfitted.transform(X)
|
|
|
|
|
|
def _check_transformer(name, transformer_orig, X, y):
    """Shared consistency checks for transformers.

    Checks that ``fit_transform`` keeps the number of samples, that
    ``transform`` after ``fit`` and repeated ``fit_transform`` calls agree
    (``atol=1e-2``), and that ``transform`` raises ``ValueError`` when given
    one feature fewer than at fit time (for non-stateless transformers).
    Raises ``SkipTest`` for transformers tagged ``non_deterministic``.
    """
    # unpacking into two names also requires X to be 2-dimensional
    n_samples, _ = np.asarray(X).shape
    transformer = clone(transformer_orig)
    set_random_state(transformer)

    is_cross_decomposition = name in CROSS_DECOMPOSITION

    # fit
    if is_cross_decomposition:
        # two-column target; every other row of the second column is doubled
        y_ = np.c_[np.asarray(y), np.asarray(y)]
        y_[::2, 1] *= 2
        if isinstance(X, _NotAnArray):
            y_ = _NotAnArray(y_)
    else:
        y_ = y

    transformer.fit(X, y_)
    # fit_transform method should work on non fitted estimator
    fresh_transformer = clone(transformer)
    X_pred = fresh_transformer.fit_transform(X, y=y_)

    # check for consistent n_samples
    if isinstance(X_pred, tuple):
        for part in X_pred:
            assert part.shape[0] == n_samples
    else:
        assert X_pred.shape[0] == n_samples

    if not hasattr(transformer, 'transform'):
        return

    if is_cross_decomposition:
        X_pred2 = transformer.transform(X, y_)
    else:
        X_pred2 = transformer.transform(X)
    X_pred3 = transformer.fit_transform(X, y=y_)

    if transformer_orig._get_tags()['non_deterministic']:
        raise SkipTest(name + ' is non deterministic')

    transform_msg = ("fit_transform and transform outcomes "
                     "not consistent in %s" % transformer)
    refit_msg = ("consecutive fit_transform outcomes "
                 "not consistent in %s" % transformer)

    if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
        for part, part2, part3 in zip(X_pred, X_pred2, X_pred3):
            assert_allclose_dense_sparse(part, part2, atol=1e-2,
                                         err_msg=transform_msg)
            assert_allclose_dense_sparse(part, part3, atol=1e-2,
                                         err_msg=refit_msg)
    else:
        assert_allclose_dense_sparse(X_pred, X_pred2, atol=1e-2,
                                     err_msg=transform_msg)
        assert_allclose_dense_sparse(X_pred, X_pred3, atol=1e-2,
                                     err_msg=refit_msg)
        assert _num_samples(X_pred2) == n_samples
        assert _num_samples(X_pred3) == n_samples

    # raises error on malformed input for transform
    # If it's not an array, it does not have a 'T' property
    can_drop_feature = (hasattr(X, 'shape')
                        and not transformer._get_tags()["stateless"]
                        and X.ndim == 2 and X.shape[1] > 1)
    if can_drop_feature:
        mismatch_msg = ("The transformer {} does "
                        "not raise an error when the number of "
                        "features in transform is different from"
                        " the number of features in "
                        "fit.".format(name))
        with assert_raises(ValueError, msg=mismatch_msg):
            transformer.transform(X[:, :-1])
|
|
|
|
|
|
@ignore_warnings
def check_pipeline_consistency(name, estimator_orig):
    """Check that ``make_pipeline(est)`` gives the same results as ``est``.

    ``score`` and ``fit_transform`` (when present on the estimator) are
    compared. Raises ``SkipTest`` for estimators tagged
    ``non_deterministic``.
    """
    if estimator_orig._get_tags()['non_deterministic']:
        raise SkipTest(name + ' is non deterministic')

    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X -= X.min()
    X = _pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel)
    estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, y)
    set_random_state(estimator)
    pipeline = make_pipeline(estimator)
    estimator.fit(X, y)
    pipeline.fit(X, y)

    for func_name in ("score", "fit_transform"):
        estimator_func = getattr(estimator, func_name, None)
        if estimator_func is None:
            continue
        pipeline_func = getattr(pipeline, func_name)
        direct_result = estimator_func(X, y)
        piped_result = pipeline_func(X, y)
        assert_allclose_dense_sparse(direct_result, piped_result)
|
|
|
|
|
|
@ignore_warnings
def check_fit_score_takes_y(name, estimator_orig):
    """Check that all estimators accept an optional y.

    ``fit``, ``score``, ``partial_fit``, ``fit_predict`` and
    ``fit_transform`` (when present) are called with ``(X, y)`` and their
    second argument must be named ``y`` or ``Y`` so they can be used in
    pipelines.
    """
    rng = np.random.RandomState(0)
    n_samples = 30
    X = _pairwise_estimator_convert_X(rng.uniform(size=(n_samples, 3)),
                                      estimator_orig)
    estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, np.arange(n_samples) % 3)
    set_random_state(estimator)

    method_names = ("fit", "score", "partial_fit", "fit_predict",
                    "fit_transform")
    for func_name in method_names:
        func = getattr(estimator, func_name, None)
        if func is None:
            continue
        func(X, y)
        args = [param.name for param in signature(func).parameters.values()]
        if args[0] == "self":
            # if_delegate_has_method makes methods into functions
            # with an explicit "self", so need to shift arguments
            args = args[1:]
        assert args[1] in ["y", "Y"], (
            "Expected y or Y as second argument for method "
            "%s of %s. Got arguments: %r."
            % (func_name, type(estimator).__name__, args))
|
|
|
|
|
|
@ignore_warnings
def check_estimators_dtypes(name, estimator_orig):
    """Check that estimators run on float32, float64, int64 and int32 data.

    For each dtype a fresh clone is fitted, then ``predict``,
    ``transform``, ``decision_function`` and ``predict_proba`` (when
    present) are called on the training data.
    """
    rng = np.random.RandomState(0)
    X_float32 = 3 * rng.uniform(size=(20, 5)).astype(np.float32)
    X_float32 = _pairwise_estimator_convert_X(X_float32, estimator_orig)
    X_float64 = X_float32.astype(np.float64)
    X_int64 = X_float32.astype(np.int64)
    X_int32 = X_float32.astype(np.int32)
    y = _enforce_estimator_tags_y(estimator_orig, X_int64[:, 0])

    methods = ("predict", "transform", "decision_function", "predict_proba")

    for X_train in (X_float32, X_float64, X_int64, X_int32):
        estimator = clone(estimator_orig)
        set_random_state(estimator, 1)
        estimator.fit(X_train, y)

        for method in methods:
            if not hasattr(estimator, method):
                continue
            getattr(estimator, method)(X_train)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_estimators_empty_data_messages(name, estimator_orig):
    """Check that fitting on empty data raises ``ValueError``.

    Data with zero samples must raise a ``ValueError`` (any message); data
    with zero features must raise one matching the "0 feature(s)" pattern.
    """
    estimator = clone(estimator_orig)
    set_random_state(estimator, 1)

    X_zero_samples = np.empty(0).reshape(0, 3)
    # The precise message can change depending on whether X or y is
    # validated first. Let us test the type of exception only:
    no_error_msg = ("The estimator {} does not"
                    " raise an error when an empty data is used "
                    "to train. Perhaps use "
                    "check_array in train.".format(name))
    with assert_raises(ValueError, msg=no_error_msg):
        estimator.fit(X_zero_samples, [])

    X_zero_features = np.empty(0).reshape(3, 0)
    # the following y should be accepted by both classifiers and regressors
    # and ignored by unsupervised models
    y = _enforce_estimator_tags_y(estimator, np.array([1, 0, 1]))
    expected_pattern = (
        r"0 feature\(s\) \(shape=\(3, 0\)\) while a minimum of \d* "
        "is required.")
    assert_raises_regex(ValueError, expected_pattern,
                        estimator.fit, X_zero_features, y)
|
|
|
|
|
|
def _assert_raises_on_nan_inf(error_string, estimator, func, *args):
    # Call func(*args) and require a ValueError mentioning 'inf' or 'NaN'.
    try:
        func(*args)
    except ValueError as e:
        if 'inf' not in repr(e) and 'NaN' not in repr(e):
            print(error_string, estimator, e)
            traceback.print_exc(file=sys.stdout)
            raise
    except Exception as exc:
        # Any other exception type is a failure of the check and must
        # propagate rather than being printed and silently ignored.
        print(error_string, estimator, exc)
        traceback.print_exc(file=sys.stdout)
        raise
    else:
        raise AssertionError(error_string, estimator)


@ignore_warnings(category=FutureWarning)
def check_estimators_nan_inf(name, estimator_orig):
    """Check that estimators reject NaN and inf in X.

    For data containing one NaN and for data containing one inf, ``fit``
    must raise a ``ValueError`` whose repr mentions ``'inf'`` or ``'NaN'``.
    After a fit on finite data, the same is required of ``predict`` and
    ``transform`` when the estimator has them.

    Raises
    ------
    AssertionError
        If a call completes without raising.
    Exception
        Any unexpected exception raised by the estimator is re-raised after
        diagnostics are printed to stdout.
    """
    rnd = np.random.RandomState(0)
    X_train_finite = _pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)),
                                                   estimator_orig)
    X_train_nan = rnd.uniform(size=(10, 3))
    X_train_nan[0, 0] = np.nan
    X_train_inf = rnd.uniform(size=(10, 3))
    X_train_inf[0, 0] = np.inf
    y = np.ones(10)
    y[:5] = 0
    y = _enforce_estimator_tags_y(estimator_orig, y)
    error_string_fit = "Estimator doesn't check for NaN and inf in fit."
    error_string_predict = ("Estimator doesn't check for NaN and inf in"
                            " predict.")
    error_string_transform = ("Estimator doesn't check for NaN and inf in"
                              " transform.")
    for X_train in [X_train_nan, X_train_inf]:
        # catch deprecation warnings
        with ignore_warnings(category=FutureWarning):
            estimator = clone(estimator_orig)
            set_random_state(estimator, 1)
            # try to fit
            _assert_raises_on_nan_inf(error_string_fit, estimator,
                                      estimator.fit, X_train, y)
            # actually fit
            estimator.fit(X_train_finite, y)

            # predict
            if hasattr(estimator, "predict"):
                _assert_raises_on_nan_inf(error_string_predict, estimator,
                                          estimator.predict, X_train)

            # transform
            if hasattr(estimator, "transform"):
                _assert_raises_on_nan_inf(error_string_transform, estimator,
                                          estimator.transform, X_train)
|
|
|
|
|
|
@ignore_warnings
def check_nonsquare_error(name, estimator_orig):
    """Test that error is thrown when non-square data provided"""
    X, y = make_blobs(n_samples=20, n_features=10)
    estimator = clone(estimator_orig)

    failure_msg = ("The pairwise estimator {}"
                   " does not raise an error on non-square data"
                   .format(name))
    with assert_raises(ValueError, msg=failure_msg):
        estimator.fit(X, y)
|
|
|
|
|
|
@ignore_warnings
def check_estimators_pickle(name, estimator_orig):
    """Test that we can pickle all estimators.

    A fitted estimator is pickled and unpickled; ``predict``,
    ``transform``, ``decision_function`` and ``predict_proba`` (when
    present) must give the same output on the original and on the unpickled
    copy. For estimators whose module starts with ``sklearn.`` the pickle
    payload must contain ``b"version"``.
    """
    check_methods = ["predict", "transform", "decision_function",
                     "predict_proba"]

    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)

    # some estimators can't do features less than 0
    X -= X.min()
    X = _pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel)

    tags = estimator_orig._get_tags()
    # include NaN values when the estimator should deal with them
    if tags['allow_nan']:
        # set randomly 10 elements to np.nan
        rng = np.random.RandomState(42)
        mask = rng.choice(X.size, 10, replace=False)
        X.reshape(-1)[mask] = np.nan

    estimator = clone(estimator_orig)

    y = _enforce_estimator_tags_y(estimator, y)

    set_random_state(estimator)
    estimator.fit(X, y)

    # pickle and unpickle!
    pickled_estimator = pickle.dumps(estimator)
    if estimator.__module__.startswith('sklearn.'):
        assert b"version" in pickled_estimator
    unpickled_estimator = pickle.loads(pickled_estimator)

    # Reference outputs are computed once; an earlier, identical pass
    # before pickling was overwritten without being used and is dropped.
    result = {
        method: getattr(estimator, method)(X)
        for method in check_methods
        if hasattr(estimator, method)
    }

    for method, expected in result.items():
        unpickled_result = getattr(unpickled_estimator, method)(X)
        assert_allclose_dense_sparse(expected, unpickled_result)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_estimators_partial_fit_n_features(name, estimator_orig):
    """Check ``partial_fit`` when the number of features changes.

    After a first ``partial_fit`` call, a second call with one feature
    fewer must raise ``ValueError``. Estimators without ``partial_fit``, or
    whose ``partial_fit`` raises ``NotImplementedError``, are skipped.
    """
    if not hasattr(estimator_orig, 'partial_fit'):
        return

    estimator = clone(estimator_orig)
    X, y = make_blobs(n_samples=50, random_state=1)
    X -= X.min()
    y = _enforce_estimator_tags_y(estimator_orig, y)

    # classifiers are given the full set of classes on the first call
    first_call_kwargs = {}
    if is_classifier(estimator):
        first_call_kwargs["classes"] = np.unique(y)
    try:
        estimator.partial_fit(X, y, **first_call_kwargs)
    except NotImplementedError:
        return

    failure_msg = ("The estimator {} does not raise an"
                   " error when the number of features"
                   " changes between calls to "
                   "partial_fit.".format(name))
    with assert_raises(ValueError, msg=failure_msg):
        estimator.partial_fit(X[:, :-1], y)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_classifier_multioutput(name, estimator):
    """Check classifier outputs on multilabel (multioutput) data.

    Verifies the shape and integer dtype of ``predict``, the shape of
    ``decision_function`` and ``predict_proba`` (when present), their
    agreement with ``predict``, and that probabilities and decision values
    rank the samples identically.
    """
    n_samples, n_labels, n_classes = 42, 5, 3
    tags = estimator._get_tags()
    estimator = clone(estimator)
    X, y = make_multilabel_classification(random_state=42,
                                          n_samples=n_samples,
                                          n_labels=n_labels,
                                          n_classes=n_classes)
    estimator.fit(X, y)
    y_pred = estimator.predict(X)

    # the message reports the shape that is actually asserted
    assert y_pred.shape == (n_samples, n_classes), (
        "The shape of the prediction for multioutput data is "
        "incorrect. Expected {}, got {}."
        .format((n_samples, n_classes), y_pred.shape))
    assert y_pred.dtype.kind == 'i'

    if hasattr(estimator, "decision_function"):
        decision = estimator.decision_function(X)
        assert isinstance(decision, np.ndarray)
        assert decision.shape == (n_samples, n_classes), (
            "The shape of the decision function output for "
            "multioutput data is incorrect. Expected {}, got {}."
            .format((n_samples, n_classes), decision.shape))

        # builtin ``int``: the ``np.int`` alias was removed from NumPy
        dec_pred = (decision > 0).astype(int)
        dec_exp = estimator.classes_[dec_pred]
        assert_array_equal(dec_exp, y_pred)

    if hasattr(estimator, "predict_proba"):
        y_prob = estimator.predict_proba(X)

        if isinstance(y_prob, list) and not tags['poor_score']:
            # one (n_samples, 2) probability array per output
            for i in range(n_classes):
                assert y_prob[i].shape == (n_samples, 2), (
                    "The shape of the probability for multioutput data is"
                    " incorrect. Expected {}, got {}."
                    .format((n_samples, 2), y_prob[i].shape))
                assert_array_equal(
                    np.argmax(y_prob[i], axis=1).astype(int),
                    y_pred[:, i]
                )
        elif not tags['poor_score']:
            assert y_prob.shape == (n_samples, n_classes), (
                "The shape of the probability for multioutput data is"
                " incorrect. Expected {}, got {}."
                .format((n_samples, n_classes), y_prob.shape))
            assert_array_equal(y_prob.round().astype(int), y_pred)

    if (hasattr(estimator, "decision_function") and
            hasattr(estimator, "predict_proba")):
        # both calls are independent of the loop index; compute them once
        y_proba = estimator.predict_proba(X)
        y_decision = estimator.decision_function(X)
        for i in range(n_classes):
            assert_array_equal(rankdata(y_proba[:, i]),
                               rankdata(y_decision[:, i]))
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_regressor_multioutput(name, estimator):
    """Check regressor predictions on multi-target data.

    Predictions must have dtype float64 and the same shape as ``y``.
    """
    estimator = clone(estimator)
    n_samples = n_features = 10

    if not _is_pairwise_metric(estimator):
        n_samples = n_samples + 1

    X, y = make_regression(random_state=42, n_targets=5,
                           n_samples=n_samples, n_features=n_features)
    # use the private helper, as every other check in this module does
    X = _pairwise_estimator_convert_X(X, estimator)

    estimator.fit(X, y)
    y_pred = estimator.predict(X)

    assert y_pred.dtype == np.dtype('float64'), (
        "Multioutput predictions by a regressor are expected to be"
        " floating-point precision. Got {} instead".format(y_pred.dtype))
    # fill in the placeholders so a failure reports both shapes
    assert y_pred.shape == y.shape, (
        "The shape of the prediction for multioutput data is incorrect."
        " Expected {}, got {}.".format(y.shape, y_pred.shape))
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_clustering(name, clusterer_orig, readonly_memmap=False):
    """General checks for clusterers on shuffled, standardized blob data.

    Checks the shape, dtype and quality (adjusted Rand score > 0.4) of
    ``labels_``, that ``fit_predict`` reproduces ``labels_``, and that the
    labels obtained on noisy data are consecutive integers starting at 0 or
    -1 and bounded by ``n_clusters - 1`` when that parameter exists.
    """
    clusterer = clone(clusterer_orig)
    X, y = make_blobs(n_samples=50, random_state=1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    rng = np.random.RandomState(7)
    noise = rng.uniform(low=-3, high=3, size=(5, 2))
    X_noise = np.concatenate([X, noise])

    if readonly_memmap:
        X, y, X_noise = create_memmap_backed_data([X, y, X_noise])

    n_samples = X.shape[0]
    # catch deprecation and neighbors warnings
    if hasattr(clusterer, "n_clusters"):
        clusterer.set_params(n_clusters=3)
    set_random_state(clusterer)
    if name == 'AffinityPropagation':
        clusterer.set_params(preference=-100)
        clusterer.set_params(max_iter=100)

    # fit
    clusterer.fit(X)
    # with lists
    clusterer.fit(X.tolist())

    pred = clusterer.labels_
    assert pred.shape == (n_samples,)
    assert adjusted_rand_score(pred, y) > 0.4
    if clusterer._get_tags()['non_deterministic']:
        return
    set_random_state(clusterer)
    with warnings.catch_warnings(record=True):
        pred2 = clusterer.fit_predict(X)
    assert_array_equal(pred, pred2)

    # fit_predict(X) and labels_ should be of type int
    integer_dtypes = [np.dtype('int32'), np.dtype('int64')]
    assert pred.dtype in integer_dtypes
    assert pred2.dtype in integer_dtypes

    # Add noise to X to test the possible values of the labels
    noisy_labels = clusterer.fit_predict(X_noise)

    # There should be at least one sample in every cluster. Equivalently
    # labels_ should contain all the consecutive values between its
    # min and its max.
    unique_labels = np.unique(noisy_labels)
    lowest, highest = unique_labels[0], unique_labels[-1]
    assert_array_equal(unique_labels, np.arange(lowest, highest + 1))

    # Labels are expected to start at 0 (no noise) or -1 (if noise)
    assert lowest in [0, -1]
    # Labels should be less than n_clusters - 1
    if hasattr(clusterer, 'n_clusters'):
        n_clusters = getattr(clusterer, 'n_clusters')
        assert n_clusters - 1 >= highest
    # else labels should be less than max(labels_) which is necessarily true
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_clusterer_compute_labels_predict(name, clusterer_orig):
    """Check that predict is invariant of compute_labels"""
    X, y = make_blobs(n_samples=20, random_state=0)
    clusterer = clone(clusterer_orig)
    set_random_state(clusterer)

    if not hasattr(clusterer, "compute_labels"):
        return

    # MiniBatchKMeans
    pred_default = clusterer.fit(X).predict(X)
    clusterer.set_params(compute_labels=False)
    pred_without_labels = clusterer.fit(X).predict(X)
    assert_array_equal(pred_default, pred_without_labels)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_classifiers_one_label(name, classifier_orig):
    """Check classifier behaviour when only one class is present.

    ``fit`` may raise a ``ValueError`` whose repr mentions ``'class'``, in
    which case the check stops. Otherwise ``predict`` on new data must
    return the single training label for every sample. Any other exception
    is printed and re-raised.
    """
    fit_failure = "Classifier can't train when only one class is present."
    predict_failure = ("Classifier can't predict when only one class is "
                       "present.")
    rng = np.random.RandomState(0)
    X_train = rng.uniform(size=(10, 3))
    X_test = rng.uniform(size=(10, 3))
    y = np.ones(10)
    # catch deprecation warnings
    with ignore_warnings(category=FutureWarning):
        classifier = clone(classifier_orig)
        # try to fit
        try:
            classifier.fit(X_train, y)
        except ValueError as err:
            if 'class' in repr(err):
                # informative refusal to train on a single class
                return
            print(fit_failure, classifier, err)
            traceback.print_exc(file=sys.stdout)
            raise err
        except Exception as err:
            print(fit_failure, classifier, err)
            traceback.print_exc(file=sys.stdout)
            raise err
        # predict
        try:
            assert_array_equal(classifier.predict(X_test), y)
        except Exception as err:
            print(predict_failure, classifier, err)
            raise err
|
|
|
|
|
|
@ignore_warnings  # Warnings are raised by decision function
def check_classifiers_train(name, classifier_orig, readonly_memmap=False,
                            X_dtype='float64'):
    """Check fit/predict consistency of a classifier on blob data.

    A binary problem (and a 3-class one unless the estimator is tagged
    ``binary_only``) is fitted from arrays and from lists. The check then
    verifies the shape of ``predict``, the training accuracy, the agreement
    of ``decision_function`` / ``predict_proba`` / ``predict_log_proba`` with
    ``predict``, and that malformed input raises ``ValueError`` (unless the
    estimator is tagged ``no_validation``).

    Parameters
    ----------
    name : str
        Estimator name, used in failure messages and to special-case the
        naive Bayes estimators that need non-negative input.
    classifier_orig : estimator instance
        Classifier to check; it is cloned for every problem.
    readonly_memmap : bool, default=False
        If True, the data is passed through ``create_memmap_backed_data``.
    X_dtype : str or dtype, default='float64'
        dtype the training data is cast to.
    """
    X_m, y_m = make_blobs(n_samples=300, random_state=0)
    X_m = X_m.astype(X_dtype)
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = StandardScaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]

    if name in ['BernoulliNB', 'MultinomialNB', 'ComplementNB',
                'CategoricalNB']:
        X_m -= X_m.min()
        X_b -= X_b.min()

    if readonly_memmap:
        X_m, y_m, X_b, y_b = create_memmap_backed_data([X_m, y_m, X_b, y_b])

    problems = [(X_b, y_b)]
    tags = classifier_orig._get_tags()
    if not tags['binary_only']:
        problems.append((X_m, y_m))

    # Failure-message templates for the malformed-input checks.
    msg_pairwise = (
        "The classifier {} does not raise an error when shape of X in "
        " {} is not equal to (n_test_samples, n_training_samples)")
    msg = ("The classifier {} does not raise an error when the number of "
           "features in {} is different from the number of features in "
           "fit.")

    for (X, y) in problems:
        classes = np.unique(y)
        n_classes = len(classes)
        n_samples = X.shape[0]
        classifier = clone(classifier_orig)
        X = _pairwise_estimator_convert_X(X, classifier)
        y = _enforce_estimator_tags_y(classifier, y)

        set_random_state(classifier)

        def assert_rejects_malformed_X(method_name):
            # A wrongly shaped X must make ``method_name`` raise ValueError.
            if _is_pairwise(classifier):
                with assert_raises(
                        ValueError,
                        msg=msg_pairwise.format(name, method_name)):
                    getattr(classifier, method_name)(X.reshape(-1, 1))
            else:
                with assert_raises(ValueError,
                                   msg=msg.format(name, method_name)):
                    getattr(classifier, method_name)(X.T)

        # raises error on malformed input for fit
        if not tags["no_validation"]:
            with assert_raises(
                ValueError,
                msg="The classifier {} does not "
                    "raise an error when incorrect/malformed input "
                    "data for fit is passed. The number of training "
                    "examples is not the same as the number of labels. "
                    "Perhaps use check_X_y in fit.".format(name)):
                classifier.fit(X, y[:-1])

        # fit
        classifier.fit(X, y)
        # with lists
        classifier.fit(X.tolist(), y.tolist())
        assert hasattr(classifier, "classes_")
        y_pred = classifier.predict(X)

        assert y_pred.shape == (n_samples,)
        # training set performance
        if not tags['poor_score']:
            assert accuracy_score(y, y_pred) > 0.83

        # raises error on malformed input for predict
        if not tags["no_validation"]:
            assert_rejects_malformed_X("predict")

        if hasattr(classifier, "decision_function"):
            try:
                # decision_function agrees with predict
                decision = classifier.decision_function(X)
                if n_classes == 2:
                    if not tags["multioutput_only"]:
                        assert decision.shape == (n_samples,)
                    else:
                        assert decision.shape == (n_samples, 1)
                    # ``np.int`` was removed from NumPy; the builtin ``int``
                    # is the exact replacement.
                    dec_pred = (decision.ravel() > 0).astype(int)
                    assert_array_equal(dec_pred, y_pred)
                else:
                    assert decision.shape == (n_samples, n_classes)
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)

                # raises error on malformed input for decision_function
                if not tags["no_validation"]:
                    assert_rejects_malformed_X("decision_function")
            except NotImplementedError:
                pass

        if hasattr(classifier, "predict_proba"):
            # predict_proba agrees with predict
            y_prob = classifier.predict_proba(X)
            assert y_prob.shape == (n_samples, n_classes)
            assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
            # check that probas for all classes sum to one
            assert_array_almost_equal(np.sum(y_prob, axis=1),
                                      np.ones(n_samples))
            if not tags["no_validation"]:
                # raises error on malformed input for predict_proba
                assert_rejects_malformed_X("predict_proba")
            if hasattr(classifier, "predict_log_proba"):
                # predict_log_proba is a transformation of predict_proba
                y_log_prob = classifier.predict_log_proba(X)
                # NOTE(review): the third positional argument of
                # assert_allclose is ``rtol``, so this tolerance is 8 (800%),
                # presumably a leftover "8 decimals" from
                # assert_array_almost_equal. Kept as-is so that no estimator
                # starts failing; confirm the intended tolerance.
                assert_allclose(y_log_prob, np.log(y_prob), rtol=8,
                                atol=1e-9)
                assert_array_equal(np.argsort(y_log_prob), np.argsort(y_prob))
|
|
|
|
|
|
def check_outlier_corruption(num_outliers, expected_outliers, decision):
    """Assert that an outlier-count mismatch is explained by tied scores.

    The number of predicted outliers may deviate from the requested
    contamination level only when all decision values in the 'critical
    area' (the sorted positions between the two counts, inclusive) are
    identical.

    Parameters
    ----------
    num_outliers : int
        Number of outliers actually predicted.
    expected_outliers : int
        Number of outliers implied by the contamination parameter.
    decision : array-like
        Decision function values of the samples.
    """
    lower = min(num_outliers, expected_outliers)
    upper = max(num_outliers, expected_outliers) + 1

    critical_area = np.sort(decision)[lower:upper]
    msg = ('The number of predicted outliers is not equal to the expected '
           'number of outliers and this difference is not explained by the '
           'number of ties in the decision_function values')
    assert np.unique(critical_area).size == 1, msg
|
|
|
|
|
|
def check_outliers_train(name, estimator_orig, readonly_memmap=True):
    """Check fit/predict consistency of an outlier detector on blob data.

    Verifies the shape and dtype of ``predict``, ``decision_function`` and
    ``score_samples``, their mutual agreement, that transposed input raises
    ``ValueError`` and, for estimators with a ``contamination`` parameter but
    no ``novelty`` parameter, that the proportion of predicted outliers
    matches the contamination level (up to ties).

    Parameters
    ----------
    name : str
        Estimator name (unused by the check itself).
    estimator_orig : estimator instance
        Outlier detector to check; it is cloned before use.
    readonly_memmap : bool, default=True
        If True, the data is passed through ``create_memmap_backed_data``.
    """
    n_samples = 300
    X, _ = make_blobs(n_samples=n_samples, random_state=0)
    X = shuffle(X, random_state=7)

    if readonly_memmap:
        X = create_memmap_backed_data(X)

    n_samples = X.shape[0]
    estimator = clone(estimator_orig)
    set_random_state(estimator)

    # fit
    estimator.fit(X)
    # with lists
    estimator.fit(X.tolist())

    y_pred = estimator.predict(X)
    assert y_pred.shape == (n_samples,)
    assert y_pred.dtype.kind == 'i'
    assert_array_equal(np.unique(y_pred), np.array([-1, 1]))

    decision = estimator.decision_function(X)
    scores = estimator.score_samples(X)
    for output in [decision, scores]:
        assert output.dtype == np.dtype('float')
        assert output.shape == (n_samples,)

    # raises error on malformed input for predict
    assert_raises(ValueError, estimator.predict, X.T)

    # decision_function agrees with predict: non-negative decision values
    # are inliers (+1), negative ones outliers (-1). ``np.int`` was removed
    # from NumPy; the builtin ``int`` is the exact replacement.
    dec_pred = (decision >= 0).astype(int)
    dec_pred[dec_pred == 0] = -1
    assert_array_equal(dec_pred, y_pred)

    # raises error on malformed input for decision_function
    assert_raises(ValueError, estimator.decision_function, X.T)

    # decision_function is a translation of score_samples
    y_dec = scores - estimator.offset_
    assert_allclose(y_dec, decision)

    # raises error on malformed input for score_samples
    assert_raises(ValueError, estimator.score_samples, X.T)

    # contamination parameter (not for OneClassSVM which has the nu parameter)
    if (hasattr(estimator, 'contamination')
            and not hasattr(estimator, 'novelty')):
        # proportion of outliers equal to contamination parameter when not
        # set to 'auto'. This is true for the training set and cannot thus be
        # checked as follows for estimators with a novelty parameter such as
        # LocalOutlierFactor (tested in check_outliers_fit_predict)
        expected_outliers = 30
        contamination = expected_outliers / n_samples
        estimator.set_params(contamination=contamination)
        estimator.fit(X)
        y_pred = estimator.predict(X)

        num_outliers = np.sum(y_pred != 1)
        # num_outliers should be equal to expected_outliers unless
        # there are ties in the decision_function values. this can
        # only be tested for estimators with a decision_function
        # method, i.e. all estimators except LOF which is already
        # excluded from this if branch.
        if num_outliers != expected_outliers:
            decision = estimator.decision_function(X)
            check_outlier_corruption(num_outliers, expected_outliers, decision)

        # raises error when contamination is a scalar and not in [0,1]
        for contamination in [-0.5, 2.3]:
            estimator.set_params(contamination=contamination)
            assert_raises(ValueError, estimator.fit, X)
|
|
|
|
|
|
@ignore_warnings(category=(FutureWarning))
def check_classifiers_multilabel_representation_invariance(name,
                                                           classifier_orig):
    """Check that predictions do not depend on the container used for y.

    The classifier is fitted on the same multilabel target given as an
    array, a list of lists and a list of arrays; the three predictions must
    be equal in value, dtype and type.
    """
    X, y = make_multilabel_classification(n_samples=100, n_features=20,
                                          n_classes=5, n_labels=3,
                                          length=50, allow_unlabeled=True,
                                          random_state=0)

    n_train = 80
    X_train, y_train = X[:n_train], y[:n_train]
    X_test = X[n_train:]

    as_list_of_lists = y_train.tolist()
    as_list_of_arrays = list(y_train)

    classifier = clone(classifier_orig)
    set_random_state(classifier)

    y_pred = classifier.fit(X_train, y_train).predict(X_test)
    pred_from_lists = classifier.fit(X_train,
                                     as_list_of_lists).predict(X_test)
    pred_from_arrays = classifier.fit(X_train,
                                      as_list_of_arrays).predict(X_test)

    alternatives = (pred_from_arrays, pred_from_lists)
    for other in alternatives:
        assert_array_equal(y_pred, other)
    for other in alternatives:
        assert y_pred.dtype == other.dtype
    for other in alternatives:
        assert type(y_pred) == type(other)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_estimators_fit_returns_self(name, estimator_orig,
                                      readonly_memmap=False):
    """Check that ``fit`` returns the estimator instance itself."""
    X, y = make_blobs(random_state=0, n_samples=21)
    # Shift the data so its minimum is zero: some estimators want
    # non-negative input.
    X -= X.min()
    X = _pairwise_estimator_convert_X(X, estimator_orig)

    estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, y)

    if readonly_memmap:
        X, y = create_memmap_backed_data([X, y])

    set_random_state(estimator)
    fitted = estimator.fit(X, y)
    assert fitted is estimator
|
|
|
|
|
|
@ignore_warnings
def check_estimators_unfitted(name, estimator_orig):
    """Check that predict raises an exception in an unfitted estimator.

    Unfitted estimators should raise a NotFittedError.
    """
    # Common test for Regressors, Classifiers and Outlier detection estimators
    X, y = _boston_subset()

    estimator = clone(estimator_orig)
    prediction_methods = ('decision_function', 'predict', 'predict_proba',
                          'predict_log_proba')
    for method_name in prediction_methods:
        if not hasattr(estimator, method_name):
            continue
        assert_raises(NotFittedError, getattr(estimator, method_name), X)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_supervised_y_2d(name, estimator_orig):
    """Check that a column-vector y gives the same predictions as a 1d y.

    Estimators not tagged ``multioutput`` must additionally emit a
    DataConversionWarning when fitted on the column vector. Estimators
    tagged ``multioutput_only`` are skipped.
    """
    tags = estimator_orig._get_tags()
    if tags['multioutput_only']:
        # These only work on 2d, so this test makes no sense
        return

    n_samples = 30
    rng = np.random.RandomState(0)
    X = _pairwise_estimator_convert_X(
        rng.uniform(size=(n_samples, 3)), estimator_orig
    )
    y = _enforce_estimator_tags_y(estimator_orig, np.arange(n_samples) % 3)

    estimator = clone(estimator_orig)
    set_random_state(estimator)
    # Reference predictions from the 1d target.
    estimator.fit(X, y)
    y_pred = estimator.predict(X)

    set_random_state(estimator)
    # Refit on the column vector, recording every DataConversionWarning.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", DataConversionWarning)
        warnings.simplefilter("ignore", RuntimeWarning)
        estimator.fit(X, y[:, np.newaxis])
    y_pred_2d = estimator.predict(X)

    msg = "expected 1 DataConversionWarning, got: %s" % (
        ", ".join(str(warning) for warning in caught))
    if not tags['multioutput']:
        # check that we warned if we don't support multi-output
        assert len(caught) > 0, msg
        expected_fragment = ("DataConversionWarning('A column-vector y"
                             " was passed when a 1d array was expected")
        assert expected_fragment in msg
    assert_allclose(y_pred.ravel(), y_pred_2d.ravel())
|
|
|
|
|
|
@ignore_warnings
def check_classifiers_predictions(X, y, name, classifier_orig):
    """Check that ``predict``, ``decision_function`` and ``classes_`` agree.

    Parameters
    ----------
    X : ndarray
        Training data; binarised around its mean for ``BernoulliNB``.
    y : ndarray
        Target labels (may be integers, strings or objects).
    name : str
        Estimator name, used to special-case some naive Bayes estimators.
    classifier_orig : estimator instance
        Classifier to check; it is cloned before use.
    """
    classes = np.unique(y)
    classifier = clone(classifier_orig)
    if name == 'BernoulliNB':
        X = X > X.mean()
    set_random_state(classifier)

    classifier.fit(X, y)
    y_pred = classifier.predict(X)

    if hasattr(classifier, "decision_function"):
        decision = classifier.decision_function(X)
        assert isinstance(decision, np.ndarray)
        if len(classes) == 2:
            # A positive decision value selects classes_[1]. ``np.int`` was
            # removed from NumPy; the builtin ``int`` is the exact
            # replacement.
            dec_pred = (decision.ravel() > 0).astype(int)
            dec_exp = classifier.classes_[dec_pred]
            assert_array_equal(dec_exp, y_pred,
                               err_msg="decision_function does not match "
                               "classifier for %r: expected '%s', got '%s'" %
                               (classifier, ", ".join(map(str, dec_exp)),
                                ", ".join(map(str, y_pred))))
        elif getattr(classifier, 'decision_function_shape', 'ovr') == 'ovr':
            # One column per class: the largest score gives the prediction.
            decision_y = np.argmax(decision, axis=1).astype(int)
            y_exp = classifier.classes_[decision_y]
            assert_array_equal(y_exp, y_pred,
                               err_msg="decision_function does not match "
                               "classifier for %r: expected '%s', got '%s'" %
                               (classifier, ", ".join(map(str, y_exp)),
                                ", ".join(map(str, y_pred))))

    # training set performance
    if name != "ComplementNB":
        # This is a pathological data set for ComplementNB.
        # For some specific cases 'ComplementNB' predicts less classes
        # than expected
        assert_array_equal(np.unique(y), np.unique(y_pred))
    assert_array_equal(classes, classifier.classes_,
                       err_msg="Unexpected classes_ attribute for %r: "
                       "expected '%s', got '%s'" %
                       (classifier, ", ".join(map(str, classes)),
                        ", ".join(map(str, classifier.classes_))))
|
|
|
|
|
|
# TODO: remove in 0.24
|
|
@deprecated("choose_check_classifiers_labels is deprecated in version "
            "0.22 and will be removed in version 0.24.")
def choose_check_classifiers_labels(name, y, y_names):
    # Deprecated public alias: forwards to the private implementation.
    return _choose_check_classifiers_labels(name=name, y=y, y_names=y_names)
|
|
|
|
|
|
def _choose_check_classifiers_labels(name, y, y_names):
    # LabelPropagation and LabelSpreading are checked with the raw targets
    # ``y``; every other classifier is checked with the named labels.
    if name in ("LabelPropagation", "LabelSpreading"):
        return y
    return y_names
|
|
|
|
|
|
def check_classifiers_classes(name, classifier_orig):
    """Run ``check_classifiers_predictions`` with several label encodings.

    A binary problem (and a 3-class one unless the estimator is tagged
    ``binary_only``) is checked with string labels, with the same labels as
    an object array, and finally the binary problem with labels -1 / 1.
    """
    X_multiclass, y_multiclass = make_blobs(n_samples=30, random_state=0,
                                            cluster_std=0.1)
    X_multiclass, y_multiclass = shuffle(X_multiclass, y_multiclass,
                                         random_state=7)
    X_multiclass = StandardScaler().fit_transform(X_multiclass)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X_multiclass -= X_multiclass.min() - .1

    # The binary problem keeps only the first two blobs.
    is_binary_sample = y_multiclass != 2
    X_binary = X_multiclass[is_binary_sample]
    y_binary = y_multiclass[is_binary_sample]

    X_multiclass = _pairwise_estimator_convert_X(X_multiclass, classifier_orig)
    X_binary = _pairwise_estimator_convert_X(X_binary, classifier_orig)

    y_names_multiclass = np.take(["one", "two", "three"], y_multiclass)
    y_names_binary = np.take(["one", "two"], y_binary)

    problems = [(X_binary, y_binary, y_names_binary)]
    if not classifier_orig._get_tags()['binary_only']:
        problems.append((X_multiclass, y_multiclass, y_names_multiclass))

    for X, y, y_names in problems:
        # Native string dtype first, then the same labels as objects.
        for labels in (y_names, y_names.astype('O')):
            target = _choose_check_classifiers_labels(name, y, labels)
            check_classifiers_predictions(X, target, name, classifier_orig)

    # Finally the binary problem with the integer labels -1 / 1.
    y_names_binary = np.take([-1, 1], y_binary)
    y_binary = _choose_check_classifiers_labels(name, y_binary, y_names_binary)
    check_classifiers_predictions(X_binary, y_binary, name, classifier_orig)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_regressors_int(name, regressor_orig):
    """Check that integer and float targets give the same predictions.

    Two clones of the regressor are fitted on the same data, once with an
    integer ``y`` and once with ``y`` cast to float; their predictions must
    agree up to an absolute tolerance of 1e-2.
    """
    X, _ = _boston_subset()
    X = _pairwise_estimator_convert_X(X[:50], regressor_orig)
    rnd = np.random.RandomState(0)
    y = rnd.randint(3, size=X.shape[0])
    y = _enforce_estimator_tags_y(regressor_orig, y)
    # Fresh generator for the second target column drawn below.
    rnd = np.random.RandomState(0)
    # separate estimators to control random seeds
    regressor_1 = clone(regressor_orig)
    regressor_2 = clone(regressor_orig)
    set_random_state(regressor_1)
    set_random_state(regressor_2)

    if name in CROSS_DECOMPOSITION:
        # Cross-decomposition estimators get a two-column target.
        y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y

    # fit
    regressor_1.fit(X, y_)
    pred1 = regressor_1.predict(X)
    # ``np.float`` was removed from NumPy; the builtin ``float`` is the
    # exact replacement.
    regressor_2.fit(X, y_.astype(float))
    pred2 = regressor_2.predict(X)
    assert_allclose(pred1, pred2, atol=1e-2, err_msg=name)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_regressors_train(name, regressor_orig, readonly_memmap=False,
                           X_dtype=np.float64):
    """Check that a regressor fits arrays and lists and scores reasonably.

    Parameters
    ----------
    name : str
        Estimator name, used in the failure message and to special-case
        cross-decomposition estimators and ``PassiveAggressiveRegressor``.
    regressor_orig : estimator instance
        Regressor to check; it is cloned before use.
    readonly_memmap : bool, default=False
        If True, the data is passed through ``create_memmap_backed_data``.
    X_dtype : dtype, default=np.float64
        dtype the training data is cast to.
    """
    X, y = _boston_subset()
    X = X.astype(X_dtype)
    X = _pairwise_estimator_convert_X(X, regressor_orig)
    y = StandardScaler().fit_transform(y.reshape(-1, 1))  # X is already scaled
    y = y.ravel()
    regressor = clone(regressor_orig)
    y = _enforce_estimator_tags_y(regressor, y)
    if name in CROSS_DECOMPOSITION:
        # Cross-decomposition estimators get a two-column target.
        rnd = np.random.RandomState(0)
        y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y

    if readonly_memmap:
        X, y, y_ = create_memmap_backed_data([X, y, y_])

    if not hasattr(regressor, 'alphas') and hasattr(regressor, 'alpha'):
        # linear regressors need to set alpha, but not generalized CV ones
        regressor.alpha = 0.01
    if name == 'PassiveAggressiveRegressor':
        regressor.C = 0.01

    # raises error on malformed input for fit
    # (the message used to say "classifier" although a regressor is checked)
    with assert_raises(ValueError, msg="The regressor {} does not"
                       " raise an error when incorrect/malformed input "
                       "data for fit is passed. The number of training "
                       "examples is not the same as the number of "
                       "labels. Perhaps use check_X_y in fit.".format(name)):
        regressor.fit(X, y[:-1])
    # fit
    set_random_state(regressor)
    regressor.fit(X, y_)
    regressor.fit(X.tolist(), y_.tolist())
    y_pred = regressor.predict(X)
    assert y_pred.shape == y_.shape

    # TODO: find out why PLS and CCA fail. RANSAC is random
    # and furthermore assumes the presence of outliers, hence
    # skipped
    if not regressor._get_tags()["poor_score"]:
        assert regressor.score(X, y_) > 0.5
|
|
|
|
|
|
@ignore_warnings
def check_regressors_no_decision_function(name, regressor_orig):
    """Check classifier-only methods that a regressor happens to expose.

    If the fitted regressor has ``decision_function``, ``predict_proba`` or
    ``predict_log_proba``, calling it must emit a FutureWarning whose
    message contains the method name.
    """
    rng = np.random.RandomState(0)
    regressor = clone(regressor_orig)

    X = _pairwise_estimator_convert_X(rng.normal(size=(10, 4)),
                                      regressor_orig)
    y = _enforce_estimator_tags_y(regressor, X[:, 0])

    if hasattr(regressor, "n_components"):
        # FIXME CCA, PLS is not robust to rank 1 effects
        regressor.n_components = 1

    regressor.fit(X, y)
    for func_name in ("decision_function", "predict_proba",
                      "predict_log_proba"):
        func = getattr(regressor, func_name, None)
        if func is not None:
            # has function. Should raise deprecation warning
            assert_warns_message(FutureWarning, func_name, func, X)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_class_weight_classifiers(name, classifier_orig):
    """Check that a heavily weighted class dominates the predictions.

    On a very noisy blob dataset, class 0 gets weight 1000 and the other
    classes 0.0001; more than 87% of the test predictions must then be
    class 0.
    """
    binary_only = classifier_orig._get_tags()['binary_only']
    problems = [2] if binary_only else [2, 3]

    # Parameters raised (when the classifier has them) to help convergence.
    overrides = (("n_iter", 100),
                 ("max_iter", 1000),
                 ("min_weight_fraction_leaf", 0.01),
                 ("n_iter_no_change", 20))

    for n_centers in problems:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                            random_state=0)

        # can't use gram_if_pairwise() here, setting up gram matrix manually
        if _is_pairwise(classifier_orig):
            X_test = rbf_kernel(X_test, X_train)
            X_train = rbf_kernel(X_train, X_train)

        # Count the classes actually present in the training split.
        n_centers = len(np.unique(y_train))

        class_weight = {0: 1000, 1: 0.0001}
        if n_centers != 2:
            class_weight[2] = 0.0001

        classifier = clone(classifier_orig).set_params(
            class_weight=class_weight)
        for param, value in overrides:
            if hasattr(classifier, param):
                classifier.set_params(**{param: value})

        set_random_state(classifier)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        # XXX: Generally can use 0.89 here. On Windows, LinearSVC gets
        #      0.88 (Issue #9111)
        assert np.mean(y_pred == 0) > 0.87
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_class_weight_balanced_classifiers(name, classifier_orig, X_train,
                                            y_train, X_test, y_test, weights):
    """Check that ``class_weight='balanced'`` improves the weighted F1 score.

    The classifier is fitted once with its current class weights and once
    with ``class_weight='balanced'``; the second fit must score strictly
    higher on the test data.
    """
    classifier = clone(classifier_orig)
    for param, value in (("n_iter", 100), ("max_iter", 1000)):
        if hasattr(classifier, param):
            classifier.set_params(**{param: value})

    set_random_state(classifier)
    y_pred = classifier.fit(X_train, y_train).predict(X_test)

    classifier.set_params(class_weight='balanced')
    y_pred_balanced = classifier.fit(X_train, y_train).predict(X_test)

    score_balanced = f1_score(y_test, y_pred_balanced, average='weighted')
    score_default = f1_score(y_test, y_pred, average='weighted')
    assert score_balanced > score_default
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_class_weight_balanced_linear_classifier(name, Classifier):
    """Test class weights with non-contiguous class labels."""
    # this is run on classes, not instances, though this should be changed
    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                  [1.0, 1.0], [1.0, 0.0]])
    y = np.array([1, 1, 1, -1, -1])

    classifier = Classifier()

    # This is a very small dataset, default iteration counts are likely to
    # prevent convergence; cv is lowered to 3 where it exists.
    for param, value in (("n_iter", 1000), ("max_iter", 1000), ("cv", 3)):
        if hasattr(classifier, param):
            classifier.set_params(**{param: value})
    set_random_state(classifier)

    # Let the model compute the class frequencies
    classifier.set_params(class_weight='balanced')
    coef_balanced = classifier.fit(X, y).coef_.copy()

    # Count each label occurrence to reweight manually
    n_samples = len(y)
    n_classes = float(len(np.unique(y)))
    class_weight = {label: n_samples / (np.sum(y == label) * n_classes)
                    for label in (1, -1)}
    classifier.set_params(class_weight=class_weight)
    coef_manual = classifier.fit(X, y).coef_.copy()

    assert_allclose(coef_balanced, coef_manual,
                    err_msg="Classifier %s is not computing"
                    " class_weight=balanced properly."
                    % name)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_estimators_overwrite_params(name, estimator_orig):
    """Check that ``fit`` does not change or mutate constructor parameters."""
    X, y = make_blobs(random_state=0, n_samples=21)
    # some want non-negative input
    X -= X.min()
    X = _pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel)
    estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, y)

    set_random_state(estimator)

    # Make a physical copy of the original estimator parameters before fitting.
    params_before = deepcopy(estimator.get_params())

    # Fit the model
    estimator.fit(X, y)

    # Compare the state of the model parameters with the original parameters
    params_after = estimator.get_params()
    for param_name in params_before:
        original_value = params_before[param_name]
        new_value = params_after[param_name]

        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash function
        # that introspects recursively any subobjects to compute a checksum.
        # The only exception to this rule of immutable constructor parameters
        # is possible RandomState instance but in this check we explicitly
        # fixed the random_state params recursively to be integer seeds.
        unchanged = joblib.hash(new_value) == joblib.hash(original_value)
        assert unchanged, (
            "Estimator %s should not change or mutate "
            " the parameter %s from %s to %s during fit."
            % (name, param_name, original_value, new_value))
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_no_attributes_set_in_init(name, estimator_orig):
    """Check setting during init.

    ``__init__`` must store every one of its parameters as an attribute and
    must not set any attribute that is not a parameter of the class or of
    one of the classes in its MRO.
    """
    estimator = clone(estimator_orig)
    estimator_type = type(estimator)
    if hasattr(estimator_type.__init__, "deprecated_original"):
        return

    init_params = _get_args(estimator_type.__init__)
    if IS_PYPY:
        # __init__ signature has additional objects in PyPy
        for key in ['obj']:
            if key in init_params:
                init_params.remove(key)
    parents_init_params = list(chain.from_iterable(
        _get_args(parent) for parent in estimator_type.__mro__))

    # Test for no setting apart from parameters during init
    invalid_attr = (set(vars(estimator)) - set(init_params)
                    - set(parents_init_params))
    assert not invalid_attr, (
            "Estimator %s should not set any attribute apart"
            " from parameters during init. Found attributes %s."
            % (name, sorted(invalid_attr)))
    # Ensure that each parameter is set in init
    invalid_attr = set(init_params) - set(vars(estimator)) - {"self"}
    assert not invalid_attr, (
            "Estimator %s should store all parameters"
            " as an attribute during init. Did not find "
            "attributes %s."
            % (name, sorted(invalid_attr)))
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_sparsify_coefficients(name, estimator_orig):
    """Check that ``sparsify`` keeps predictions and survives pickling."""
    X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
                  [-1, -2], [2, 2], [-2, -2]])
    y = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3])
    y = _enforce_estimator_tags_y(estimator_orig, y)
    est = clone(estimator_orig)

    est.fit(X, y)
    pred_orig = est.predict(X)

    # test sparsify with dense inputs
    est.sparsify()
    assert sparse.issparse(est.coef_)
    assert_array_equal(est.predict(X), pred_orig)

    # pickle and unpickle with sparse coef_
    restored = pickle.loads(pickle.dumps(est))
    assert sparse.issparse(restored.coef_)
    assert_array_equal(restored.predict(X), pred_orig)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_classifier_data_not_an_array(name, estimator_orig):
    """Run ``check_estimators_data_not_an_array`` on a small 2-class set."""
    samples = [[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1],
               [0, 3], [1, 0], [2, 0], [4, 4], [2, 3], [3, 2]]
    labels = [1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2]
    X = _pairwise_estimator_convert_X(np.array(samples), estimator_orig)
    y = _enforce_estimator_tags_y(estimator_orig, np.array(labels))
    # Once with the array-like wrapper, once with pandas containers.
    for obj_type in ("NotAnArray", "PandasDataframe"):
        check_estimators_data_not_an_array(name, estimator_orig, X, y,
                                           obj_type)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_regressor_data_not_an_array(name, estimator_orig):
    """Run ``check_estimators_data_not_an_array`` on a Boston subset."""
    X, y = _boston_subset(n_samples=50)
    X = _pairwise_estimator_convert_X(X, estimator_orig)
    y = _enforce_estimator_tags_y(estimator_orig, y)
    # Once with the array-like wrapper, once with pandas containers.
    for obj_type in ("NotAnArray", "PandasDataframe"):
        check_estimators_data_not_an_array(name, estimator_orig, X, y,
                                           obj_type)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type):
    """Check that array-like containers give the same predictions as arrays.

    Parameters
    ----------
    name : str
        Estimator name; cross-decomposition estimators are skipped.
    estimator_orig : estimator instance
        Estimator to check; two independent clones are fitted.
    X, y : array-like
        Reference training data and target.
    obj_type : {"NotAnArray", "PandasDataframe"}
        Container the data is wrapped in for the first clone.

    Raises
    ------
    SkipTest
        For cross-decomposition estimators, or when pandas is required but
        not installed.
    ValueError
        If ``obj_type`` is not one of the supported values.
    """
    if name in CROSS_DECOMPOSITION:
        raise SkipTest("Skipping check_estimators_data_not_an_array "
                       "for cross decomposition module as estimators "
                       "are not deterministic.")
    # separate estimators to control random seeds
    estimator_1 = clone(estimator_orig)
    estimator_2 = clone(estimator_orig)
    set_random_state(estimator_1)
    set_random_state(estimator_2)

    if obj_type not in ("NotAnArray", 'PandasDataframe'):
        raise ValueError("Data type {0} not supported".format(obj_type))

    if obj_type == "NotAnArray":
        y_ = _NotAnArray(np.asarray(y))
        X_ = _NotAnArray(np.asarray(X))
    else:
        # Here pandas objects (Series and DataFrame) are tested explicitly
        # because some estimators may handle them (especially their indexing)
        # specially.
        try:
            import pandas as pd
            y_ = np.asarray(y)
            # 1d targets become a Series, anything else a DataFrame.
            y_ = pd.Series(y_) if y_.ndim == 1 else pd.DataFrame(y_)
            X_ = pd.DataFrame(np.asarray(X))

        except ImportError:
            raise SkipTest("pandas is not installed: not checking estimators "
                           "for pandas objects.")

    # Fit one clone on the wrapped data and the other on the plain data.
    pred1 = estimator_1.fit(X_, y_).predict(X_)
    pred2 = estimator_2.fit(X, y).predict(X)
    assert_allclose(pred1, pred2, atol=1e-2, err_msg=name)
|
|
|
|
|
|
def check_parameters_default_constructible(name, Estimator):
    """Check that an estimator class is constructible with default values.

    The estimator is instantiated, cloned and repr'ed, and then every
    ``__init__`` parameter is checked: it must have a default value of an
    allowed type, and ``get_params`` must return exactly that default.
    """
    # this check works on classes, not instances
    if isinstance(Estimator, BaseEstimator):
        # Convert estimator instance to its class
        # TODO: Always convert to class in 0.24, because check_estimator() will
        # only accept instances, not classes
        Estimator = Estimator.__class__

    # get rid of deprecation warnings
    with ignore_warnings(category=FutureWarning):
        # test default-constructibility
        estimator = _construct_instance(Estimator)
        # test cloning
        clone(estimator)
        # test __repr__
        repr(estimator)
        # test that set_params returns self
        assert estimator.set_params() is estimator

        # test if init does nothing but set parameters
        # this is important for grid_search etc.
        # We get the default parameters from init and then
        # compare these against the actual values of the attributes.

        # this comes from getattr. Gets rid of deprecation decorator.
        init = getattr(estimator.__init__, 'deprecated_original',
                       estimator.__init__)

        try:
            # Hyper parameters are everything except ``self``, ``*args`` and
            # ``**kwargs``.
            init_params = [
                p for p in signature(init).parameters.values()
                if p.name != 'self'
                and p.kind not in (p.VAR_KEYWORD, p.VAR_POSITIONAL)
            ]
        except (TypeError, ValueError):
            # init is not a python function.
            # true for mixins
            return
        params = estimator.get_params()
        # they can need a non-default argument
        n_required = len(getattr(estimator, '_required_parameters', []))
        init_params = init_params[n_required:]

        allowed_default_types = [str, int, float, bool, tuple, type(None),
                                 np.float64, types.FunctionType,
                                 joblib.Memory]

        for init_param in init_params:
            default = init_param.default
            assert default != init_param.empty, (
                "parameter %s for %s has no default value"
                % (init_param.name, type(estimator).__name__))
            if type(default) is type:
                assert default in [np.float64, np.int64]
            else:
                assert type(default) in allowed_default_types
            if init_param.name not in params:
                # deprecated parameter, not in get_params
                assert default is None
                continue

            param_value = params[init_param.name]
            if isinstance(param_value, np.ndarray):
                assert_array_equal(param_value, default)
            elif is_scalar_nan(param_value):
                # Allows to set default parameters to np.nan
                assert param_value is default, init_param.name
            else:
                assert param_value == default, init_param.name
|
|
|
|
|
|
# TODO: remove in 0.24
|
|
@deprecated("enforce_estimator_tags_y is deprecated in version "
            "0.22 and will be removed in version 0.24.")
def enforce_estimator_tags_y(estimator, y):
    # Deprecated public alias: forwards to the private implementation.
    return _enforce_estimator_tags_y(estimator=estimator, y=y)
|
|
|
|
|
|
def _enforce_estimator_tags_y(estimator, y):
    """Return a target compatible with the estimator's tags.

    The input array is never modified; a new array is returned whenever a
    tag requires a change.

    Parameters
    ----------
    estimator : estimator instance
        Object whose ``_get_tags()`` provides the ``requires_positive_y``,
        ``binary_only`` and ``multioutput_only`` tags.
    y : ndarray
        Target values.

    Returns
    -------
    y : ndarray
        ``y`` itself when no tag applies, otherwise the adapted target.
    """
    tags = estimator._get_tags()
    # Estimators with a `requires_positive_y` tag only accept strictly positive
    # data
    if tags["requires_positive_y"]:
        # Create strictly positive y. The minimal increment above 0 is 1, as
        # y could be of integer dtype. Built out-of-place so that the
        # caller's array (possibly a view or read-only) is left untouched.
        y = y + (1 + abs(y.min()))
    # Estimators with a `binary_only` tag only accept up to two unique y values
    if tags["binary_only"] and y.size > 0:
        y = np.where(y == y.flat[0], y, y.flat[0] + 1)
    # Estimators in mono_output_task_error raise ValueError if y is of 1-D
    # Convert into a 2-D y for those estimators.
    if tags["multioutput_only"]:
        return np.reshape(y, (-1, 1))
    return y
|
|
|
|
|
|
def _enforce_estimator_tags_x(estimator, X):
    """Adapt the data ``X`` to the constraints declared by the estimator.

    Parameters
    ----------
    estimator : object
        Object exposing ``_get_tags()``; the ``X_types`` and
        ``requires_positive_X`` tags are read, as well as the optional
        ``_pairwise`` attribute.
    X : ndarray of shape (n_samples, n_features)
        Input data. The array passed in is never modified.

    Returns
    -------
    X : ndarray
        ``X`` itself when nothing applies, otherwise the adapted data.
    """
    # Pairwise estimators only accept X of shape (`n_samples`, `n_samples`).
    # The attribute value is tested rather than its mere presence, so that
    # an estimator exposing `_pairwise = False` keeps its feature matrix.
    if getattr(estimator, '_pairwise', False):
        X = X.dot(X.T)

    tags = estimator._get_tags()

    # Estimators with `1darray` in `X_types` tag only accept
    # X of shape (`n_samples`,): keep the first column only.
    if '1darray' in tags['X_types']:
        X = X[:, 0]

    # Estimators with a `requires_positive_X` tag reject negative values:
    # shift the data so that its minimum is 0. The subtraction is done out
    # of place because X may still be the caller's array (or a view of it).
    if tags['requires_positive_X']:
        X = X - X.min()
    return X
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_non_transformer_estimators_n_iter(name, estimator_orig):
    """Check that estimators with a ``max_iter`` report ``n_iter_ >= 1``.

    The estimator is cloned, fitted on the iris data and its ``n_iter_``
    attribute is inspected. Estimators listed below are not checked.
    """
    # These models are dependent on external solvers like
    # libsvm and accessing the iter parameter is non-trivial.
    skipped_names = [
        'Ridge', 'SVR', 'NuSVR', 'NuSVC',
        'RidgeClassifier', 'SVC', 'RandomizedLasso',
        'LogisticRegressionCV', 'LinearSVC',
        'LogisticRegression',
    ]
    # Tested in test_transformer_n_iter
    skipped_names.extend(CROSS_DECOMPOSITION)
    if name in skipped_names:
        return

    estimator = clone(estimator_orig)
    if name == 'LassoLars':
        # LassoLars stops early for the default alpha=1.0 the iris dataset.
        estimator = estimator.set_params(alpha=0.)

    if not hasattr(estimator, 'max_iter'):
        return

    iris = load_iris()
    X = iris.data
    y_ = _enforce_estimator_tags_y(estimator, iris.target)

    set_random_state(estimator, 0)
    estimator.fit(X, y_)

    assert estimator.n_iter_ >= 1
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_transformer_n_iter(name, estimator_orig):
    """Check that transformers with a ``max_iter`` report ``n_iter_ >= 1``.

    Estimators whose name is in ``CROSS_DECOMPOSITION`` are fitted on a
    small hard-coded data set and expose one ``n_iter_`` per component;
    every other estimator is fitted on shifted blobs.
    """
    estimator = clone(estimator_orig)
    if not hasattr(estimator, "max_iter"):
        return

    per_component = name in CROSS_DECOMPOSITION
    if per_component:
        # Check using default data
        X = [[0., 0., 1.], [1., 0., 0.], [2., 2., 2.], [2., 5., 4.]]
        y_ = [[0.1, -0.2], [0.9, 1.1], [0.1, -0.5], [0.3, -0.2]]
    else:
        X, y_ = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                           random_state=0, n_features=2, cluster_std=0.1)
        # Shift the data so that its smallest value becomes 0.1.
        X -= X.min() - 0.1

    set_random_state(estimator, 0)
    estimator.fit(X, y_)

    if per_component:
        # These return a n_iter per component.
        assert all(iter_ >= 1 for iter_ in estimator.n_iter_)
    else:
        assert estimator.n_iter_ >= 1
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_get_params_invariance(name, estimator_orig):
    """Check that ``get_params(deep=False)`` is a subset of the deep params.

    Every (key, value) pair of the shallow parameters must also be present
    in the result of ``get_params(deep=True)``.
    """
    estimator = clone(estimator_orig)

    shallow_items = estimator.get_params(deep=False).items()
    deep_items = estimator.get_params(deep=True).items()

    for shallow_item in shallow_items:
        assert shallow_item in deep_items
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_set_params(name, estimator_orig):
    """Check that ``get_params`` mirrors what was given to ``set_params``.

    The shallow parameters are first set back unchanged, then each one is
    replaced in turn by a few fuzz values. When ``set_params`` raises a
    ``TypeError`` or ``ValueError`` a warning is emitted instead of failing,
    and a second warning is emitted if the parameters changed despite the
    exception.
    """
    estimator = clone(estimator_orig)
    msg = ("get_params result does not match what was passed to set_params")

    # Setting the current parameters again must leave them untouched.
    orig_params = estimator.get_params(deep=False)
    estimator.set_params(**orig_params)
    curr_params = estimator.get_params(deep=False)
    assert set(orig_params.keys()) == set(curr_params.keys()), msg
    for key, value in curr_params.items():
        assert orig_params[key] is value, msg

    # some fuzz values
    fuzz_values = [-np.inf, np.inf, None]

    test_params = deepcopy(orig_params)
    for param_name, default_value in orig_params.items():
        for fuzz in fuzz_values:
            test_params[param_name] = fuzz
            try:
                estimator.set_params(**test_params)
            except (TypeError, ValueError) as exc:
                e_type = exc.__class__.__name__
                # Exception occurred, possibly parameter validation
                warnings.warn("{0} occurred during set_params of param {1} on "
                              "{2}. It is recommended to delay parameter "
                              "validation until fit.".format(e_type,
                                                             param_name,
                                                             name))

                change_warning_msg = "Estimator's parameters changed after " \
                                     "set_params raised {}".format(e_type)
                # Compare against the last parameters that were observed
                # before the failing call.
                params_before_exception = curr_params
                curr_params = estimator.get_params(deep=False)
                try:
                    assert (set(params_before_exception.keys()) ==
                            set(curr_params.keys()))
                    for key, value in curr_params.items():
                        assert params_before_exception[key] is value
                except AssertionError:
                    warnings.warn(change_warning_msg)
            else:
                # The call succeeded: the estimator must now hold exactly
                # the objects that were passed in.
                curr_params = estimator.get_params(deep=False)
                assert (set(test_params.keys()) ==
                        set(curr_params.keys())), msg
                for key, value in curr_params.items():
                    assert test_params[key] is value, msg
        # Restore the original value before fuzzing the next parameter.
        test_params[param_name] = default_value
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_classifiers_regression_target(name, estimator_orig):
    """Check that a classifier raises when fed regression targets.

    Fitting on the Boston data must raise a ``ValueError`` matching
    ``'Unknown label type: '``, unless the ``no_validation`` tag is set.
    """
    X, y = load_boston(return_X_y=True)
    classifier = clone(estimator_orig)
    if classifier._get_tags()["no_validation"]:
        return

    expected_msg = 'Unknown label type: '
    assert_raises_regex(ValueError, expected_msg, classifier.fit, X, y)
|
|
|
|
|
|
@ignore_warnings(category=FutureWarning)
def check_decision_proba_consistency(name, estimator_orig):
    """Check rank agreement of ``decision_function`` and ``predict_proba``.

    For an estimator exposing both methods, the positive-class probability
    and the decision value must have perfect rank correlation on the test
    points. Estimators lacking either method are not checked.
    """
    estimator = clone(estimator_orig)
    if not (hasattr(estimator, "decision_function") and
            hasattr(estimator, "predict_proba")):
        return

    centers = [(2, 2), (4, 4)]
    # NOTE(review): the centers are 2-D points, so `n_features=4` looks
    # ineffective here -- confirm against make_blobs before removing it.
    X, y = make_blobs(n_samples=100, random_state=0, n_features=4,
                      centers=centers, cluster_std=1.0, shuffle=True)
    # Draw the test points from a local, seeded generator so that the check
    # is reproducible and leaves the global numpy random state untouched.
    rng = np.random.RandomState(0)
    X_test = rng.randn(20, 2) + 4

    estimator.fit(X, y)
    # Since the link function from decision_function() to predict_proba()
    # is sometimes not precise enough (typically expit), we round to the
    # 10th decimal to avoid numerical issues.
    a = estimator.predict_proba(X_test)[:, 1].round(decimals=10)
    b = estimator.decision_function(X_test).round(decimals=10)
    assert_array_equal(rankdata(a), rankdata(b))
|
|
|
|
|
|
def check_outliers_fit_predict(name, estimator_orig):
    """Check ``fit_predict`` for outlier detectors.

    The predictions must be an integer array of shape (n_samples,) whose
    unique values are -1 and 1, and must agree with ``fit(X).predict(X)``
    when ``predict`` exists. When the estimator has a ``contamination``
    parameter, the number of detected outliers and the validation of
    out-of-range values are checked as well.
    """
    X, _ = make_blobs(n_samples=300, random_state=0)
    X = shuffle(X, random_state=7)
    n_samples = X.shape[0]

    estimator = clone(estimator_orig)
    set_random_state(estimator)

    y_pred = estimator.fit_predict(X)
    assert y_pred.shape == (n_samples,)
    assert y_pred.dtype.kind == 'i'
    assert_array_equal(np.unique(y_pred), np.array([-1, 1]))

    # check fit_predict = fit.predict when the estimator has both a predict
    # and a fit_predict method. recall that it is already assumed here that
    # the estimator has a fit_predict method
    if hasattr(estimator, 'predict'):
        refit_pred = estimator.fit(X).predict(X)
        assert_array_equal(y_pred, refit_pred)

    if not hasattr(estimator, "contamination"):
        return

    # proportion of outliers equal to contamination parameter when not
    # set to 'auto'
    expected_outliers = 30
    estimator.set_params(contamination=expected_outliers / n_samples)
    y_pred = estimator.fit_predict(X)

    num_outliers = np.sum(y_pred != 1)
    # num_outliers should be equal to expected_outliers unless
    # there are ties in the decision_function values. this can
    # only be tested for estimators with a decision_function
    # method
    mismatch = num_outliers != expected_outliers
    if mismatch and hasattr(estimator, 'decision_function'):
        decision = estimator.decision_function(X)
        check_outlier_corruption(num_outliers, expected_outliers, decision)

    # raises error when contamination is a scalar and not in [0,1]
    for invalid_contamination in (-0.5, 2.3):
        estimator.set_params(contamination=invalid_contamination)
        assert_raises(ValueError, estimator.fit_predict, X)
|
|
|
|
|
|
def check_fit_non_negative(name, estimator_orig):
    """Check that ``fit`` rejects data containing negative values.

    Fitting on an X with negative entries must raise a ``ValueError``
    matching "Negative values in data passed to". Meant for estimators
    carrying the ``requires_positive_X`` tag; the tag itself is not
    inspected here.
    """
    negative_X = np.array([[-1., 1], [-1., 1]])
    targets = np.array([1, 2])
    estimator = clone(estimator_orig)
    assert_raises_regex(
        ValueError, "Negative values in data passed to",
        estimator.fit, negative_X, targets)
|
|
|
|
|
|
def check_fit_idempotent(name, estimator_orig):
    """Check that ``est.fit(X)`` is the same as ``est.fit(X).fit(X)``.

    Ideally the attributes estimated during training (e.g. coefs_) would be
    compared, but a universal comparison function for those is difficult
    and full of edge cases. Instead the outputs of predict(), transform(),
    decision_function() and predict_proba() are compared between the first
    and the second fit.
    """
    check_methods = ["predict", "transform", "decision_function",
                     "predict_proba"]
    rng = np.random.RandomState(0)

    estimator = clone(estimator_orig)
    set_random_state(estimator)
    if 'warm_start' in estimator.get_params().keys():
        estimator.set_params(warm_start=False)

    n_samples = 100
    X = rng.normal(loc=100, size=(n_samples, 2))
    X = _pairwise_estimator_convert_X(X, estimator)
    if is_regressor(estimator_orig):
        y = rng.normal(size=n_samples)
    else:
        y = rng.randint(low=0, high=2, size=n_samples)
    y = _enforce_estimator_tags_y(estimator, y)

    splitter = ShuffleSplit(test_size=.2, random_state=rng)
    train, test = next(splitter.split(X))
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, _ = _safe_split(estimator, X, y, test, train)

    # Fit for the first time
    estimator.fit(X_train, y_train)

    result = {}
    for method in check_methods:
        if hasattr(estimator, method):
            result[method] = getattr(estimator, method)(X_test)

    # Fit again
    set_random_state(estimator)
    estimator.fit(X_train, y_train)

    for method in check_methods:
        if not hasattr(estimator, method):
            continue
        new_result = getattr(estimator, method)(X_test)
        # Tolerance is derived from the output dtype when it is floating
        # point, and from float64 otherwise.
        if np.issubdtype(new_result.dtype, np.floating):
            eps_dtype = new_result.dtype
        else:
            eps_dtype = np.float64
        tol = 2*np.finfo(eps_dtype).eps
        assert_allclose_dense_sparse(
            result[method], new_result,
            atol=max(tol, 1e-9), rtol=max(tol, 1e-7),
            err_msg="Idempotency check failed for method {}".format(method)
        )
|
|
|
|
|
|
def check_n_features_in(name, estimator_orig):
    """Check the ``n_features_in_`` attribute around ``fit``.

    The attribute must not exist before ``fit`` is called. After ``fit`` it
    must equal the number of columns of X; when it is missing altogether a
    ``FutureWarning`` is emitted instead of failing.
    """
    rng = np.random.RandomState(0)

    estimator = clone(estimator_orig)
    set_random_state(estimator)
    if 'warm_start' in estimator.get_params():
        estimator.set_params(warm_start=False)

    n_samples = 100
    X = rng.normal(loc=100, size=(n_samples, 2))
    X = _pairwise_estimator_convert_X(X, estimator)
    if is_regressor(estimator_orig):
        y = rng.normal(size=n_samples)
    else:
        y = rng.randint(low=0, high=2, size=n_samples)
    y = _enforce_estimator_tags_y(estimator, y)

    assert not hasattr(estimator, 'n_features_in_')
    estimator.fit(X, y)

    if not hasattr(estimator, 'n_features_in_'):
        warnings.warn(
            "As of scikit-learn 0.23, estimators should expose a "
            "n_features_in_ attribute, unless the 'no_validation' tag is "
            "True. This attribute should be equal to the number of features "
            "passed to the fit method. "
            "An error will be raised from version 0.25 when calling "
            "check_estimator(). "
            "See SLEP010: "
            "https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep010/proposal.html",  # noqa
            FutureWarning
        )
        return

    assert estimator.n_features_in_ == X.shape[1]
|
|
|
|
|
|
def check_requires_y_none(name, estimator_orig):
    """Check that fitting with ``y=None`` fails gracefully.

    ``fit(X, None)`` is called. If it raises a ``ValueError`` whose message
    contains none of the expected texts, a ``FutureWarning`` about the
    ``requires_y`` tag is emitted. Nothing happens when ``fit`` succeeds.
    """
    rng = np.random.RandomState(0)

    estimator = clone(estimator_orig)
    set_random_state(estimator)

    n_samples = 100
    X = rng.normal(loc=100, size=(n_samples, 2))
    X = _pairwise_estimator_convert_X(X, estimator)

    warning_msg = ("As of scikit-learn 0.23, estimators should have a "
                   "'requires_y' tag set to the appropriate value. "
                   "The default value of the tag is False. "
                   "An error will be raised from version 0.25 when calling "
                   "check_estimator() if the tag isn't properly set.")

    expected_err_msgs = (
        "requires y to be passed, but the target y is None",
        "Expected array-like (array or non-string sequence), got None",
        "y should be a 1d array"
    )

    try:
        estimator.fit(X, None)
    except ValueError as ve:
        error_text = str(ve)
        recognised = any(msg in error_text for msg in expected_err_msgs)
        if not recognised:
            warnings.warn(warning_msg, FutureWarning)
|