Uploaded test files

This commit is contained in:
parent  f584ad9d97
commit  2e81cb7d99

16627 changed files with 2065359 additions and 102444 deletions
0
venv/Lib/site-packages/sklearn/tests/__init__.py
Normal file
Binary file not shown. (21 binary files changed; contents not shown in this diff)
539
venv/Lib/site-packages/sklearn/tests/test_base.py
Normal file
@@ -0,0 +1,539 @@
# Author: Gael Varoquaux
# License: BSD 3 clause

import numpy as np
import scipy.sparse as sp
import pytest

import sklearn
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_raises
from sklearn.utils._testing import assert_no_warnings
from sklearn.utils._testing import assert_warns_message
from sklearn.utils._testing import ignore_warnings

from sklearn.base import BaseEstimator, clone, is_classifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import datasets

from sklearn.base import TransformerMixin
from sklearn.utils._mocking import MockDataFrame
from sklearn import config_context
import pickle


#############################################################################
# A few test classes
class MyEstimator(BaseEstimator):

    def __init__(self, l1=0, empty=None):
        self.l1 = l1
        self.empty = empty


class K(BaseEstimator):
    def __init__(self, c=None, d=None):
        self.c = c
        self.d = d


class T(BaseEstimator):
    def __init__(self, a=None, b=None):
        self.a = a
        self.b = b


class NaNTag(BaseEstimator):
    def _more_tags(self):
        return {'allow_nan': True}


class NoNaNTag(BaseEstimator):
    def _more_tags(self):
        return {'allow_nan': False}


class OverrideTag(NaNTag):
    def _more_tags(self):
        return {'allow_nan': False}


class DiamondOverwriteTag(NaNTag, NoNaNTag):
    def _more_tags(self):
        return dict()


class InheritDiamondOverwriteTag(DiamondOverwriteTag):
    pass


class ModifyInitParams(BaseEstimator):
    """Deprecated behavior.
    Equal parameters but with a type cast.
    Doesn't fulfill a is a
    """
    def __init__(self, a=np.array([0])):
        self.a = a.copy()


class Buggy(BaseEstimator):
    " A buggy estimator that does not set its parameters right. "

    def __init__(self, a=None):
        self.a = 1


class NoEstimator:
    def __init__(self):
        pass

    def fit(self, X=None, y=None):
        return self

    def predict(self, X=None):
        return None


class VargEstimator(BaseEstimator):
    """scikit-learn estimators shouldn't have vargs."""
    def __init__(self, *vargs):
        pass


#############################################################################
# The tests

def test_clone():
    # Tests that clone creates a correct deep copy.
    # We create an estimator, make a copy of its original state
    # (which, in this case, is the current state of the estimator),
    # and check that the obtained copy is a correct deep copy.

    from sklearn.feature_selection import SelectFpr, f_classif

    selector = SelectFpr(f_classif, alpha=0.1)
    new_selector = clone(selector)
    assert selector is not new_selector
    assert selector.get_params() == new_selector.get_params()

    selector = SelectFpr(f_classif, alpha=np.zeros((10, 2)))
    new_selector = clone(selector)
    assert selector is not new_selector


def test_clone_2():
    # Tests that clone doesn't copy everything.
    # We first create an estimator, give it an own attribute, and
    # make a copy of its original state. Then we check that the copy doesn't
    # have the specific attribute we manually added to the initial estimator.

    from sklearn.feature_selection import SelectFpr, f_classif

    selector = SelectFpr(f_classif, alpha=0.1)
    selector.own_attribute = "test"
    new_selector = clone(selector)
    assert not hasattr(new_selector, "own_attribute")


def test_clone_buggy():
    # Check that clone raises an error on buggy estimators.
    buggy = Buggy()
    buggy.a = 2
    assert_raises(RuntimeError, clone, buggy)

    no_estimator = NoEstimator()
    assert_raises(TypeError, clone, no_estimator)

    varg_est = VargEstimator()
    assert_raises(RuntimeError, clone, varg_est)

    est = ModifyInitParams()
    assert_raises(RuntimeError, clone, est)


def test_clone_empty_array():
    # Regression test for cloning estimators with empty arrays
    clf = MyEstimator(empty=np.array([]))
    clf2 = clone(clf)
    assert_array_equal(clf.empty, clf2.empty)

    clf = MyEstimator(empty=sp.csr_matrix(np.array([[0]])))
    clf2 = clone(clf)
    assert_array_equal(clf.empty.data, clf2.empty.data)


def test_clone_nan():
    # Regression test for cloning estimators with default parameter as np.nan
    clf = MyEstimator(empty=np.nan)
    clf2 = clone(clf)

    assert clf.empty is clf2.empty


def test_clone_sparse_matrices():
    sparse_matrix_classes = [
        getattr(sp, name)
        for name in dir(sp) if name.endswith('_matrix')]

    for cls in sparse_matrix_classes:
        sparse_matrix = cls(np.eye(5))
        clf = MyEstimator(empty=sparse_matrix)
        clf_cloned = clone(clf)
        assert clf.empty.__class__ is clf_cloned.empty.__class__
        assert_array_equal(clf.empty.toarray(), clf_cloned.empty.toarray())


def test_clone_estimator_types():
    # Check that clone works for parameters that are types rather than
    # instances
    clf = MyEstimator(empty=MyEstimator)
    clf2 = clone(clf)

    assert clf.empty is clf2.empty


def test_clone_class_rather_than_instance():
    # Check that clone raises expected error message when
    # cloning class rather than instance
    msg = "You should provide an instance of scikit-learn estimator"
    with pytest.raises(TypeError, match=msg):
        clone(MyEstimator)


def test_repr():
    # Smoke test the repr of the base estimator.
    my_estimator = MyEstimator()
    repr(my_estimator)
    test = T(K(), K())
    assert (
        repr(test) ==
        "T(a=K(), b=K())")

    some_est = T(a=["long_params"] * 1000)
    assert len(repr(some_est)) == 485


def test_str():
    # Smoke test the str of the base estimator
    my_estimator = MyEstimator()
    str(my_estimator)


def test_get_params():
    test = T(K(), K())

    assert 'a__d' in test.get_params(deep=True)
    assert 'a__d' not in test.get_params(deep=False)

    test.set_params(a__d=2)
    assert test.a.d == 2
    assert_raises(ValueError, test.set_params, a__a=2)


def test_is_classifier():
    svc = SVC()
    assert is_classifier(svc)
    assert is_classifier(GridSearchCV(svc, {'C': [0.1, 1]}))
    assert is_classifier(Pipeline([('svc', svc)]))
    assert is_classifier(Pipeline(
        [('svc_cv', GridSearchCV(svc, {'C': [0.1, 1]}))]))


def test_set_params():
    # test nested estimator parameter setting
    clf = Pipeline([("svc", SVC())])
    # non-existing parameter in svc
    assert_raises(ValueError, clf.set_params, svc__stupid_param=True)
    # non-existing parameter of pipeline
    assert_raises(ValueError, clf.set_params, svm__stupid_param=True)
    # we don't currently catch if the things in pipeline are estimators
    # bad_pipeline = Pipeline([("bad", NoEstimator())])
    # assert_raises(AttributeError, bad_pipeline.set_params,
    #               bad__stupid_param=True)


def test_set_params_passes_all_parameters():
    # Make sure all parameters are passed together to set_params
    # of nested estimator. Regression test for #9944

    class TestDecisionTree(DecisionTreeClassifier):
        def set_params(self, **kwargs):
            super().set_params(**kwargs)
            # expected_kwargs is in test scope
            assert kwargs == expected_kwargs
            return self

    expected_kwargs = {'max_depth': 5, 'min_samples_leaf': 2}
    for est in [Pipeline([('estimator', TestDecisionTree())]),
                GridSearchCV(TestDecisionTree(), {})]:
        est.set_params(estimator__max_depth=5,
                       estimator__min_samples_leaf=2)


def test_set_params_updates_valid_params():
    # Check that set_params tries to set SVC().C, not
    # DecisionTreeClassifier().C
    gscv = GridSearchCV(DecisionTreeClassifier(), {})
    gscv.set_params(estimator=SVC(), estimator__C=42.0)
    assert gscv.estimator.C == 42.0


def test_score_sample_weight():

    rng = np.random.RandomState(0)

    # test both ClassifierMixin and RegressorMixin
    estimators = [DecisionTreeClassifier(max_depth=2),
                  DecisionTreeRegressor(max_depth=2)]
    sets = [datasets.load_iris(),
            datasets.load_boston()]

    for est, ds in zip(estimators, sets):
        est.fit(ds.data, ds.target)
        # generate random sample weights
        sample_weight = rng.randint(1, 10, size=len(ds.target))
        # check that the score with and without sample weights are different
        assert (est.score(ds.data, ds.target) !=
                est.score(ds.data, ds.target,
                          sample_weight=sample_weight)), (
                              "Unweighted and weighted scores "
                              "are unexpectedly equal")


def test_clone_pandas_dataframe():

    class DummyEstimator(TransformerMixin, BaseEstimator):
        """This is a dummy class for generating numerical features

        This feature extractor extracts numerical features from pandas data
        frame.

        Parameters
        ----------

        df: pandas data frame
            The pandas data frame parameter.

        Notes
        -----
        """
        def __init__(self, df=None, scalar_param=1):
            self.df = df
            self.scalar_param = scalar_param

        def fit(self, X, y=None):
            pass

        def transform(self, X):
            pass

    # build and clone estimator
    d = np.arange(10)
    df = MockDataFrame(d)
    e = DummyEstimator(df, scalar_param=1)
    cloned_e = clone(e)

    # the test
    assert (e.df == cloned_e.df).values.all()
    assert e.scalar_param == cloned_e.scalar_param


def test_pickle_version_warning_is_not_raised_with_matching_version():
    iris = datasets.load_iris()
    tree = DecisionTreeClassifier().fit(iris.data, iris.target)
    tree_pickle = pickle.dumps(tree)
    assert b"version" in tree_pickle
    tree_restored = assert_no_warnings(pickle.loads, tree_pickle)

    # test that we can predict with the restored decision tree classifier
    score_of_original = tree.score(iris.data, iris.target)
    score_of_restored = tree_restored.score(iris.data, iris.target)
    assert score_of_original == score_of_restored


class TreeBadVersion(DecisionTreeClassifier):
    def __getstate__(self):
        return dict(self.__dict__.items(), _sklearn_version="something")


pickle_error_message = (
    "Trying to unpickle estimator {estimator} from "
    "version {old_version} when using version "
    "{current_version}. This might "
    "lead to breaking code or invalid results. "
    "Use at your own risk.")


def test_pickle_version_warning_is_issued_upon_different_version():
    iris = datasets.load_iris()
    tree = TreeBadVersion().fit(iris.data, iris.target)
    tree_pickle_other = pickle.dumps(tree)
    message = pickle_error_message.format(estimator="TreeBadVersion",
                                          old_version="something",
                                          current_version=sklearn.__version__)
    assert_warns_message(UserWarning, message, pickle.loads, tree_pickle_other)


class TreeNoVersion(DecisionTreeClassifier):
    def __getstate__(self):
        return self.__dict__


def test_pickle_version_warning_is_issued_when_no_version_info_in_pickle():
    iris = datasets.load_iris()
    # TreeNoVersion has no getstate, like pre-0.18
    tree = TreeNoVersion().fit(iris.data, iris.target)

    tree_pickle_noversion = pickle.dumps(tree)
    assert b"version" not in tree_pickle_noversion
    message = pickle_error_message.format(estimator="TreeNoVersion",
                                          old_version="pre-0.18",
                                          current_version=sklearn.__version__)
    # check we got the warning about using pre-0.18 pickle
    assert_warns_message(UserWarning, message, pickle.loads,
                         tree_pickle_noversion)


def test_pickle_version_no_warning_is_issued_with_non_sklearn_estimator():
    iris = datasets.load_iris()
    tree = TreeNoVersion().fit(iris.data, iris.target)
    tree_pickle_noversion = pickle.dumps(tree)
    try:
        module_backup = TreeNoVersion.__module__
        TreeNoVersion.__module__ = "notsklearn"
        assert_no_warnings(pickle.loads, tree_pickle_noversion)
    finally:
        TreeNoVersion.__module__ = module_backup


class DontPickleAttributeMixin:
    def __getstate__(self):
        data = self.__dict__.copy()
        data["_attribute_not_pickled"] = None
        return data

    def __setstate__(self, state):
        state["_restored"] = True
        self.__dict__.update(state)


class MultiInheritanceEstimator(DontPickleAttributeMixin, BaseEstimator):
    def __init__(self, attribute_pickled=5):
        self.attribute_pickled = attribute_pickled
        self._attribute_not_pickled = None


def test_pickling_when_getstate_is_overwritten_by_mixin():
    estimator = MultiInheritanceEstimator()
    estimator._attribute_not_pickled = "this attribute should not be pickled"

    serialized = pickle.dumps(estimator)
    estimator_restored = pickle.loads(serialized)
    assert estimator_restored.attribute_pickled == 5
    assert estimator_restored._attribute_not_pickled is None
    assert estimator_restored._restored


def test_pickling_when_getstate_is_overwritten_by_mixin_outside_of_sklearn():
    try:
        estimator = MultiInheritanceEstimator()
        text = "this attribute should not be pickled"
        estimator._attribute_not_pickled = text
        old_mod = type(estimator).__module__
        type(estimator).__module__ = "notsklearn"

        serialized = estimator.__getstate__()
        assert serialized == {'_attribute_not_pickled': None,
                              'attribute_pickled': 5}

        serialized['attribute_pickled'] = 4
        estimator.__setstate__(serialized)
        assert estimator.attribute_pickled == 4
        assert estimator._restored
    finally:
        type(estimator).__module__ = old_mod


class SingleInheritanceEstimator(BaseEstimator):
    def __init__(self, attribute_pickled=5):
        self.attribute_pickled = attribute_pickled
        self._attribute_not_pickled = None

    def __getstate__(self):
        data = self.__dict__.copy()
        data["_attribute_not_pickled"] = None
        return data


@ignore_warnings(category=(UserWarning))
def test_pickling_works_when_getstate_is_overwritten_in_the_child_class():
    estimator = SingleInheritanceEstimator()
    estimator._attribute_not_pickled = "this attribute should not be pickled"

    serialized = pickle.dumps(estimator)
    estimator_restored = pickle.loads(serialized)
    assert estimator_restored.attribute_pickled == 5
    assert estimator_restored._attribute_not_pickled is None


def test_tag_inheritance():
    # test that changing tags by inheritance is not allowed

    nan_tag_est = NaNTag()
    no_nan_tag_est = NoNaNTag()
    assert nan_tag_est._get_tags()['allow_nan']
    assert not no_nan_tag_est._get_tags()['allow_nan']

    redefine_tags_est = OverrideTag()
    assert not redefine_tags_est._get_tags()['allow_nan']

    diamond_tag_est = DiamondOverwriteTag()
    assert diamond_tag_est._get_tags()['allow_nan']

    inherit_diamond_tag_est = InheritDiamondOverwriteTag()
    assert inherit_diamond_tag_est._get_tags()['allow_nan']


def test_warns_on_get_params_non_attribute():
    class MyEstimator(BaseEstimator):
        def __init__(self, param=5):
            pass

        def fit(self, X, y=None):
            return self

    est = MyEstimator()
    with pytest.warns(FutureWarning, match='AttributeError'):
        params = est.get_params()

    assert params['param'] is None


def test_repr_mimebundle_():
    # Checks the display configuration flag controls the json output
    tree = DecisionTreeClassifier()
    output = tree._repr_mimebundle_()
    assert "text/plain" in output
    assert "text/html" not in output

    with config_context(display='diagram'):
        output = tree._repr_mimebundle_()
        assert "text/plain" in output
        assert "text/html" in output


def test_repr_html_wraps():
    # Checks the display configuration flag controls the html output
    tree = DecisionTreeClassifier()
    msg = "_repr_html_ is only defined when"
    with pytest.raises(AttributeError, match=msg):
        output = tree._repr_html_()

    with config_context(display='diagram'):
        output = tree._repr_html_()
        assert "<style>" in output
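The tests above exercise the contract that clone() rebuilds an estimator purely from the constructor parameters reported by get_params(), dropping any fitted or ad-hoc attributes. A minimal sketch of that contract, using only the public scikit-learn API (illustrative only; TinyEstimator is a hypothetical class and not part of the committed file):

# Hedged sketch of the clone()/get_params() contract -- not part of the upload.
from sklearn.base import BaseEstimator, clone


class TinyEstimator(BaseEstimator):
    """Stores constructor arguments unmodified, as BaseEstimator expects."""

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y=None):
        self.fitted_attr_ = "set during fit"  # trailing underscore: fitted state, not a parameter
        return self


est = TinyEstimator(alpha=0.5).fit([[0.0], [1.0]])
copy = clone(est)
assert copy.get_params() == est.get_params()  # constructor params survive the clone
assert not hasattr(copy, "fitted_attr_")      # fitted state does not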
32
venv/Lib/site-packages/sklearn/tests/test_build.py
Normal file
@@ -0,0 +1,32 @@
import os
import pytest
import textwrap

from sklearn import __version__
from sklearn.utils._openmp_helpers import _openmp_parallelism_enabled


def test_openmp_parallelism_enabled():
    # Check that sklearn is built with OpenMP-based parallelism enabled.
    # This test can be skipped by setting the environment variable
    # ``SKLEARN_SKIP_OPENMP_TEST``.
    if os.getenv("SKLEARN_SKIP_OPENMP_TEST"):
        pytest.skip("test explicitly skipped (SKLEARN_SKIP_OPENMP_TEST)")

    base_url = "dev" if __version__.endswith(".dev0") else "stable"
    err_msg = textwrap.dedent(
        """
        This test fails because scikit-learn has been built without OpenMP.
        This is not recommended since some estimators will run in sequential
        mode instead of leveraging thread-based parallelism.

        You can find instructions to build scikit-learn with OpenMP at this
        address:

            https://scikit-learn.org/{}/developers/advanced_installation.html

        You can skip this test by setting the environment variable
        SKLEARN_SKIP_OPENMP_TEST to any value.
        """).format(base_url)

    assert _openmp_parallelism_enabled(), err_msg
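The skip hook in test_openmp_parallelism_enabled() only checks whether the variable is set, not what its value is. A hedged sketch of driving that hook programmatically (illustrative only; it just sets the variable the test above reads before invoking pytest):

# Sketch: skip the OpenMP check for a non-OpenMP build -- not part of the upload.
import os
import pytest

os.environ["SKLEARN_SKIP_OPENMP_TEST"] = "1"  # any value triggers pytest.skip above
pytest.main(["venv/Lib/site-packages/sklearn/tests/test_build.py"])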
343
venv/Lib/site-packages/sklearn/tests/test_calibration.py
Normal file
@@ -0,0 +1,343 @@
# Authors: Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
# License: BSD 3 clause

import pytest
import numpy as np
from scipy import sparse

from sklearn.base import BaseEstimator
from sklearn.model_selection import LeaveOneOut

from sklearn.utils._testing import (assert_array_almost_equal,
                                    assert_almost_equal,
                                    assert_array_equal,
                                    assert_raises, ignore_warnings)
from sklearn.datasets import make_classification, make_blobs
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import brier_score_loss, log_loss
from sklearn.calibration import CalibratedClassifierCV
from sklearn.calibration import _sigmoid_calibration, _SigmoidCalibration
from sklearn.calibration import calibration_curve


def test_calibration():
    """Test calibration objects with isotonic and sigmoid"""
    n_samples = 100
    X, y = make_classification(n_samples=2 * n_samples, n_features=6,
                               random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test, y_test = X[n_samples:], y[n_samples:]

    # Naive-Bayes
    clf = MultinomialNB().fit(X_train, y_train, sample_weight=sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    pc_clf = CalibratedClassifierCV(clf, cv=y.size + 1)
    assert_raises(ValueError, pc_clf.fit, X, y)

    # Naive Bayes with calibration
    for this_X_train, this_X_test in [(X_train, X_test),
                                      (sparse.csr_matrix(X_train),
                                       sparse.csr_matrix(X_test))]:
        for method in ['isotonic', 'sigmoid']:
            pc_clf = CalibratedClassifierCV(clf, method=method, cv=2)
            # Note that this fit overwrites the fit on the entire training
            # set
            pc_clf.fit(this_X_train, y_train, sample_weight=sw_train)
            prob_pos_pc_clf = pc_clf.predict_proba(this_X_test)[:, 1]

            # Check that brier score has improved after calibration
            assert (brier_score_loss(y_test, prob_pos_clf) >
                    brier_score_loss(y_test, prob_pos_pc_clf))

            # Check invariance against relabeling [0, 1] -> [1, 2]
            pc_clf.fit(this_X_train, y_train + 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [-1, 1]
            pc_clf.fit(this_X_train, 2 * y_train - 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [1, 0]
            pc_clf.fit(this_X_train, (y_train + 1) % 2,
                       sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = \
                pc_clf.predict_proba(this_X_test)[:, 1]
            if method == "sigmoid":
                assert_array_almost_equal(prob_pos_pc_clf,
                                          1 - prob_pos_pc_clf_relabeled)
            else:
                # Isotonic calibration is not invariant against relabeling
                # but should improve in both cases
                assert (brier_score_loss(y_test, prob_pos_clf) >
                        brier_score_loss((y_test + 1) % 2,
                                         prob_pos_pc_clf_relabeled))

    # Check failure cases:
    # only "isotonic" and "sigmoid" should be accepted as methods
    clf_invalid_method = CalibratedClassifierCV(clf, method="foo")
    assert_raises(ValueError, clf_invalid_method.fit, X_train, y_train)

    # base-estimators should provide either decision_function or
    # predict_proba (most regressors, for instance, should fail)
    clf_base_regressor = \
        CalibratedClassifierCV(RandomForestRegressor(), method="sigmoid")
    assert_raises(RuntimeError, clf_base_regressor.fit, X_train, y_train)


def test_sample_weight():
    n_samples = 100
    X, y = make_classification(n_samples=2 * n_samples, n_features=6,
                               random_state=42)

    sample_weight = np.random.RandomState(seed=42).uniform(size=len(y))
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test = X[n_samples:]

    for method in ['sigmoid', 'isotonic']:
        base_estimator = LinearSVC(random_state=42)
        calibrated_clf = CalibratedClassifierCV(base_estimator, method=method)
        calibrated_clf.fit(X_train, y_train, sample_weight=sw_train)
        probs_with_sw = calibrated_clf.predict_proba(X_test)

        # As the weights are used for the calibration, they should still yield
        # different predictions
        calibrated_clf.fit(X_train, y_train)
        probs_without_sw = calibrated_clf.predict_proba(X_test)

        diff = np.linalg.norm(probs_with_sw - probs_without_sw)
        assert diff > 0.1


def test_calibration_multiclass():
    """Test calibration for multiclass"""
    # test multi-class setting with classifier that implements
    # only decision function
    clf = LinearSVC()
    X, y_idx = make_blobs(n_samples=100, n_features=2, random_state=42,
                          centers=3, cluster_std=3.0)

    # Use categorical labels to check that CalibratedClassifierCV supports
    # them correctly
    target_names = np.array(['a', 'b', 'c'])
    y = target_names[y_idx]

    X_train, y_train = X[::2], y[::2]
    X_test, y_test = X[1::2], y[1::2]

    clf.fit(X_train, y_train)
    for method in ['isotonic', 'sigmoid']:
        cal_clf = CalibratedClassifierCV(clf, method=method, cv=2)
        cal_clf.fit(X_train, y_train)
        probas = cal_clf.predict_proba(X_test)
        assert_array_almost_equal(np.sum(probas, axis=1), np.ones(len(X_test)))

        # Check that log-loss of calibrated classifier is smaller than
        # log-loss of naively turned OvR decision function to probabilities
        # via softmax
        def softmax(y_pred):
            e = np.exp(-y_pred)
            return e / e.sum(axis=1).reshape(-1, 1)

        uncalibrated_log_loss = \
            log_loss(y_test, softmax(clf.decision_function(X_test)))
        calibrated_log_loss = log_loss(y_test, probas)
        assert uncalibrated_log_loss >= calibrated_log_loss

    # Test that calibration of a multiclass classifier decreases log-loss
    # for RandomForestClassifier
    X, y = make_blobs(n_samples=100, n_features=2, random_state=42,
                      cluster_std=3.0)
    X_train, y_train = X[::2], y[::2]
    X_test, y_test = X[1::2], y[1::2]

    clf = RandomForestClassifier(n_estimators=10, random_state=42)
    clf.fit(X_train, y_train)
    clf_probs = clf.predict_proba(X_test)
    loss = log_loss(y_test, clf_probs)

    for method in ['isotonic', 'sigmoid']:
        cal_clf = CalibratedClassifierCV(clf, method=method, cv=3)
        cal_clf.fit(X_train, y_train)
        cal_clf_probs = cal_clf.predict_proba(X_test)
        cal_loss = log_loss(y_test, cal_clf_probs)
        assert loss > cal_loss


def test_calibration_prefit():
    """Test calibration for prefitted classifiers"""
    n_samples = 50
    X, y = make_classification(n_samples=3 * n_samples, n_features=6,
                               random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_calib, y_calib, sw_calib = \
        X[n_samples:2 * n_samples], y[n_samples:2 * n_samples], \
        sample_weight[n_samples:2 * n_samples]
    X_test, y_test = X[2 * n_samples:], y[2 * n_samples:]

    # Naive-Bayes
    clf = MultinomialNB()
    clf.fit(X_train, y_train, sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    # Naive Bayes with calibration
    for this_X_calib, this_X_test in [(X_calib, X_test),
                                      (sparse.csr_matrix(X_calib),
                                       sparse.csr_matrix(X_test))]:
        for method in ['isotonic', 'sigmoid']:
            pc_clf = CalibratedClassifierCV(clf, method=method, cv="prefit")

            for sw in [sw_calib, None]:
                pc_clf.fit(this_X_calib, y_calib, sample_weight=sw)
                y_prob = pc_clf.predict_proba(this_X_test)
                y_pred = pc_clf.predict(this_X_test)
                prob_pos_pc_clf = y_prob[:, 1]
                assert_array_equal(y_pred,
                                   np.array([0, 1])[np.argmax(y_prob, axis=1)])

                assert (brier_score_loss(y_test, prob_pos_clf) >
                        brier_score_loss(y_test, prob_pos_pc_clf))


def test_sigmoid_calibration():
    """Test calibration values with Platt sigmoid model"""
    exF = np.array([5, -4, 1.0])
    exY = np.array([1, -1, -1])
    # computed from my python port of the C++ code in LibSVM
    AB_lin_libsvm = np.array([-0.20261354391187855, 0.65236314980010512])
    assert_array_almost_equal(AB_lin_libsvm,
                              _sigmoid_calibration(exF, exY), 3)
    lin_prob = 1. / (1. + np.exp(AB_lin_libsvm[0] * exF + AB_lin_libsvm[1]))
    sk_prob = _SigmoidCalibration().fit(exF, exY).predict(exF)
    assert_array_almost_equal(lin_prob, sk_prob, 6)

    # check that _SigmoidCalibration().fit only accepts 1d array or 2d column
    # arrays
    assert_raises(ValueError, _SigmoidCalibration().fit,
                  np.vstack((exF, exF)), exY)


def test_calibration_curve():
    """Check calibration_curve function"""
    y_true = np.array([0, 0, 0, 1, 1, 1])
    y_pred = np.array([0., 0.1, 0.2, 0.8, 0.9, 1.])
    prob_true, prob_pred = calibration_curve(y_true, y_pred, n_bins=2)
    prob_true_unnormalized, prob_pred_unnormalized = \
        calibration_curve(y_true, y_pred * 2, n_bins=2, normalize=True)
    assert len(prob_true) == len(prob_pred)
    assert len(prob_true) == 2
    assert_almost_equal(prob_true, [0, 1])
    assert_almost_equal(prob_pred, [0.1, 0.9])
    assert_almost_equal(prob_true, prob_true_unnormalized)
    assert_almost_equal(prob_pred, prob_pred_unnormalized)

    # probabilities outside [0, 1] should not be accepted when normalize
    # is set to False
    assert_raises(ValueError, calibration_curve, [1.1], [-0.1],
                  normalize=False)

    # test that quantiles work as expected
    y_true2 = np.array([0, 0, 0, 0, 1, 1])
    y_pred2 = np.array([0., 0.1, 0.2, 0.5, 0.9, 1.])
    prob_true_quantile, prob_pred_quantile = calibration_curve(
        y_true2, y_pred2, n_bins=2, strategy='quantile')

    assert len(prob_true_quantile) == len(prob_pred_quantile)
    assert len(prob_true_quantile) == 2
    assert_almost_equal(prob_true_quantile, [0, 2 / 3])
    assert_almost_equal(prob_pred_quantile, [0.1, 0.8])

    # Check that error is raised when invalid strategy is selected
    assert_raises(ValueError, calibration_curve, y_true2, y_pred2,
                  strategy='percentile')


def test_calibration_nan_imputer():
    """Test that calibration can accept nan"""
    X, y = make_classification(n_samples=10, n_features=2,
                               n_informative=2, n_redundant=0,
                               random_state=42)
    X[0, 0] = np.nan
    clf = Pipeline(
        [('imputer', SimpleImputer()),
         ('rf', RandomForestClassifier(n_estimators=1))])
    clf_c = CalibratedClassifierCV(clf, cv=2, method='isotonic')
    clf_c.fit(X, y)
    clf_c.predict(X)


def test_calibration_prob_sum():
    # Test that sum of probabilities is 1. A non-regression test for
    # issue #7796
    num_classes = 2
    X, y = make_classification(n_samples=10, n_features=5,
                               n_classes=num_classes)
    clf = LinearSVC(C=1.0)
    clf_prob = CalibratedClassifierCV(clf, method="sigmoid", cv=LeaveOneOut())
    clf_prob.fit(X, y)

    probs = clf_prob.predict_proba(X)
    assert_array_almost_equal(probs.sum(axis=1), np.ones(probs.shape[0]))


def test_calibration_less_classes():
    # Test to check calibration works fine when train set in a test-train
    # split does not contain all classes
    # Since this test uses LOO, at each iteration train set will not contain a
    # class label
    X = np.random.randn(10, 5)
    y = np.arange(10)
    clf = LinearSVC(C=1.0)
    cal_clf = CalibratedClassifierCV(clf, method="sigmoid", cv=LeaveOneOut())
    cal_clf.fit(X, y)

    for i, calibrated_classifier in \
            enumerate(cal_clf.calibrated_classifiers_):
        proba = calibrated_classifier.predict_proba(X)
        assert_array_equal(proba[:, i], np.zeros(len(y)))
        assert np.all(np.hstack([proba[:, :i],
                                 proba[:, i + 1:]]))


@ignore_warnings(category=FutureWarning)
@pytest.mark.parametrize('X', [np.random.RandomState(42).randn(15, 5, 2),
                               np.random.RandomState(42).randn(15, 5, 2, 6)])
def test_calibration_accepts_ndarray(X):
    """Test that calibration accepts n-dimensional arrays as input"""
    y = [1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0]

    class MockTensorClassifier(BaseEstimator):
        """A toy estimator that accepts tensor inputs"""

        def fit(self, X, y):
            self.classes_ = np.unique(y)
            return self

        def decision_function(self, X):
            # toy decision function that just needs to have the right shape:
            return X.reshape(X.shape[0], -1).sum(axis=1)

    calibrated_clf = CalibratedClassifierCV(MockTensorClassifier())
    # we should be able to fit this classifier with no error
    calibrated_clf.fit(X, y)
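test_sigmoid_calibration above checks Platt scaling, which maps decision values f to probabilities of the form 1 / (1 + exp(A*f + B)). A hedged sketch of how these public APIs are typically combined to inspect calibration (illustrative only, using sklearn's public CalibratedClassifierCV and calibration_curve; not part of the committed file):

# Sketch: fit a sigmoid-calibrated LinearSVC and plot-ready reliability data.
import numpy as np
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=1000, n_features=6, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Platt scaling fits p = 1 / (1 + exp(A * f + B)) on the decision values f,
# which is the relationship the test above verifies against LibSVM's constants.
cal = CalibratedClassifierCV(LinearSVC(random_state=0), method="sigmoid", cv=3)
cal.fit(X_train, y_train)
prob_pos = cal.predict_proba(X_test)[:, 1]

# Reliability curve: a perfectly calibrated model lies on the diagonal.
frac_pos, mean_pred = calibration_curve(y_test, prob_pos, n_bins=5)
print(np.c_[mean_pred, frac_pos])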
14
venv/Lib/site-packages/sklearn/tests/test_check_build.py
Normal file
@@ -0,0 +1,14 @@
"""
|
||||
Smoke Test the check_build module
|
||||
"""
|
||||
|
||||
# Author: G Varoquaux
|
||||
# License: BSD 3 clause
|
||||
|
||||
from sklearn.__check_build import raise_build_error
|
||||
|
||||
from sklearn.utils._testing import assert_raises
|
||||
|
||||
|
||||
def test_raise_build_error():
|
||||
assert_raises(ImportError, raise_build_error, ImportError())
|
260
venv/Lib/site-packages/sklearn/tests/test_common.py
Normal file
@@ -0,0 +1,260 @@
"""
|
||||
General tests for all estimators in sklearn.
|
||||
"""
|
||||
|
||||
# Authors: Andreas Mueller <amueller@ais.uni-bonn.de>
|
||||
# Gael Varoquaux gael.varoquaux@normalesup.org
|
||||
# License: BSD 3 clause
|
||||
|
||||
import os
|
||||
import warnings
|
||||
import sys
|
||||
import re
|
||||
import pkgutil
|
||||
from inspect import isgenerator
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
from sklearn.utils import all_estimators
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.utils.estimator_checks import check_estimator
|
||||
|
||||
import sklearn
|
||||
from sklearn.base import BiclusterMixin
|
||||
|
||||
from sklearn.linear_model._base import LinearClassifierMixin
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.utils import IS_PYPY
|
||||
from sklearn.utils._testing import SkipTest
|
||||
from sklearn.utils.estimator_checks import (
|
||||
_mark_xfail_checks,
|
||||
_construct_instance,
|
||||
_set_checking_parameters,
|
||||
_set_check_estimator_ids,
|
||||
check_parameters_default_constructible,
|
||||
check_class_weight_balanced_linear_classifier,
|
||||
parametrize_with_checks)
|
||||
|
||||
|
||||
def test_all_estimator_no_base_class():
|
||||
# test that all_estimators doesn't find abstract classes.
|
||||
for name, Estimator in all_estimators():
|
||||
msg = ("Base estimators such as {0} should not be included"
|
||||
" in all_estimators").format(name)
|
||||
assert not name.lower().startswith('base'), msg
|
||||
|
||||
|
||||
@ignore_warnings("Passing a class is depr", category=FutureWarning) # 0.24
|
||||
def test_estimator_cls_parameterize_with_checks():
|
||||
# TODO: remove test in 0.24
|
||||
# Non-regression test for #16707 to ensure that parametrize_with_checks
|
||||
# works with estimator classes
|
||||
param_checks = parametrize_with_checks([LogisticRegression])
|
||||
# Using the generator does not raise
|
||||
list(param_checks.args[1])
|
||||
|
||||
|
||||
def test_mark_xfail_checks_with_unconsructable_estimator():
|
||||
class MyEstimator:
|
||||
def __init__(self):
|
||||
raise ValueError("This is bad")
|
||||
|
||||
estimator, check = _mark_xfail_checks(MyEstimator, 42, None)
|
||||
assert estimator == MyEstimator
|
||||
assert check == 42
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'name, Estimator',
|
||||
all_estimators()
|
||||
)
|
||||
def test_parameters_default_constructible(name, Estimator):
|
||||
# Test that estimators are default-constructible
|
||||
check_parameters_default_constructible(name, Estimator)
|
||||
|
||||
|
||||
def _sample_func(x, y=1):
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.parametrize("val, expected", [
|
||||
(partial(_sample_func, y=1), "_sample_func(y=1)"),
|
||||
(_sample_func, "_sample_func"),
|
||||
(partial(_sample_func, 'world'), "_sample_func"),
|
||||
(LogisticRegression(C=2.0), "LogisticRegression(C=2.0)"),
|
||||
(LogisticRegression(random_state=1, solver='newton-cg',
|
||||
class_weight='balanced', warm_start=True),
|
||||
"LogisticRegression(class_weight='balanced',random_state=1,"
|
||||
"solver='newton-cg',warm_start=True)")
|
||||
])
|
||||
def test_set_check_estimator_ids(val, expected):
|
||||
assert _set_check_estimator_ids(val) == expected
|
||||
|
||||
|
||||
def _tested_estimators():
|
||||
for name, Estimator in all_estimators():
|
||||
if issubclass(Estimator, BiclusterMixin):
|
||||
continue
|
||||
try:
|
||||
estimator = _construct_instance(Estimator)
|
||||
except SkipTest:
|
||||
continue
|
||||
|
||||
yield estimator
|
||||
|
||||
|
||||
@parametrize_with_checks(list(_tested_estimators()))
|
||||
def test_estimators(estimator, check, request):
|
||||
# Common tests for estimator instances
|
||||
with ignore_warnings(category=(FutureWarning,
|
||||
ConvergenceWarning,
|
||||
UserWarning, FutureWarning)):
|
||||
_set_checking_parameters(estimator)
|
||||
check(estimator)
|
||||
|
||||
|
||||
@ignore_warnings("Passing a class is depr", category=FutureWarning) # 0.24
|
||||
def test_check_estimator_generate_only():
|
||||
# TODO in 0.24: remove checks on passing a class
|
||||
estimator_cls_gen_checks = check_estimator(LogisticRegression,
|
||||
generate_only=True)
|
||||
all_instance_gen_checks = check_estimator(LogisticRegression(),
|
||||
generate_only=True)
|
||||
assert isgenerator(estimator_cls_gen_checks)
|
||||
assert isgenerator(all_instance_gen_checks)
|
||||
|
||||
estimator_cls_checks = list(estimator_cls_gen_checks)
|
||||
all_instance_checks = list(all_instance_gen_checks)
|
||||
|
||||
# all classes checks include check_parameters_default_constructible
|
||||
assert len(estimator_cls_checks) == len(all_instance_checks) + 1
|
||||
|
||||
# TODO: meta-estimators like GridSearchCV has required parameters
|
||||
# that do not have default values. This is expected to change in the future
|
||||
with pytest.raises(SkipTest):
|
||||
for estimator, check in check_estimator(GridSearchCV,
|
||||
generate_only=True):
|
||||
check(estimator)
|
||||
|
||||
|
||||
@ignore_warnings(category=(DeprecationWarning, FutureWarning))
|
||||
# ignore deprecated open(.., 'U') in numpy distutils
|
||||
def test_configure():
|
||||
# Smoke test the 'configure' step of setup, this tests all the
|
||||
# 'configure' functions in the setup.pys in scikit-learn
|
||||
# This test requires Cython which is not necessarily there when running
|
||||
# the tests of an installed version of scikit-learn or when scikit-learn
|
||||
# is installed in editable mode by pip build isolation enabled.
|
||||
pytest.importorskip("Cython")
|
||||
cwd = os.getcwd()
|
||||
setup_path = os.path.abspath(os.path.join(sklearn.__path__[0], '..'))
|
||||
setup_filename = os.path.join(setup_path, 'setup.py')
|
||||
if not os.path.exists(setup_filename):
|
||||
pytest.skip('setup.py not available')
|
||||
# XXX unreached code as of v0.22
|
||||
try:
|
||||
os.chdir(setup_path)
|
||||
old_argv = sys.argv
|
||||
sys.argv = ['setup.py', 'config']
|
||||
|
||||
with warnings.catch_warnings():
|
||||
# The configuration spits out warnings when not finding
|
||||
# Blas/Atlas development headers
|
||||
warnings.simplefilter('ignore', UserWarning)
|
||||
with open('setup.py') as f:
|
||||
exec(f.read(), dict(__name__='__main__'))
|
||||
finally:
|
||||
sys.argv = old_argv
|
||||
os.chdir(cwd)
|
||||
|
||||
|
||||
def _tested_linear_classifiers():
|
||||
classifiers = all_estimators(type_filter='classifier')
|
||||
|
||||
with warnings.catch_warnings(record=True):
|
||||
for name, clazz in classifiers:
|
||||
required_parameters = getattr(clazz, "_required_parameters", [])
|
||||
if len(required_parameters):
|
||||
# FIXME
|
||||
continue
|
||||
|
||||
if ('class_weight' in clazz().get_params().keys() and
|
||||
issubclass(clazz, LinearClassifierMixin)):
|
||||
yield name, clazz
|
||||
|
||||
|
||||
@pytest.mark.parametrize("name, Classifier",
|
||||
_tested_linear_classifiers())
|
||||
def test_class_weight_balanced_linear_classifiers(name, Classifier):
|
||||
check_class_weight_balanced_linear_classifier(name, Classifier)
|
||||
|
||||
|
||||
@ignore_warnings
|
||||
def test_import_all_consistency():
|
||||
# Smoke test to check that any name in a __all__ list is actually defined
|
||||
# in the namespace of the module or package.
|
||||
pkgs = pkgutil.walk_packages(path=sklearn.__path__, prefix='sklearn.',
|
||||
onerror=lambda _: None)
|
||||
submods = [modname for _, modname, _ in pkgs]
|
||||
for modname in submods + ['sklearn']:
|
||||
if ".tests." in modname:
|
||||
continue
|
||||
if IS_PYPY and ('_svmlight_format_io' in modname or
|
||||
'feature_extraction._hashing_fast' in modname):
|
||||
continue
|
||||
package = __import__(modname, fromlist="dummy")
|
||||
for name in getattr(package, '__all__', ()):
|
||||
assert hasattr(package, name),\
|
||||
"Module '{0}' has no attribute '{1}'".format(modname, name)
|
||||
|
||||
|
||||
def test_root_import_all_completeness():
|
||||
EXCEPTIONS = ('utils', 'tests', 'base', 'setup', 'conftest')
|
||||
for _, modname, _ in pkgutil.walk_packages(path=sklearn.__path__,
|
||||
onerror=lambda _: None):
|
||||
if '.' in modname or modname.startswith('_') or modname in EXCEPTIONS:
|
||||
continue
|
||||
assert modname in sklearn.__all__
|
||||
|
||||
|
||||
def test_all_tests_are_importable():
|
||||
# Ensure that for each contentful subpackage, there is a test directory
|
||||
# within it that is also a subpackage (i.e. a directory with __init__.py)
|
||||
|
||||
HAS_TESTS_EXCEPTIONS = re.compile(r'''(?x)
|
||||
\.externals(\.|$)|
|
||||
\.tests(\.|$)|
|
||||
\._
|
||||
''')
|
||||
lookup = {name: ispkg
|
||||
for _, name, ispkg
|
||||
in pkgutil.walk_packages(sklearn.__path__, prefix='sklearn.')}
|
||||
missing_tests = [name for name, ispkg in lookup.items()
|
||||
if ispkg
|
||||
and not HAS_TESTS_EXCEPTIONS.search(name)
|
||||
and name + '.tests' not in lookup]
|
||||
assert missing_tests == [], ('{0} do not have `tests` subpackages. '
|
||||
'Perhaps they require '
|
||||
'__init__.py or an add_subpackage directive '
|
||||
'in the parent '
|
||||
'setup.py'.format(missing_tests))
|
||||
|
||||
|
||||
# TODO: remove in 0.24
|
||||
def test_class_support_deprecated():
|
||||
# Make sure passing classes to check_estimator or parametrize_with_checks
|
||||
# is deprecated
|
||||
|
||||
msg = "Passing a class is deprecated"
|
||||
with pytest.warns(FutureWarning, match=msg):
|
||||
check_estimator(LogisticRegression)
|
||||
|
||||
with pytest.warns(FutureWarning, match=msg):
|
||||
parametrize_with_checks([LogisticRegression])
|
||||
|
||||
# Make sure check_parameters_default_constructible accepts instances now
|
||||
check_parameters_default_constructible('name', LogisticRegression())
|
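test_common.py drives the estimator-check suite over every estimator instance via parametrize_with_checks. A hedged sketch of the same pattern applied to a single estimator instance (illustrative only; this mirrors how the file above parametrizes its common tests and is not part of the committed file):

# Sketch: run the common estimator checks against one estimator instance.
from sklearn.linear_model import LogisticRegression
from sklearn.utils.estimator_checks import parametrize_with_checks


@parametrize_with_checks([LogisticRegression(max_iter=1000)])
def test_logreg_passes_common_checks(estimator, check):
    # Each parametrized case is one (estimator, check) pair from the suite.
    check(estimator)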
74
venv/Lib/site-packages/sklearn/tests/test_config.py
Normal file
@@ -0,0 +1,74 @@
from sklearn import get_config, set_config, config_context
from sklearn.utils._testing import assert_raises


def test_config_context():
    assert get_config() == {'assume_finite': False, 'working_memory': 1024,
                            'print_changed_only': True,
                            'display': 'text'}

    # Not using as a context manager affects nothing
    config_context(assume_finite=True)
    assert get_config()['assume_finite'] is False

    with config_context(assume_finite=True):
        assert get_config() == {'assume_finite': True, 'working_memory': 1024,
                                'print_changed_only': True,
                                'display': 'text'}
    assert get_config()['assume_finite'] is False

    with config_context(assume_finite=True):
        with config_context(assume_finite=None):
            assert get_config()['assume_finite'] is True

        assert get_config()['assume_finite'] is True

        with config_context(assume_finite=False):
            assert get_config()['assume_finite'] is False

            with config_context(assume_finite=None):
                assert get_config()['assume_finite'] is False

                # global setting will not be retained outside of context that
                # did not modify this setting
                set_config(assume_finite=True)
                assert get_config()['assume_finite'] is True

            assert get_config()['assume_finite'] is False

        assert get_config()['assume_finite'] is True

    assert get_config() == {'assume_finite': False, 'working_memory': 1024,
                            'print_changed_only': True,
                            'display': 'text'}

    # No positional arguments
    assert_raises(TypeError, config_context, True)
    # No unknown arguments
    assert_raises(TypeError, config_context(do_something_else=True).__enter__)


def test_config_context_exception():
    assert get_config()['assume_finite'] is False
    try:
        with config_context(assume_finite=True):
            assert get_config()['assume_finite'] is True
            raise ValueError()
    except ValueError:
        pass
    assert get_config()['assume_finite'] is False


def test_set_config():
    assert get_config()['assume_finite'] is False
    set_config(assume_finite=None)
    assert get_config()['assume_finite'] is False
    set_config(assume_finite=True)
    assert get_config()['assume_finite'] is True
    set_config(assume_finite=None)
    assert get_config()['assume_finite'] is True
    set_config(assume_finite=False)
    assert get_config()['assume_finite'] is False

    # No unknown arguments
    assert_raises(TypeError, set_config, do_something_else=True)
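The distinction these tests rely on: config_context scopes a setting to a with-block (restoring the previous value on exit, including on exceptions), while set_config changes the global state until it is changed again. A hedged sketch of that behavior using only the public API (illustrative only; assume_finite defaults to False, as the tests above assert):

# Sketch: scoped vs. global configuration changes.
from sklearn import config_context, get_config, set_config

assert get_config()['assume_finite'] is False
with config_context(assume_finite=True):
    assert get_config()['assume_finite'] is True   # only inside the block
assert get_config()['assume_finite'] is False      # restored on exit

set_config(assume_finite=True)                     # sticks until changed again
assert get_config()['assume_finite'] is True
set_config(assume_finite=False)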
@@ -0,0 +1,489 @@
import numpy as np
|
||||
|
||||
import pytest
|
||||
|
||||
from scipy import linalg
|
||||
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._testing import assert_array_equal, assert_no_warnings
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
from sklearn.utils._testing import assert_almost_equal
|
||||
from sklearn.utils._testing import assert_raises
|
||||
from sklearn.utils._testing import assert_raise_message
|
||||
from sklearn.utils._testing import assert_warns
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
|
||||
from sklearn.discriminant_analysis import _cov
|
||||
|
||||
|
||||
# Data is just 6 separable points in the plane
|
||||
X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]], dtype='f')
|
||||
y = np.array([1, 1, 1, 2, 2, 2])
|
||||
y3 = np.array([1, 1, 2, 2, 3, 3])
|
||||
|
||||
# Degenerate data with only one feature (still should be separable)
|
||||
X1 = np.array([[-2, ], [-1, ], [-1, ], [1, ], [1, ], [2, ]], dtype='f')
|
||||
|
||||
# Data is just 9 separable points in the plane
|
||||
X6 = np.array([[0, 0], [-2, -2], [-2, -1], [-1, -1], [-1, -2],
|
||||
[1, 3], [1, 2], [2, 1], [2, 2]])
|
||||
y6 = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2])
|
||||
y7 = np.array([1, 2, 3, 2, 3, 1, 2, 3, 1])
|
||||
|
||||
# Degenerate data with 1 feature (still should be separable)
|
||||
X7 = np.array([[-3, ], [-2, ], [-1, ], [-1, ], [0, ], [1, ], [1, ],
|
||||
[2, ], [3, ]])
|
||||
|
||||
# Data that has zero variance in one dimension and needs regularization
|
||||
X2 = np.array([[-3, 0], [-2, 0], [-1, 0], [-1, 0], [0, 0], [1, 0], [1, 0],
|
||||
[2, 0], [3, 0]])
|
||||
|
||||
# One element class
|
||||
y4 = np.array([1, 1, 1, 1, 1, 1, 1, 1, 2])
|
||||
|
||||
# Data with less samples in a class than n_features
|
||||
X5 = np.c_[np.arange(8), np.zeros((8, 3))]
|
||||
y5 = np.array([0, 0, 0, 0, 0, 1, 1, 1])
|
||||
|
||||
solver_shrinkage = [('svd', None), ('lsqr', None), ('eigen', None),
|
||||
('lsqr', 'auto'), ('lsqr', 0), ('lsqr', 0.43),
|
||||
('eigen', 'auto'), ('eigen', 0), ('eigen', 0.43)]
|
||||
|
||||
|
||||
def test_lda_predict():
|
||||
# Test LDA classification.
|
||||
# This checks that LDA implements fit and predict and returns correct
|
||||
# values for simple toy data.
|
||||
for test_case in solver_shrinkage:
|
||||
solver, shrinkage = test_case
|
||||
clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
|
||||
y_pred = clf.fit(X, y).predict(X)
|
||||
assert_array_equal(y_pred, y, 'solver %s' % solver)
|
||||
|
||||
# Assert that it works with 1D data
|
||||
y_pred1 = clf.fit(X1, y).predict(X1)
|
||||
assert_array_equal(y_pred1, y, 'solver %s' % solver)
|
||||
|
||||
# Test probability estimates
|
||||
y_proba_pred1 = clf.predict_proba(X1)
|
||||
assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y,
|
||||
'solver %s' % solver)
|
||||
y_log_proba_pred1 = clf.predict_log_proba(X1)
|
||||
assert_allclose(np.exp(y_log_proba_pred1), y_proba_pred1,
|
||||
rtol=1e-6, err_msg='solver %s' % solver)
|
||||
|
||||
# Primarily test for commit 2f34950 -- "reuse" of priors
|
||||
y_pred3 = clf.fit(X, y3).predict(X)
|
||||
# LDA shouldn't be able to separate those
|
||||
assert np.any(y_pred3 != y3), 'solver %s' % solver
|
||||
|
||||
# Test invalid shrinkages
|
||||
clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=-0.2231)
|
||||
assert_raises(ValueError, clf.fit, X, y)
|
||||
clf = LinearDiscriminantAnalysis(solver="eigen", shrinkage="dummy")
|
||||
assert_raises(ValueError, clf.fit, X, y)
|
||||
clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto")
|
||||
assert_raises(NotImplementedError, clf.fit, X, y)
|
||||
# Test unknown solver
|
||||
clf = LinearDiscriminantAnalysis(solver="dummy")
|
||||
assert_raises(ValueError, clf.fit, X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_classes", [2, 3])
|
||||
@pytest.mark.parametrize("solver", ["svd", "lsqr", "eigen"])
|
||||
def test_lda_predict_proba(solver, n_classes):
|
||||
def generate_dataset(n_samples, centers, covariances, random_state=None):
|
||||
"""Generate a multivariate normal data given some centers and
|
||||
covariances"""
|
||||
rng = check_random_state(random_state)
|
||||
X = np.vstack([rng.multivariate_normal(mean, cov,
|
||||
size=n_samples // len(centers))
|
||||
for mean, cov in zip(centers, covariances)])
|
||||
y = np.hstack([[clazz] * (n_samples // len(centers))
|
||||
for clazz in range(len(centers))])
|
||||
return X, y
|
||||
|
||||
blob_centers = np.array([[0, 0], [-10, 40], [-30, 30]])[:n_classes]
|
||||
blob_stds = np.array([[[10, 10], [10, 100]]] * len(blob_centers))
|
||||
X, y = generate_dataset(
|
||||
n_samples=90000, centers=blob_centers, covariances=blob_stds,
|
||||
random_state=42
|
||||
)
|
||||
lda = LinearDiscriminantAnalysis(solver=solver, store_covariance=True,
|
||||
shrinkage=None).fit(X, y)
|
||||
# check that the empirical means and covariances are close enough to the
|
||||
# one used to generate the data
|
||||
assert_allclose(lda.means_, blob_centers, atol=1e-1)
|
||||
assert_allclose(lda.covariance_, blob_stds[0], atol=1)
|
||||
|
||||
# implement the method to compute the probability given in The Elements
|
||||
# of Statistical Learning (cf. p.127, Sect. 4.4.5 "Logistic Regression
|
||||
# or LDA?")
|
||||
precision = linalg.inv(blob_stds[0])
|
||||
alpha_k = []
|
||||
alpha_k_0 = []
|
||||
for clazz in range(len(blob_centers) - 1):
|
||||
alpha_k.append(
|
||||
np.dot(precision,
|
||||
(blob_centers[clazz] - blob_centers[-1])[:, np.newaxis]))
|
||||
alpha_k_0.append(
|
||||
np.dot(- 0.5 * (blob_centers[clazz] +
|
||||
blob_centers[-1])[np.newaxis, :], alpha_k[-1]))
|
||||
|
||||
sample = np.array([[-22, 22]])
|
||||
|
||||
def discriminant_func(sample, coef, intercept, clazz):
|
||||
return np.exp(intercept[clazz] + np.dot(sample, coef[clazz]))
|
||||
|
||||
prob = np.array([float(
|
||||
discriminant_func(sample, alpha_k, alpha_k_0, clazz) /
|
||||
(1 + sum([discriminant_func(sample, alpha_k, alpha_k_0, clazz)
|
||||
for clazz in range(n_classes - 1)]))) for clazz in range(
|
||||
n_classes - 1)])
|
||||
|
||||
prob_ref = 1 - np.sum(prob)
|
||||
|
||||
# check the consistency of the computed probability
|
||||
# all probabilities should sum to one
|
||||
prob_ref_2 = float(
|
||||
1 / (1 + sum([discriminant_func(sample, alpha_k, alpha_k_0, clazz)
|
||||
for clazz in range(n_classes - 1)]))
|
||||
)
|
||||
|
||||
assert prob_ref == pytest.approx(prob_ref_2)
|
||||
# check that the probability of LDA are close to the theoretical
|
||||
# probabilties
|
||||
assert_allclose(lda.predict_proba(sample),
|
||||
np.hstack([prob, prob_ref])[np.newaxis],
|
||||
atol=1e-2)
|
||||
|
||||
|
||||
def test_lda_priors():
|
||||
# Test priors (negative priors)
|
||||
priors = np.array([0.5, -0.5])
|
||||
clf = LinearDiscriminantAnalysis(priors=priors)
|
||||
msg = "priors must be non-negative"
|
||||
assert_raise_message(ValueError, msg, clf.fit, X, y)
|
||||
|
||||
# Test that priors passed as a list are correctly handled (run to see if
|
||||
# failure)
|
||||
clf = LinearDiscriminantAnalysis(priors=[0.5, 0.5])
|
||||
clf.fit(X, y)
|
||||
|
||||
# Test that priors always sum to 1
|
||||
priors = np.array([0.5, 0.6])
|
||||
prior_norm = np.array([0.45, 0.55])
|
||||
clf = LinearDiscriminantAnalysis(priors=priors)
|
||||
assert_warns(UserWarning, clf.fit, X, y)
|
||||
assert_array_almost_equal(clf.priors_, prior_norm, 2)
|
||||
|
||||
|
||||
def test_lda_coefs():
|
||||
# Test if the coefficients of the solvers are approximately the same.
|
||||
n_features = 2
|
||||
n_classes = 2
|
||||
n_samples = 1000
|
||||
X, y = make_blobs(n_samples=n_samples, n_features=n_features,
|
||||
centers=n_classes, random_state=11)
|
||||
|
||||
clf_lda_svd = LinearDiscriminantAnalysis(solver="svd")
|
||||
clf_lda_lsqr = LinearDiscriminantAnalysis(solver="lsqr")
|
||||
clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")
|
||||
|
||||
clf_lda_svd.fit(X, y)
|
||||
clf_lda_lsqr.fit(X, y)
|
||||
clf_lda_eigen.fit(X, y)
|
||||
|
||||
assert_array_almost_equal(clf_lda_svd.coef_, clf_lda_lsqr.coef_, 1)
|
||||
assert_array_almost_equal(clf_lda_svd.coef_, clf_lda_eigen.coef_, 1)
|
||||
assert_array_almost_equal(clf_lda_eigen.coef_, clf_lda_lsqr.coef_, 1)
|
||||
|
||||
|
||||
def test_lda_transform():
|
||||
# Test LDA transform.
|
||||
clf = LinearDiscriminantAnalysis(solver="svd", n_components=1)
|
||||
X_transformed = clf.fit(X, y).transform(X)
|
||||
assert X_transformed.shape[1] == 1
|
||||
clf = LinearDiscriminantAnalysis(solver="eigen", n_components=1)
|
||||
X_transformed = clf.fit(X, y).transform(X)
|
||||
assert X_transformed.shape[1] == 1
|
||||
|
||||
clf = LinearDiscriminantAnalysis(solver="lsqr", n_components=1)
|
||||
clf.fit(X, y)
|
||||
msg = "transform not implemented for 'lsqr'"
|
||||
assert_raise_message(NotImplementedError, msg, clf.transform, X)
|
||||
|
||||
|
||||
def test_lda_explained_variance_ratio():
|
||||
    # Test if the sum of the normalized eigenvalues equals 1.
|
||||
# Also tests whether the explained_variance_ratio_ formed by the
|
||||
# eigen solver is the same as the explained_variance_ratio_ formed
|
||||
# by the svd solver
|
||||
|
||||
state = np.random.RandomState(0)
|
||||
X = state.normal(loc=0, scale=100, size=(40, 20))
|
||||
y = state.randint(0, 3, size=(40,))
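    # with 3 classes, LDA has at most n_classes - 1 = 2 discriminant
    # components, which is the expected length of explained_variance_ratio_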
|
||||
|
||||
clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")
|
||||
clf_lda_eigen.fit(X, y)
|
||||
assert_almost_equal(clf_lda_eigen.explained_variance_ratio_.sum(), 1.0, 3)
|
||||
assert clf_lda_eigen.explained_variance_ratio_.shape == (2,), (
|
||||
"Unexpected length for explained_variance_ratio_")
|
||||
|
||||
clf_lda_svd = LinearDiscriminantAnalysis(solver="svd")
|
||||
clf_lda_svd.fit(X, y)
|
||||
assert_almost_equal(clf_lda_svd.explained_variance_ratio_.sum(), 1.0, 3)
|
||||
assert clf_lda_svd.explained_variance_ratio_.shape == (2,), (
|
||||
"Unexpected length for explained_variance_ratio_")
|
||||
|
||||
assert_array_almost_equal(clf_lda_svd.explained_variance_ratio_,
|
||||
clf_lda_eigen.explained_variance_ratio_)
|
||||
|
||||
|
||||
def test_lda_orthogonality():
|
||||
# arrange four classes with their means in a kite-shaped pattern
|
||||
# the longer distance should be transformed to the first component, and
|
||||
# the shorter distance to the second component.
|
||||
means = np.array([[0, 0, -1], [0, 2, 0], [0, -2, 0], [0, 0, 5]])
|
||||
|
||||
# We construct perfectly symmetric distributions, so the LDA can estimate
|
||||
# precise means.
|
||||
scatter = np.array([[0.1, 0, 0], [-0.1, 0, 0], [0, 0.1, 0], [0, -0.1, 0],
|
||||
[0, 0, 0.1], [0, 0, -0.1]])
|
||||
|
||||
X = (means[:, np.newaxis, :] + scatter[np.newaxis, :, :]).reshape((-1, 3))
|
||||
y = np.repeat(np.arange(means.shape[0]), scatter.shape[0])
|
||||
|
||||
# Fit LDA and transform the means
|
||||
clf = LinearDiscriminantAnalysis(solver="svd").fit(X, y)
|
||||
means_transformed = clf.transform(means)
|
||||
|
||||
d1 = means_transformed[3] - means_transformed[0]
|
||||
d2 = means_transformed[2] - means_transformed[1]
|
||||
d1 /= np.sqrt(np.sum(d1 ** 2))
|
||||
d2 /= np.sqrt(np.sum(d2 ** 2))
|
||||
|
||||
# the transformed within-class covariance should be the identity matrix
|
||||
assert_almost_equal(np.cov(clf.transform(scatter).T), np.eye(2))
|
||||
|
||||
# the means of classes 0 and 3 should lie on the first component
|
||||
assert_almost_equal(np.abs(np.dot(d1[:2], [1, 0])), 1.0)
|
||||
|
||||
# the means of classes 1 and 2 should lie on the second component
|
||||
assert_almost_equal(np.abs(np.dot(d2[:2], [0, 1])), 1.0)
|
||||
|
||||
|
||||
def test_lda_scaling():
|
||||
# Test if classification works correctly with differently scaled features.
|
||||
n = 100
|
||||
rng = np.random.RandomState(1234)
|
||||
# use uniform distribution of features to make sure there is absolutely no
|
||||
# overlap between classes.
|
||||
x1 = rng.uniform(-1, 1, (n, 3)) + [-10, 0, 0]
|
||||
x2 = rng.uniform(-1, 1, (n, 3)) + [10, 0, 0]
|
||||
x = np.vstack((x1, x2)) * [1, 100, 10000]
|
||||
y = [-1] * n + [1] * n
|
||||
|
||||
for solver in ('svd', 'lsqr', 'eigen'):
|
||||
clf = LinearDiscriminantAnalysis(solver=solver)
|
||||
# should be able to separate the data perfectly
|
||||
assert clf.fit(x, y).score(x, y) == 1.0, (
|
||||
'using covariance: %s' % solver)
|
||||
|
||||
|
||||
def test_lda_store_covariance():
|
||||
# Test for solver 'lsqr' and 'eigen'
|
||||
# 'store_covariance' has no effect on 'lsqr' and 'eigen' solvers
|
||||
for solver in ('lsqr', 'eigen'):
|
||||
clf = LinearDiscriminantAnalysis(solver=solver).fit(X6, y6)
|
||||
assert hasattr(clf, 'covariance_')
|
||||
|
||||
# Test the actual attribute:
|
||||
clf = LinearDiscriminantAnalysis(solver=solver,
|
||||
store_covariance=True).fit(X6, y6)
|
||||
assert hasattr(clf, 'covariance_')
|
||||
|
||||
assert_array_almost_equal(
|
||||
clf.covariance_,
|
||||
np.array([[0.422222, 0.088889], [0.088889, 0.533333]])
|
||||
)
|
||||
|
||||
    # Test for SVD solver, the default is to not set the covariance_ attribute
|
||||
clf = LinearDiscriminantAnalysis(solver='svd').fit(X6, y6)
|
||||
assert not hasattr(clf, 'covariance_')
|
||||
|
||||
# Test the actual attribute:
|
||||
    clf = LinearDiscriminantAnalysis(solver='svd',
                                     store_covariance=True).fit(X6, y6)
|
||||
assert hasattr(clf, 'covariance_')
|
||||
|
||||
assert_array_almost_equal(
|
||||
clf.covariance_,
|
||||
np.array([[0.422222, 0.088889], [0.088889, 0.533333]])
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('n_features', [3, 5])
|
||||
@pytest.mark.parametrize('n_classes', [5, 3])
|
||||
def test_lda_dimension_warning(n_classes, n_features):
|
||||
rng = check_random_state(0)
|
||||
n_samples = 10
|
||||
X = rng.randn(n_samples, n_features)
|
||||
# we create n_classes labels by repeating and truncating a
|
||||
# range(n_classes) until n_samples
|
||||
y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples]
|
||||
max_components = min(n_features, n_classes - 1)
|
||||
|
||||
for n_components in [max_components - 1, None, max_components]:
|
||||
# if n_components <= min(n_classes - 1, n_features), no warning
|
||||
lda = LinearDiscriminantAnalysis(n_components=n_components)
|
||||
assert_no_warnings(lda.fit, X, y)
|
||||
|
||||
for n_components in [max_components + 1,
|
||||
max(n_features, n_classes - 1) + 1]:
|
||||
# if n_components > min(n_classes - 1, n_features), raise error.
|
||||
# We test one unit higher than max_components, and then something
|
||||
# larger than both n_features and n_classes - 1 to ensure the test
|
||||
# works for any value of n_component
|
||||
lda = LinearDiscriminantAnalysis(n_components=n_components)
|
||||
msg = "n_components cannot be larger than "
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
lda.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data_type, expected_type", [
|
||||
(np.float32, np.float32),
|
||||
(np.float64, np.float64),
|
||||
(np.int32, np.float64),
|
||||
(np.int64, np.float64)
|
||||
])
|
||||
def test_lda_dtype_match(data_type, expected_type):
|
||||
for (solver, shrinkage) in solver_shrinkage:
|
||||
clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
|
||||
clf.fit(X.astype(data_type), y.astype(data_type))
|
||||
assert clf.coef_.dtype == expected_type
|
||||
|
||||
|
||||
def test_lda_numeric_consistency_float32_float64():
|
||||
for (solver, shrinkage) in solver_shrinkage:
|
||||
clf_32 = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
|
||||
clf_32.fit(X.astype(np.float32), y.astype(np.float32))
|
||||
clf_64 = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
|
||||
clf_64.fit(X.astype(np.float64), y.astype(np.float64))
|
||||
|
||||
# Check value consistency between types
|
||||
rtol = 1e-6
|
||||
assert_allclose(clf_32.coef_, clf_64.coef_, rtol=rtol)
|
||||
|
||||
|
||||
def test_qda():
|
||||
# QDA classification.
|
||||
# This checks that QDA implements fit and predict and returns
|
||||
# correct values for a simple toy dataset.
|
||||
clf = QuadraticDiscriminantAnalysis()
|
||||
y_pred = clf.fit(X6, y6).predict(X6)
|
||||
assert_array_equal(y_pred, y6)
|
||||
|
||||
# Assure that it works with 1D data
|
||||
y_pred1 = clf.fit(X7, y6).predict(X7)
|
||||
assert_array_equal(y_pred1, y6)
|
||||
|
||||
# Test probas estimates
|
||||
y_proba_pred1 = clf.predict_proba(X7)
|
||||
assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y6)
|
||||
y_log_proba_pred1 = clf.predict_log_proba(X7)
|
||||
assert_array_almost_equal(np.exp(y_log_proba_pred1), y_proba_pred1, 8)
|
||||
|
||||
y_pred3 = clf.fit(X6, y7).predict(X6)
|
||||
# QDA shouldn't be able to separate those
|
||||
assert np.any(y_pred3 != y7)
|
||||
|
||||
# Classes should have at least 2 elements
|
||||
assert_raises(ValueError, clf.fit, X6, y4)
|
||||
|
||||
|
||||
def test_qda_priors():
|
||||
clf = QuadraticDiscriminantAnalysis()
|
||||
y_pred = clf.fit(X6, y6).predict(X6)
|
||||
n_pos = np.sum(y_pred == 2)
|
||||
|
||||
neg = 1e-10
|
||||
clf = QuadraticDiscriminantAnalysis(priors=np.array([neg, 1 - neg]))
|
||||
y_pred = clf.fit(X6, y6).predict(X6)
|
||||
n_pos2 = np.sum(y_pred == 2)
|
||||
|
||||
assert n_pos2 > n_pos
|
||||
|
||||
|
||||
def test_qda_store_covariance():
|
||||
# The default is to not set the covariances_ attribute
|
||||
clf = QuadraticDiscriminantAnalysis().fit(X6, y6)
|
||||
assert not hasattr(clf, 'covariance_')
|
||||
|
||||
# Test the actual attribute:
|
||||
clf = QuadraticDiscriminantAnalysis(store_covariance=True).fit(X6, y6)
|
||||
assert hasattr(clf, 'covariance_')
|
||||
|
||||
assert_array_almost_equal(
|
||||
clf.covariance_[0],
|
||||
np.array([[0.7, 0.45], [0.45, 0.7]])
|
||||
)
|
||||
|
||||
assert_array_almost_equal(
|
||||
clf.covariance_[1],
|
||||
np.array([[0.33333333, -0.33333333], [-0.33333333, 0.66666667]])
|
||||
)
|
||||
|
||||
|
||||
def test_qda_regularization():
|
||||
# the default is reg_param=0. and will cause issues
|
||||
# when there is a constant variable
|
||||
clf = QuadraticDiscriminantAnalysis()
|
||||
with ignore_warnings():
|
||||
y_pred = clf.fit(X2, y6).predict(X2)
|
||||
assert np.any(y_pred != y6)
|
||||
|
||||
# adding a little regularization fixes the problem
|
||||
clf = QuadraticDiscriminantAnalysis(reg_param=0.01)
|
||||
with ignore_warnings():
|
||||
clf.fit(X2, y6)
|
||||
y_pred = clf.predict(X2)
|
||||
assert_array_equal(y_pred, y6)
|
||||
|
||||
# Case n_samples_in_a_class < n_features
|
||||
clf = QuadraticDiscriminantAnalysis(reg_param=0.1)
|
||||
with ignore_warnings():
|
||||
clf.fit(X5, y5)
|
||||
y_pred5 = clf.predict(X5)
|
||||
assert_array_equal(y_pred5, y5)
|
||||
|
||||
|
||||
def test_covariance():
|
||||
x, y = make_blobs(n_samples=100, n_features=5,
|
||||
centers=1, random_state=42)
|
||||
|
||||
# make features correlated
|
||||
x = np.dot(x, np.arange(x.shape[1] ** 2).reshape(x.shape[1], x.shape[1]))
|
||||
|
||||
c_e = _cov(x, 'empirical')
|
||||
assert_almost_equal(c_e, c_e.T)
|
||||
|
||||
c_s = _cov(x, 'auto')
|
||||
assert_almost_equal(c_s, c_s.T)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("solver", ['svd', 'lsqr', 'eigen'])
|
||||
def test_raises_value_error_on_same_number_of_classes_and_samples(solver):
|
||||
"""
|
||||
Tests that if the number of samples equals the number
|
||||
of classes, a ValueError is raised.
|
||||
"""
|
||||
X = np.array([[0.5, 0.6], [0.6, 0.5]])
|
||||
y = np.array(["a", "b"])
|
||||
clf = LinearDiscriminantAnalysis(solver=solver)
|
||||
with pytest.raises(ValueError, match="The number of samples must be more"):
|
||||
clf.fit(X, y)
|
|
@@ -0,0 +1,258 @@
|
|||
# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
||||
# Raghav RV <rvraghav93@gmail.com>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import inspect
|
||||
import warnings
|
||||
import importlib
|
||||
|
||||
from pkgutil import walk_packages
|
||||
from inspect import signature
|
||||
|
||||
import numpy as np
|
||||
|
||||
import sklearn
|
||||
from sklearn.utils import IS_PYPY
|
||||
from sklearn.utils._testing import check_docstring_parameters
|
||||
from sklearn.utils._testing import _get_func_name
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
from sklearn.utils._testing import all_estimators
|
||||
from sklearn.utils.estimator_checks import _enforce_estimator_tags_y
|
||||
from sklearn.utils.estimator_checks import _enforce_estimator_tags_x
|
||||
from sklearn.utils.deprecation import _is_deprecated
|
||||
from sklearn.externals._pep562 import Pep562
|
||||
from sklearn.datasets import make_classification
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# walk_packages() ignores DeprecationWarnings, now we need to ignore
|
||||
# FutureWarnings
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter('ignore', FutureWarning)
|
||||
PUBLIC_MODULES = set([
|
||||
pckg[1] for pckg in walk_packages(
|
||||
prefix='sklearn.',
|
||||
# mypy error: Module has no attribute "__path__"
|
||||
path=sklearn.__path__) # type: ignore # mypy issue #1422
|
||||
if not ("._" in pckg[1] or ".tests." in pckg[1])
|
||||
])
|
||||
|
||||
# functions to ignore args / docstring of
|
||||
_DOCSTRING_IGNORES = [
|
||||
'sklearn.utils.deprecation.load_mlcomp',
|
||||
'sklearn.pipeline.make_pipeline',
|
||||
'sklearn.pipeline.make_union',
|
||||
'sklearn.utils.extmath.safe_sparse_dot',
|
||||
'sklearn.utils._joblib'
|
||||
]
|
||||
|
||||
# Methods where y param should be ignored if y=None by default
|
||||
_METHODS_IGNORE_NONE_Y = [
|
||||
'fit',
|
||||
'score',
|
||||
'fit_predict',
|
||||
'fit_transform',
|
||||
'partial_fit',
|
||||
'predict'
|
||||
]
|
||||
|
||||
|
||||
# numpydoc 0.8.0's docscrape tool raises because of collections.abc under
|
||||
# Python 3.7
|
||||
@pytest.mark.filterwarnings('ignore::FutureWarning')
|
||||
@pytest.mark.filterwarnings('ignore::DeprecationWarning')
|
||||
@pytest.mark.skipif(IS_PYPY, reason='test segfaults on PyPy')
|
||||
def test_docstring_parameters():
|
||||
# Test module docstring formatting
|
||||
|
||||
# Skip test if numpydoc is not found
|
||||
pytest.importorskip('numpydoc',
|
||||
reason="numpydoc is required to test the docstrings")
|
||||
|
||||
# XXX unreached code as of v0.22
|
||||
from numpydoc import docscrape
|
||||
|
||||
incorrect = []
|
||||
for name in PUBLIC_MODULES:
|
||||
if name == 'sklearn.utils.fixes':
|
||||
# We cannot always control these docstrings
|
||||
continue
|
||||
with warnings.catch_warnings(record=True):
|
||||
module = importlib.import_module(name)
|
||||
classes = inspect.getmembers(module, inspect.isclass)
|
||||
# Exclude imported classes
|
||||
classes = [cls for cls in classes if cls[1].__module__ == name]
|
||||
for cname, cls in classes:
|
||||
this_incorrect = []
|
||||
if cname in _DOCSTRING_IGNORES or cname.startswith('_'):
|
||||
continue
|
||||
if inspect.isabstract(cls):
|
||||
continue
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
cdoc = docscrape.ClassDoc(cls)
|
||||
if len(w):
|
||||
raise RuntimeError('Error for __init__ of %s in %s:\n%s'
|
||||
% (cls, name, w[0]))
|
||||
|
||||
cls_init = getattr(cls, '__init__', None)
|
||||
|
||||
if _is_deprecated(cls_init):
|
||||
continue
|
||||
elif cls_init is not None:
|
||||
this_incorrect += check_docstring_parameters(
|
||||
cls.__init__, cdoc)
|
||||
|
||||
for method_name in cdoc.methods:
|
||||
method = getattr(cls, method_name)
|
||||
if _is_deprecated(method):
|
||||
continue
|
||||
param_ignore = None
|
||||
# Now skip docstring test for y when y is None
|
||||
# by default for API reason
|
||||
if method_name in _METHODS_IGNORE_NONE_Y:
|
||||
sig = signature(method)
|
||||
if ('y' in sig.parameters and
|
||||
sig.parameters['y'].default is None):
|
||||
param_ignore = ['y'] # ignore y for fit and score
|
||||
result = check_docstring_parameters(
|
||||
method, ignore=param_ignore)
|
||||
this_incorrect += result
|
||||
|
||||
incorrect += this_incorrect
|
||||
|
||||
functions = inspect.getmembers(module, inspect.isfunction)
|
||||
# Exclude imported functions
|
||||
functions = [fn for fn in functions if fn[1].__module__ == name]
|
||||
for fname, func in functions:
|
||||
# Don't test private methods / functions
|
||||
if fname.startswith('_'):
|
||||
continue
|
||||
if fname == "configuration" and name.endswith("setup"):
|
||||
continue
|
||||
name_ = _get_func_name(func)
|
||||
if (not any(d in name_ for d in _DOCSTRING_IGNORES) and
|
||||
not _is_deprecated(func)):
|
||||
incorrect += check_docstring_parameters(func)
|
||||
|
||||
msg = '\n'.join(incorrect)
|
||||
if len(incorrect) > 0:
|
||||
raise AssertionError("Docstring Error:\n" + msg)
|
||||
|
||||
|
||||
@ignore_warnings(category=FutureWarning)
|
||||
def test_tabs():
|
||||
# Test that there are no tabs in our source files
|
||||
for importer, modname, ispkg in walk_packages(sklearn.__path__,
|
||||
prefix='sklearn.'):
|
||||
|
||||
if IS_PYPY and ('_svmlight_format_io' in modname or
|
||||
'feature_extraction._hashing_fast' in modname):
|
||||
continue
|
||||
|
||||
        # import explicitly, because walk_packages() does not import modules
|
||||
mod = importlib.import_module(modname)
|
||||
|
||||
# TODO: Remove when minimum python version is 3.7
|
||||
# unwrap to get module because Pep562 backport wraps the original
|
||||
# module
|
||||
if isinstance(mod, Pep562):
|
||||
mod = mod._module
|
||||
|
||||
try:
|
||||
source = inspect.getsource(mod)
|
||||
except IOError: # user probably should have run "make clean"
|
||||
continue
|
||||
        assert '\t' not in source, ('"%s" has tabs, please remove them '
                                    'or add it to the ignore list'
                                    % modname)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('name, Estimator',
|
||||
all_estimators())
|
||||
def test_fit_docstring_attributes(name, Estimator):
|
||||
pytest.importorskip('numpydoc')
|
||||
from numpydoc import docscrape
|
||||
|
||||
doc = docscrape.ClassDoc(Estimator)
|
||||
attributes = doc['Attributes']
|
||||
|
||||
IGNORED = {'ClassifierChain', 'ColumnTransformer', 'CountVectorizer',
|
||||
'DictVectorizer', 'FeatureUnion', 'GaussianRandomProjection',
|
||||
'GridSearchCV', 'MultiOutputClassifier', 'MultiOutputRegressor',
|
||||
'NoSampleWeightWrapper', 'OneVsOneClassifier',
|
||||
'OneVsRestClassifier', 'OutputCodeClassifier', 'Pipeline',
|
||||
'RFE', 'RFECV', 'RandomizedSearchCV', 'RegressorChain',
|
||||
'SelectFromModel', 'SparseCoder', 'SparseRandomProjection',
|
||||
'SpectralBiclustering', 'StackingClassifier',
|
||||
'StackingRegressor', 'TfidfVectorizer', 'VotingClassifier',
|
||||
'VotingRegressor'}
|
||||
if Estimator.__name__ in IGNORED or Estimator.__name__.startswith('_'):
|
||||
pytest.skip("Estimator cannot be fit easily to test fit attributes")
|
||||
|
||||
est = Estimator()
|
||||
|
||||
if Estimator.__name__ == 'SelectKBest':
|
||||
est.k = 2
|
||||
|
||||
if Estimator.__name__ == 'DummyClassifier':
|
||||
est.strategy = "stratified"
|
||||
|
||||
# TO BE REMOVED for v0.25 (avoid FutureWarning)
|
||||
if Estimator.__name__ == 'AffinityPropagation':
|
||||
est.random_state = 63
|
||||
|
||||
X, y = make_classification(n_samples=20, n_features=3,
|
||||
n_redundant=0, n_classes=2,
|
||||
random_state=2)
|
||||
|
||||
y = _enforce_estimator_tags_y(est, y)
|
||||
X = _enforce_estimator_tags_x(est, X)
|
||||
|
||||
if '1dlabels' in est._get_tags()['X_types']:
|
||||
est.fit(y)
|
||||
elif '2dlabels' in est._get_tags()['X_types']:
|
||||
est.fit(np.c_[y, y])
|
||||
else:
|
||||
est.fit(X, y)
|
||||
|
||||
skipped_attributes = {'n_features_in_'}
|
||||
|
||||
for attr in attributes:
|
||||
if attr.name in skipped_attributes:
|
||||
continue
|
||||
desc = ' '.join(attr.desc).lower()
|
||||
# As certain attributes are present "only" if a certain parameter is
|
||||
# provided, this checks if the word "only" is present in the attribute
|
||||
# description, and if not the attribute is required to be present.
|
||||
if 'only ' not in desc:
|
||||
assert hasattr(est, attr.name)
|
||||
|
||||
IGNORED = {'BayesianRidge', 'Birch', 'CCA', 'CategoricalNB', 'ElasticNet',
|
||||
'ElasticNetCV', 'GaussianProcessClassifier',
|
||||
'GradientBoostingRegressor', 'HistGradientBoostingClassifier',
|
||||
'HistGradientBoostingRegressor', 'IsolationForest',
|
||||
'KNeighborsClassifier', 'KNeighborsRegressor',
|
||||
'KNeighborsTransformer', 'KernelCenterer', 'KernelDensity',
|
||||
'LarsCV', 'Lasso', 'LassoLarsCV', 'LassoLarsIC',
|
||||
'LatentDirichletAllocation', 'LocalOutlierFactor', 'MDS',
|
||||
'MiniBatchKMeans', 'MLPClassifier', 'MLPRegressor',
|
||||
'MultiTaskElasticNet', 'MultiTaskElasticNetCV',
|
||||
'MultiTaskLasso', 'MultiTaskLassoCV', 'NearestNeighbors',
|
||||
'NuSVR', 'OneClassSVM', 'OrthogonalMatchingPursuit',
|
||||
'PLSCanonical', 'PLSRegression', 'PLSSVD',
|
||||
'PassiveAggressiveClassifier', 'Perceptron', 'RBFSampler',
|
||||
'RadiusNeighborsClassifier', 'RadiusNeighborsRegressor',
|
||||
'RadiusNeighborsTransformer', 'RandomTreesEmbedding', 'SVR',
|
||||
'SkewedChi2Sampler'}
|
||||
if Estimator.__name__ in IGNORED:
|
||||
pytest.xfail(
|
||||
reason="Classifier has too many undocumented attributes.")
|
||||
|
||||
fit_attr = [k for k in est.__dict__.keys() if k.endswith('_')
|
||||
and not k.startswith('_')]
|
||||
fit_attr_names = [attr.name for attr in attributes]
|
||||
undocumented_attrs = set(fit_attr).difference(fit_attr_names)
|
||||
undocumented_attrs = set(undocumented_attrs).difference(skipped_attributes)
|
||||
assert not undocumented_attrs,\
|
||||
"Undocumented attributes: {}".format(undocumented_attrs)
|
786
venv/Lib/site-packages/sklearn/tests/test_dummy.py
Normal file
786
venv/Lib/site-packages/sklearn/tests/test_dummy.py
Normal file
|
@ -0,0 +1,786 @@
|
|||
|
||||
import pytest
|
||||
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
|
||||
from sklearn.base import clone
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import assert_almost_equal
|
||||
from sklearn.utils._testing import assert_raises
|
||||
from sklearn.utils._testing import assert_warns_message
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
from sklearn.utils.stats import _weighted_percentile
|
||||
|
||||
from sklearn.dummy import DummyClassifier, DummyRegressor
|
||||
from sklearn.exceptions import NotFittedError
|
||||
|
||||
|
||||
@ignore_warnings
|
||||
def _check_predict_proba(clf, X, y):
|
||||
proba = clf.predict_proba(X)
|
||||
# We know that we can have division by zero
|
||||
log_proba = clf.predict_log_proba(X)
|
||||
|
||||
y = np.atleast_1d(y)
|
||||
if y.ndim == 1:
|
||||
y = np.reshape(y, (-1, 1))
|
||||
|
||||
n_outputs = y.shape[1]
|
||||
n_samples = len(X)
|
||||
|
||||
if n_outputs == 1:
|
||||
proba = [proba]
|
||||
log_proba = [log_proba]
|
||||
|
||||
for k in range(n_outputs):
|
||||
assert proba[k].shape[0] == n_samples
|
||||
assert proba[k].shape[1] == len(np.unique(y[:, k]))
|
||||
assert_array_almost_equal(proba[k].sum(axis=1), np.ones(len(X)))
|
||||
# We know that we can have division by zero
|
||||
assert_array_almost_equal(np.log(proba[k]), log_proba[k])
|
||||
|
||||
|
||||
def _check_behavior_2d(clf):
|
||||
# 1d case
|
||||
X = np.array([[0], [0], [0], [0]]) # ignored
|
||||
y = np.array([1, 2, 1, 1])
|
||||
est = clone(clf)
|
||||
est.fit(X, y)
|
||||
y_pred = est.predict(X)
|
||||
assert y.shape == y_pred.shape
|
||||
|
||||
# 2d case
|
||||
y = np.array([[1, 0],
|
||||
[2, 0],
|
||||
[1, 0],
|
||||
[1, 3]])
|
||||
est = clone(clf)
|
||||
est.fit(X, y)
|
||||
y_pred = est.predict(X)
|
||||
assert y.shape == y_pred.shape
|
||||
|
||||
|
||||
def _check_behavior_2d_for_constant(clf):
|
||||
# 2d case only
|
||||
X = np.array([[0], [0], [0], [0]]) # ignored
|
||||
y = np.array([[1, 0, 5, 4, 3],
|
||||
[2, 0, 1, 2, 5],
|
||||
[1, 0, 4, 5, 2],
|
||||
[1, 3, 3, 2, 0]])
|
||||
est = clone(clf)
|
||||
est.fit(X, y)
|
||||
y_pred = est.predict(X)
|
||||
assert y.shape == y_pred.shape
|
||||
|
||||
|
||||
def _check_equality_regressor(statistic, y_learn, y_pred_learn,
|
||||
y_test, y_pred_test):
|
||||
assert_array_almost_equal(np.tile(statistic, (y_learn.shape[0], 1)),
|
||||
y_pred_learn)
|
||||
assert_array_almost_equal(np.tile(statistic, (y_test.shape[0], 1)),
|
||||
y_pred_test)
|
||||
|
||||
|
||||
def test_most_frequent_and_prior_strategy():
|
||||
X = [[0], [0], [0], [0]] # ignored
|
||||
y = [1, 2, 1, 1]
|
||||
|
||||
for strategy in ("most_frequent", "prior"):
|
||||
clf = DummyClassifier(strategy=strategy, random_state=0)
|
||||
clf.fit(X, y)
|
||||
assert_array_equal(clf.predict(X), np.ones(len(X)))
|
||||
_check_predict_proba(clf, X, y)
|
||||
|
||||
if strategy == "prior":
|
||||
assert_array_almost_equal(clf.predict_proba([X[0]]),
|
||||
clf.class_prior_.reshape((1, -1)))
|
||||
else:
|
||||
assert_array_almost_equal(clf.predict_proba([X[0]]),
|
||||
clf.class_prior_.reshape((1, -1)) > 0.5)
|
||||
|
||||
|
||||
def test_most_frequent_and_prior_strategy_with_2d_column_y():
|
||||
# non-regression test added in
|
||||
# https://github.com/scikit-learn/scikit-learn/pull/13545
|
||||
X = [[0], [0], [0], [0]]
|
||||
y_1d = [1, 2, 1, 1]
|
||||
y_2d = [[1], [2], [1], [1]]
|
||||
|
||||
for strategy in ("most_frequent", "prior"):
|
||||
clf_1d = DummyClassifier(strategy=strategy, random_state=0)
|
||||
clf_2d = DummyClassifier(strategy=strategy, random_state=0)
|
||||
|
||||
clf_1d.fit(X, y_1d)
|
||||
clf_2d.fit(X, y_2d)
|
||||
assert_array_equal(clf_1d.predict(X), clf_2d.predict(X))
|
||||
|
||||
|
||||
def test_most_frequent_and_prior_strategy_multioutput():
|
||||
X = [[0], [0], [0], [0]] # ignored
|
||||
y = np.array([[1, 0],
|
||||
[2, 0],
|
||||
[1, 0],
|
||||
[1, 3]])
|
||||
|
||||
n_samples = len(X)
|
||||
|
||||
for strategy in ("prior", "most_frequent"):
|
||||
clf = DummyClassifier(strategy=strategy, random_state=0)
|
||||
clf.fit(X, y)
|
||||
assert_array_equal(clf.predict(X),
|
||||
np.hstack([np.ones((n_samples, 1)),
|
||||
np.zeros((n_samples, 1))]))
|
||||
_check_predict_proba(clf, X, y)
|
||||
_check_behavior_2d(clf)
|
||||
|
||||
|
||||
def test_stratified_strategy():
|
||||
X = [[0]] * 5 # ignored
|
||||
y = [1, 2, 1, 1, 2]
|
||||
clf = DummyClassifier(strategy="stratified", random_state=0)
|
||||
clf.fit(X, y)
|
||||
|
||||
X = [[0]] * 500
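    # with 500 draws, the empirical label frequencies should match the
    # training proportions (3/5 and 2/5) to roughly one decimal place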
|
||||
y_pred = clf.predict(X)
|
||||
p = np.bincount(y_pred) / float(len(X))
|
||||
assert_almost_equal(p[1], 3. / 5, decimal=1)
|
||||
assert_almost_equal(p[2], 2. / 5, decimal=1)
|
||||
_check_predict_proba(clf, X, y)
|
||||
|
||||
|
||||
def test_stratified_strategy_multioutput():
|
||||
X = [[0]] * 5 # ignored
|
||||
y = np.array([[2, 1],
|
||||
[2, 2],
|
||||
[1, 1],
|
||||
[1, 2],
|
||||
[1, 1]])
|
||||
|
||||
clf = DummyClassifier(strategy="stratified", random_state=0)
|
||||
clf.fit(X, y)
|
||||
|
||||
X = [[0]] * 500
|
||||
y_pred = clf.predict(X)
|
||||
|
||||
for k in range(y.shape[1]):
|
||||
p = np.bincount(y_pred[:, k]) / float(len(X))
|
||||
assert_almost_equal(p[1], 3. / 5, decimal=1)
|
||||
assert_almost_equal(p[2], 2. / 5, decimal=1)
|
||||
_check_predict_proba(clf, X, y)
|
||||
|
||||
_check_behavior_2d(clf)
|
||||
|
||||
|
||||
def test_uniform_strategy():
|
||||
X = [[0]] * 4 # ignored
|
||||
y = [1, 2, 1, 1]
|
||||
clf = DummyClassifier(strategy="uniform", random_state=0)
|
||||
clf.fit(X, y)
|
||||
|
||||
X = [[0]] * 500
|
||||
y_pred = clf.predict(X)
|
||||
p = np.bincount(y_pred) / float(len(X))
|
||||
assert_almost_equal(p[1], 0.5, decimal=1)
|
||||
assert_almost_equal(p[2], 0.5, decimal=1)
|
||||
_check_predict_proba(clf, X, y)
|
||||
|
||||
|
||||
def test_uniform_strategy_multioutput():
|
||||
X = [[0]] * 4 # ignored
|
||||
y = np.array([[2, 1],
|
||||
[2, 2],
|
||||
[1, 2],
|
||||
[1, 1]])
|
||||
clf = DummyClassifier(strategy="uniform", random_state=0)
|
||||
clf.fit(X, y)
|
||||
|
||||
X = [[0]] * 500
|
||||
y_pred = clf.predict(X)
|
||||
|
||||
for k in range(y.shape[1]):
|
||||
p = np.bincount(y_pred[:, k]) / float(len(X))
|
||||
assert_almost_equal(p[1], 0.5, decimal=1)
|
||||
assert_almost_equal(p[2], 0.5, decimal=1)
|
||||
_check_predict_proba(clf, X, y)
|
||||
|
||||
_check_behavior_2d(clf)
|
||||
|
||||
|
||||
def test_string_labels():
|
||||
X = [[0]] * 5
|
||||
y = ["paris", "paris", "tokyo", "amsterdam", "berlin"]
|
||||
clf = DummyClassifier(strategy="most_frequent")
|
||||
clf.fit(X, y)
|
||||
assert_array_equal(clf.predict(X), ["paris"] * 5)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("y,y_test", [
|
||||
([2, 1, 1, 1], [2, 2, 1, 1]),
|
||||
(np.array([[2, 2],
|
||||
[1, 1],
|
||||
[1, 1],
|
||||
[1, 1]]),
|
||||
np.array([[2, 2],
|
||||
[2, 2],
|
||||
[1, 1],
|
||||
[1, 1]]))
|
||||
])
|
||||
def test_classifier_score_with_None(y, y_test):
|
||||
clf = DummyClassifier(strategy="most_frequent")
|
||||
clf.fit(None, y)
|
||||
assert clf.score(None, y_test) == 0.5
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strategy", [
|
||||
"stratified",
|
||||
"most_frequent",
|
||||
"prior",
|
||||
"uniform",
|
||||
"constant"
|
||||
])
|
||||
def test_classifier_prediction_independent_of_X(strategy):
|
||||
y = [0, 2, 1, 1]
|
||||
X1 = [[0]] * 4
|
||||
clf1 = DummyClassifier(strategy=strategy, random_state=0, constant=0)
|
||||
clf1.fit(X1, y)
|
||||
predictions1 = clf1.predict(X1)
|
||||
|
||||
X2 = [[1]] * 4
|
||||
clf2 = DummyClassifier(strategy=strategy, random_state=0, constant=0)
|
||||
clf2.fit(X2, y)
|
||||
predictions2 = clf2.predict(X2)
|
||||
|
||||
assert_array_equal(predictions1, predictions2)
|
||||
|
||||
|
||||
def test_classifier_exceptions():
|
||||
clf = DummyClassifier(strategy="unknown")
|
||||
assert_raises(ValueError, clf.fit, [], [])
|
||||
|
||||
assert_raises(NotFittedError, clf.predict, [])
|
||||
assert_raises(NotFittedError, clf.predict_proba, [])
|
||||
|
||||
|
||||
def test_mean_strategy_regressor():
|
||||
|
||||
random_state = np.random.RandomState(seed=1)
|
||||
|
||||
X = [[0]] * 4 # ignored
|
||||
y = random_state.randn(4)
|
||||
|
||||
reg = DummyRegressor()
|
||||
reg.fit(X, y)
|
||||
assert_array_equal(reg.predict(X), [np.mean(y)] * len(X))
|
||||
|
||||
|
||||
def test_mean_strategy_multioutput_regressor():
|
||||
|
||||
random_state = np.random.RandomState(seed=1)
|
||||
|
||||
X_learn = random_state.randn(10, 10)
|
||||
y_learn = random_state.randn(10, 5)
|
||||
|
||||
mean = np.mean(y_learn, axis=0).reshape((1, -1))
|
||||
|
||||
X_test = random_state.randn(20, 10)
|
||||
y_test = random_state.randn(20, 5)
|
||||
|
||||
# Correctness oracle
|
||||
est = DummyRegressor()
|
||||
est.fit(X_learn, y_learn)
|
||||
y_pred_learn = est.predict(X_learn)
|
||||
y_pred_test = est.predict(X_test)
|
||||
|
||||
_check_equality_regressor(mean, y_learn, y_pred_learn, y_test, y_pred_test)
|
||||
_check_behavior_2d(est)
|
||||
|
||||
|
||||
def test_regressor_exceptions():
|
||||
reg = DummyRegressor()
|
||||
assert_raises(NotFittedError, reg.predict, [])
|
||||
|
||||
|
||||
def test_median_strategy_regressor():
|
||||
|
||||
random_state = np.random.RandomState(seed=1)
|
||||
|
||||
X = [[0]] * 5 # ignored
|
||||
y = random_state.randn(5)
|
||||
|
||||
reg = DummyRegressor(strategy="median")
|
||||
reg.fit(X, y)
|
||||
assert_array_equal(reg.predict(X), [np.median(y)] * len(X))
|
||||
|
||||
|
||||
def test_median_strategy_multioutput_regressor():
|
||||
|
||||
random_state = np.random.RandomState(seed=1)
|
||||
|
||||
X_learn = random_state.randn(10, 10)
|
||||
y_learn = random_state.randn(10, 5)
|
||||
|
||||
median = np.median(y_learn, axis=0).reshape((1, -1))
|
||||
|
||||
X_test = random_state.randn(20, 10)
|
||||
y_test = random_state.randn(20, 5)
|
||||
|
||||
# Correctness oracle
|
||||
est = DummyRegressor(strategy="median")
|
||||
est.fit(X_learn, y_learn)
|
||||
y_pred_learn = est.predict(X_learn)
|
||||
y_pred_test = est.predict(X_test)
|
||||
|
||||
_check_equality_regressor(
|
||||
median, y_learn, y_pred_learn, y_test, y_pred_test)
|
||||
_check_behavior_2d(est)
|
||||
|
||||
|
||||
def test_quantile_strategy_regressor():
|
||||
|
||||
random_state = np.random.RandomState(seed=1)
|
||||
|
||||
X = [[0]] * 5 # ignored
|
||||
y = random_state.randn(5)
|
||||
|
||||
reg = DummyRegressor(strategy="quantile", quantile=0.5)
|
||||
reg.fit(X, y)
|
||||
assert_array_equal(reg.predict(X), [np.median(y)] * len(X))
|
||||
|
||||
reg = DummyRegressor(strategy="quantile", quantile=0)
|
||||
reg.fit(X, y)
|
||||
assert_array_equal(reg.predict(X), [np.min(y)] * len(X))
|
||||
|
||||
reg = DummyRegressor(strategy="quantile", quantile=1)
|
||||
reg.fit(X, y)
|
||||
assert_array_equal(reg.predict(X), [np.max(y)] * len(X))
|
||||
|
||||
reg = DummyRegressor(strategy="quantile", quantile=0.3)
|
||||
reg.fit(X, y)
|
||||
assert_array_equal(reg.predict(X), [np.percentile(y, q=30)] * len(X))
|
||||
|
||||
|
||||
def test_quantile_strategy_multioutput_regressor():
|
||||
|
||||
random_state = np.random.RandomState(seed=1)
|
||||
|
||||
X_learn = random_state.randn(10, 10)
|
||||
y_learn = random_state.randn(10, 5)
|
||||
|
||||
median = np.median(y_learn, axis=0).reshape((1, -1))
|
||||
quantile_values = np.percentile(y_learn, axis=0, q=80).reshape((1, -1))
|
||||
|
||||
X_test = random_state.randn(20, 10)
|
||||
y_test = random_state.randn(20, 5)
|
||||
|
||||
# Correctness oracle
|
||||
est = DummyRegressor(strategy="quantile", quantile=0.5)
|
||||
est.fit(X_learn, y_learn)
|
||||
y_pred_learn = est.predict(X_learn)
|
||||
y_pred_test = est.predict(X_test)
|
||||
|
||||
_check_equality_regressor(
|
||||
median, y_learn, y_pred_learn, y_test, y_pred_test)
|
||||
_check_behavior_2d(est)
|
||||
|
||||
# Correctness oracle
|
||||
est = DummyRegressor(strategy="quantile", quantile=0.8)
|
||||
est.fit(X_learn, y_learn)
|
||||
y_pred_learn = est.predict(X_learn)
|
||||
y_pred_test = est.predict(X_test)
|
||||
|
||||
_check_equality_regressor(
|
||||
quantile_values, y_learn, y_pred_learn, y_test, y_pred_test)
|
||||
_check_behavior_2d(est)
|
||||
|
||||
|
||||
def test_quantile_invalid():
|
||||
|
||||
X = [[0]] * 5 # ignored
|
||||
y = [0] * 5 # ignored
|
||||
|
||||
est = DummyRegressor(strategy="quantile")
|
||||
assert_raises(ValueError, est.fit, X, y)
|
||||
|
||||
est = DummyRegressor(strategy="quantile", quantile=None)
|
||||
assert_raises(ValueError, est.fit, X, y)
|
||||
|
||||
est = DummyRegressor(strategy="quantile", quantile=[0])
|
||||
assert_raises(ValueError, est.fit, X, y)
|
||||
|
||||
est = DummyRegressor(strategy="quantile", quantile=-0.1)
|
||||
assert_raises(ValueError, est.fit, X, y)
|
||||
|
||||
est = DummyRegressor(strategy="quantile", quantile=1.1)
|
||||
assert_raises(ValueError, est.fit, X, y)
|
||||
|
||||
est = DummyRegressor(strategy="quantile", quantile='abc')
|
||||
assert_raises(TypeError, est.fit, X, y)
|
||||
|
||||
|
||||
def test_quantile_strategy_empty_train():
|
||||
est = DummyRegressor(strategy="quantile", quantile=0.4)
|
||||
assert_raises(ValueError, est.fit, [], [])
|
||||
|
||||
|
||||
def test_constant_strategy_regressor():
|
||||
|
||||
random_state = np.random.RandomState(seed=1)
|
||||
|
||||
X = [[0]] * 5 # ignored
|
||||
y = random_state.randn(5)
|
||||
|
||||
reg = DummyRegressor(strategy="constant", constant=[43])
|
||||
reg.fit(X, y)
|
||||
assert_array_equal(reg.predict(X), [43] * len(X))
|
||||
|
||||
reg = DummyRegressor(strategy="constant", constant=43)
|
||||
reg.fit(X, y)
|
||||
assert_array_equal(reg.predict(X), [43] * len(X))
|
||||
|
||||
|
||||
def test_constant_strategy_multioutput_regressor():
|
||||
|
||||
random_state = np.random.RandomState(seed=1)
|
||||
|
||||
X_learn = random_state.randn(10, 10)
|
||||
y_learn = random_state.randn(10, 5)
|
||||
|
||||
# test with 2d array
|
||||
constants = random_state.randn(5)
|
||||
|
||||
X_test = random_state.randn(20, 10)
|
||||
y_test = random_state.randn(20, 5)
|
||||
|
||||
# Correctness oracle
|
||||
est = DummyRegressor(strategy="constant", constant=constants)
|
||||
est.fit(X_learn, y_learn)
|
||||
y_pred_learn = est.predict(X_learn)
|
||||
y_pred_test = est.predict(X_test)
|
||||
|
||||
_check_equality_regressor(
|
||||
constants, y_learn, y_pred_learn, y_test, y_pred_test)
|
||||
_check_behavior_2d_for_constant(est)
|
||||
|
||||
|
||||
def test_y_mean_attribute_regressor():
|
||||
X = [[0]] * 5
|
||||
y = [1, 2, 4, 6, 8]
|
||||
# when strategy = 'mean'
|
||||
est = DummyRegressor(strategy='mean')
|
||||
est.fit(X, y)
|
||||
|
||||
assert est.constant_ == np.mean(y)
|
||||
|
||||
|
||||
def test_unknown_strategy_regressor():
|
||||
X = [[0]] * 5
|
||||
y = [1, 2, 4, 6, 8]
|
||||
|
||||
est = DummyRegressor(strategy='gona')
|
||||
assert_raises(ValueError, est.fit, X, y)
|
||||
|
||||
|
||||
def test_constants_not_specified_regressor():
|
||||
X = [[0]] * 5
|
||||
y = [1, 2, 4, 6, 8]
|
||||
|
||||
est = DummyRegressor(strategy='constant')
|
||||
assert_raises(TypeError, est.fit, X, y)
|
||||
|
||||
|
||||
def test_constant_size_multioutput_regressor():
|
||||
random_state = np.random.RandomState(seed=1)
|
||||
X = random_state.randn(10, 10)
|
||||
y = random_state.randn(10, 5)
|
||||
|
||||
est = DummyRegressor(strategy='constant', constant=[1, 2, 3, 4])
|
||||
assert_raises(ValueError, est.fit, X, y)
|
||||
|
||||
|
||||
def test_constant_strategy():
|
||||
X = [[0], [0], [0], [0]] # ignored
|
||||
y = [2, 1, 2, 2]
|
||||
|
||||
clf = DummyClassifier(strategy="constant", random_state=0, constant=1)
|
||||
clf.fit(X, y)
|
||||
assert_array_equal(clf.predict(X), np.ones(len(X)))
|
||||
_check_predict_proba(clf, X, y)
|
||||
|
||||
X = [[0], [0], [0], [0]] # ignored
|
||||
y = ['two', 'one', 'two', 'two']
|
||||
clf = DummyClassifier(strategy="constant", random_state=0, constant='one')
|
||||
clf.fit(X, y)
|
||||
assert_array_equal(clf.predict(X), np.array(['one'] * 4))
|
||||
_check_predict_proba(clf, X, y)
|
||||
|
||||
|
||||
def test_constant_strategy_multioutput():
|
||||
X = [[0], [0], [0], [0]] # ignored
|
||||
y = np.array([[2, 3],
|
||||
[1, 3],
|
||||
[2, 3],
|
||||
[2, 0]])
|
||||
|
||||
n_samples = len(X)
|
||||
|
||||
clf = DummyClassifier(strategy="constant", random_state=0,
|
||||
constant=[1, 0])
|
||||
clf.fit(X, y)
|
||||
assert_array_equal(clf.predict(X),
|
||||
np.hstack([np.ones((n_samples, 1)),
|
||||
np.zeros((n_samples, 1))]))
|
||||
_check_predict_proba(clf, X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('y, params, err_msg', [
|
||||
([2, 1, 2, 2],
|
||||
{'random_state': 0},
|
||||
"Constant.*has to be specified"),
|
||||
([2, 1, 2, 2],
|
||||
{'constant': [2, 0]},
|
||||
"Constant.*should have shape"),
|
||||
(np.transpose([[2, 1, 2, 2], [2, 1, 2, 2]]),
|
||||
{'constant': 2},
|
||||
"Constant.*should have shape"),
|
||||
([2, 1, 2, 2],
|
||||
{'constant': 'my-constant'},
|
||||
"constant=my-constant.*Possible values.*\\[1, 2]"),
|
||||
(np.transpose([[2, 1, 2, 2], [2, 1, 2, 2]]),
|
||||
{'constant': [2, 'unknown']},
|
||||
"constant=\\[2, 'unknown'].*Possible values.*\\[1, 2]")],
|
||||
ids=["no-constant", "too-many-constant", "not-enough-output",
|
||||
"single-output", "multi-output"]
|
||||
)
|
||||
def test_constant_strategy_exceptions(y, params, err_msg):
|
||||
X = [[0], [0], [0], [0]]
|
||||
|
||||
clf = DummyClassifier(strategy="constant", **params)
|
||||
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
clf.fit(X, y)
|
||||
|
||||
|
||||
def test_classification_sample_weight():
|
||||
X = [[0], [0], [1]]
|
||||
y = [0, 1, 0]
|
||||
sample_weight = [0.1, 1., 0.1]
|
||||
|
||||
clf = DummyClassifier(strategy="stratified").fit(X, y, sample_weight)
|
||||
assert_array_almost_equal(clf.class_prior_, [0.2 / 1.2, 1. / 1.2])
|
||||
|
||||
|
||||
def test_constant_strategy_sparse_target():
|
||||
X = [[0]] * 5 # ignored
|
||||
y = sp.csc_matrix(np.array([[0, 1],
|
||||
[4, 0],
|
||||
[1, 1],
|
||||
[1, 4],
|
||||
[1, 1]]))
|
||||
|
||||
n_samples = len(X)
|
||||
|
||||
clf = DummyClassifier(strategy="constant", random_state=0, constant=[1, 0])
|
||||
clf.fit(X, y)
|
||||
y_pred = clf.predict(X)
|
||||
assert sp.issparse(y_pred)
|
||||
assert_array_equal(y_pred.toarray(), np.hstack([np.ones((n_samples, 1)),
|
||||
np.zeros((n_samples, 1))]))
|
||||
|
||||
|
||||
def test_uniform_strategy_sparse_target_warning():
|
||||
X = [[0]] * 5 # ignored
|
||||
y = sp.csc_matrix(np.array([[2, 1],
|
||||
[2, 2],
|
||||
[1, 4],
|
||||
[4, 2],
|
||||
[1, 1]]))
|
||||
|
||||
clf = DummyClassifier(strategy="uniform", random_state=0)
|
||||
assert_warns_message(UserWarning,
|
||||
"the uniform strategy would not save memory",
|
||||
clf.fit, X, y)
|
||||
|
||||
X = [[0]] * 500
|
||||
y_pred = clf.predict(X)
|
||||
|
||||
for k in range(y.shape[1]):
|
||||
p = np.bincount(y_pred[:, k]) / float(len(X))
|
||||
assert_almost_equal(p[1], 1 / 3, decimal=1)
|
||||
assert_almost_equal(p[2], 1 / 3, decimal=1)
|
||||
assert_almost_equal(p[4], 1 / 3, decimal=1)
|
||||
|
||||
|
||||
def test_stratified_strategy_sparse_target():
|
||||
X = [[0]] * 5 # ignored
|
||||
y = sp.csc_matrix(np.array([[4, 1],
|
||||
[0, 0],
|
||||
[1, 1],
|
||||
[1, 4],
|
||||
[1, 1]]))
|
||||
|
||||
clf = DummyClassifier(strategy="stratified", random_state=0)
|
||||
clf.fit(X, y)
|
||||
|
||||
X = [[0]] * 500
|
||||
y_pred = clf.predict(X)
|
||||
assert sp.issparse(y_pred)
|
||||
y_pred = y_pred.toarray()
|
||||
|
||||
for k in range(y.shape[1]):
|
||||
p = np.bincount(y_pred[:, k]) / float(len(X))
|
||||
assert_almost_equal(p[1], 3. / 5, decimal=1)
|
||||
assert_almost_equal(p[0], 1. / 5, decimal=1)
|
||||
assert_almost_equal(p[4], 1. / 5, decimal=1)
|
||||
|
||||
|
||||
def test_most_frequent_and_prior_strategy_sparse_target():
|
||||
X = [[0]] * 5 # ignored
|
||||
y = sp.csc_matrix(np.array([[1, 0],
|
||||
[1, 3],
|
||||
[4, 0],
|
||||
[0, 1],
|
||||
[1, 0]]))
|
||||
|
||||
n_samples = len(X)
|
||||
y_expected = np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])
|
||||
for strategy in ("most_frequent", "prior"):
|
||||
clf = DummyClassifier(strategy=strategy, random_state=0)
|
||||
clf.fit(X, y)
|
||||
|
||||
y_pred = clf.predict(X)
|
||||
assert sp.issparse(y_pred)
|
||||
assert_array_equal(y_pred.toarray(), y_expected)
|
||||
|
||||
|
||||
def test_dummy_regressor_sample_weight(n_samples=10):
|
||||
random_state = np.random.RandomState(seed=1)
|
||||
|
||||
X = [[0]] * n_samples
|
||||
y = random_state.rand(n_samples)
|
||||
sample_weight = random_state.rand(n_samples)
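    # for each strategy, the fitted constant_ should equal the corresponding
    # sample-weighted statistic of y (weighted mean, median or 95th percentile)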
|
||||
|
||||
est = DummyRegressor(strategy="mean").fit(X, y, sample_weight)
|
||||
assert est.constant_ == np.average(y, weights=sample_weight)
|
||||
|
||||
est = DummyRegressor(strategy="median").fit(X, y, sample_weight)
|
||||
assert est.constant_ == _weighted_percentile(y, sample_weight, 50.)
|
||||
|
||||
est = DummyRegressor(strategy="quantile", quantile=.95).fit(X, y,
|
||||
sample_weight)
|
||||
assert est.constant_ == _weighted_percentile(y, sample_weight, 95.)
|
||||
|
||||
|
||||
def test_dummy_regressor_on_3D_array():
|
||||
X = np.array([[['foo']], [['bar']], [['baz']]])
|
||||
y = np.array([2, 2, 2])
|
||||
y_expected = np.array([2, 2, 2])
|
||||
cls = DummyRegressor()
|
||||
cls.fit(X, y)
|
||||
y_pred = cls.predict(X)
|
||||
assert_array_equal(y_pred, y_expected)
|
||||
|
||||
|
||||
def test_dummy_classifier_on_3D_array():
|
||||
X = np.array([[['foo']], [['bar']], [['baz']]])
|
||||
y = [2, 2, 2]
|
||||
y_expected = [2, 2, 2]
|
||||
y_proba_expected = [[1], [1], [1]]
|
||||
cls = DummyClassifier(strategy="stratified")
|
||||
cls.fit(X, y)
|
||||
y_pred = cls.predict(X)
|
||||
y_pred_proba = cls.predict_proba(X)
|
||||
assert_array_equal(y_pred, y_expected)
|
||||
assert_array_equal(y_pred_proba, y_proba_expected)
|
||||
|
||||
|
||||
def test_dummy_regressor_return_std():
|
||||
X = [[0]] * 3 # ignored
|
||||
y = np.array([2, 2, 2])
|
||||
y_std_expected = np.array([0, 0, 0])
|
||||
cls = DummyRegressor()
|
||||
cls.fit(X, y)
|
||||
y_pred_list = cls.predict(X, return_std=True)
|
||||
# there should be two elements when return_std is True
|
||||
assert len(y_pred_list) == 2
|
||||
# the second element should be all zeros
|
||||
assert_array_equal(y_pred_list[1], y_std_expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("y,y_test", [
|
||||
([1, 1, 1, 2], [1.25] * 4),
|
||||
(np.array([[2, 2],
|
||||
[1, 1],
|
||||
[1, 1],
|
||||
[1, 1]]),
|
||||
[[1.25, 1.25]] * 4)
|
||||
|
||||
])
|
||||
def test_regressor_score_with_None(y, y_test):
|
||||
reg = DummyRegressor()
|
||||
reg.fit(None, y)
|
||||
assert reg.score(None, y_test) == 1.0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strategy", [
|
||||
"mean",
|
||||
"median",
|
||||
"quantile",
|
||||
"constant"
|
||||
])
|
||||
def test_regressor_prediction_independent_of_X(strategy):
|
||||
y = [0, 2, 1, 1]
|
||||
X1 = [[0]] * 4
|
||||
reg1 = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
|
||||
reg1.fit(X1, y)
|
||||
predictions1 = reg1.predict(X1)
|
||||
|
||||
X2 = [[1]] * 4
|
||||
reg2 = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
|
||||
reg2.fit(X2, y)
|
||||
predictions2 = reg2.predict(X2)
|
||||
|
||||
assert_array_equal(predictions1, predictions2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"strategy", ["stratified", "most_frequent", "prior", "uniform", "constant"]
|
||||
)
|
||||
def test_dtype_of_classifier_probas(strategy):
|
||||
y = [0, 2, 1, 1]
|
||||
X = np.zeros(4)
|
||||
model = DummyClassifier(strategy=strategy, random_state=0, constant=0)
|
||||
probas = model.fit(X, y).predict_proba(X)
|
||||
|
||||
assert probas.dtype == np.float64
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:The default value of strategy.*") # 0.24
|
||||
@pytest.mark.parametrize('Dummy', (DummyRegressor, DummyClassifier))
|
||||
def test_n_features_in_(Dummy):
|
||||
X = [[1, 2]]
|
||||
y = [0]
|
||||
d = Dummy()
|
||||
assert not hasattr(d, 'n_features_in_')
|
||||
d.fit(X, y)
|
||||
assert d.n_features_in_ is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("Dummy", (DummyRegressor, DummyClassifier))
|
||||
def test_outputs_2d_deprecation(Dummy):
|
||||
X = [[1, 2]]
|
||||
y = [0]
|
||||
with pytest.warns(FutureWarning,
|
||||
match="will be removed in version 0.24"):
|
||||
Dummy().fit(X, y).outputs_2d_
|
||||
|
||||
|
||||
# TODO: Remove in 0.24 when DummyClassifier's `strategy` default updates
|
||||
def test_strategy_stratified_deprecated_for_prior():
|
||||
X, y = [[1, 2]], [0]
|
||||
|
||||
msg = ("The default value of strategy will change from "
|
||||
"stratified to prior in 0.24")
|
||||
with pytest.warns(FutureWarning, match=msg):
|
||||
DummyClassifier().fit(X, y)
|
|
@@ -0,0 +1,53 @@
|
|||
import textwrap
|
||||
|
||||
import pytest
|
||||
|
||||
from sklearn.utils._testing import assert_run_python_script
|
||||
from sklearn._build_utils.deprecated_modules import _DEPRECATED_MODULES
|
||||
|
||||
|
||||
# We are deprecating importing anything that isn't in an __init__ file and
|
||||
# renaming most file.py into _file.py.
|
||||
# This test makes sure imports are still possible but deprecated, with the
|
||||
# appropriate error message.
|
||||
|
||||
|
||||
@pytest.mark.parametrize('deprecated_path, importee', [
|
||||
(deprecated_path, importee)
|
||||
for _, deprecated_path, _, importee in _DEPRECATED_MODULES
|
||||
])
|
||||
def test_import_is_deprecated(deprecated_path, importee):
|
||||
# Make sure that "from deprecated_path import importee" is still possible
|
||||
# but raises a warning
|
||||
# We only need one entry per file, no need to check multiple imports from
|
||||
# the same file.
|
||||
|
||||
# TODO: remove in 0.24
|
||||
|
||||
# Special case for:
|
||||
# https://github.com/scikit-learn/scikit-learn/issues/15842
|
||||
if deprecated_path in ("sklearn.decomposition.dict_learning",
|
||||
"sklearn.inspection.partial_dependence"):
|
||||
pytest.skip("No warning can be raised for " + deprecated_path)
|
||||
|
||||
expected_message = (
|
||||
"The {deprecated_path} module is deprecated in version "
|
||||
"0.22 and will be removed in version 0.24. "
|
||||
"The corresponding classes / functions "
|
||||
"should instead be imported from .*. "
|
||||
"Anything that cannot be imported from .* is now "
|
||||
"part of the private API."
|
||||
).format(deprecated_path=deprecated_path)
|
||||
|
||||
script = """
|
||||
import pytest
|
||||
|
||||
with pytest.warns(FutureWarning,
|
||||
match="{expected_message}"):
|
||||
from {deprecated_path} import {importee}
|
||||
""".format(
|
||||
expected_message=expected_message,
|
||||
deprecated_path=deprecated_path,
|
||||
importee=importee
|
||||
)
|
||||
assert_run_python_script(textwrap.dedent(script))
|
19
venv/Lib/site-packages/sklearn/tests/test_init.py
Normal file
@@ -0,0 +1,19 @@
|
|||
# Basic unittests to test functioning of module's top-level
|
||||
|
||||
|
||||
__author__ = 'Yaroslav Halchenko'
|
||||
__license__ = 'BSD'
|
||||
|
||||
|
||||
try:
|
||||
from sklearn import * # noqa
|
||||
_top_import_error = None
|
||||
except Exception as e:
|
||||
_top_import_error = e
|
||||
|
||||
|
||||
def test_import_skl():
|
||||
    # Test whether the above import has failed for some reason
|
||||
# "import *" is discouraged outside of the module level, hence we
|
||||
# rely on setting up the variable above
|
||||
assert _top_import_error is None
|
510
venv/Lib/site-packages/sklearn/tests/test_isotonic.py
Normal file
@@ -0,0 +1,510 @@
|
|||
import warnings
|
||||
import numpy as np
|
||||
import pickle
|
||||
import copy
|
||||
|
||||
import pytest
|
||||
|
||||
from sklearn.isotonic import (check_increasing, isotonic_regression,
|
||||
IsotonicRegression, _make_unique)
|
||||
|
||||
from sklearn.utils.validation import check_array
|
||||
from sklearn.utils._testing import (assert_raises, assert_array_equal,
|
||||
assert_array_almost_equal,
|
||||
assert_warns_message, assert_no_warnings)
|
||||
from sklearn.utils import shuffle
|
||||
|
||||
from scipy.special import expit
|
||||
|
||||
|
||||
def test_permutation_invariance():
|
||||
# check that fit is permutation invariant.
|
||||
# regression test of missing sorting of sample-weights
|
||||
ir = IsotonicRegression()
|
||||
x = [1, 2, 3, 4, 5, 6, 7]
|
||||
y = [1, 41, 51, 1, 2, 5, 24]
|
||||
sample_weight = [1, 2, 3, 4, 5, 6, 7]
|
||||
x_s, y_s, sample_weight_s = shuffle(x, y, sample_weight, random_state=0)
|
||||
y_transformed = ir.fit_transform(x, y, sample_weight=sample_weight)
|
||||
y_transformed_s = \
|
||||
ir.fit(x_s, y_s, sample_weight=sample_weight_s).transform(x)
|
||||
|
||||
assert_array_equal(y_transformed, y_transformed_s)
|
||||
|
||||
|
||||
def test_check_increasing_small_number_of_samples():
|
||||
x = [0, 1, 2]
|
||||
y = [1, 1.1, 1.05]
|
||||
|
||||
is_increasing = assert_no_warnings(check_increasing, x, y)
|
||||
assert is_increasing
|
||||
|
||||
|
||||
def test_check_increasing_up():
|
||||
x = [0, 1, 2, 3, 4, 5]
|
||||
y = [0, 1.5, 2.77, 8.99, 8.99, 50]
|
||||
|
||||
# Check that we got increasing=True and no warnings
|
||||
is_increasing = assert_no_warnings(check_increasing, x, y)
|
||||
assert is_increasing
|
||||
|
||||
|
||||
def test_check_increasing_up_extreme():
|
||||
x = [0, 1, 2, 3, 4, 5]
|
||||
y = [0, 1, 2, 3, 4, 5]
|
||||
|
||||
# Check that we got increasing=True and no warnings
|
||||
is_increasing = assert_no_warnings(check_increasing, x, y)
|
||||
assert is_increasing
|
||||
|
||||
|
||||
def test_check_increasing_down():
|
||||
x = [0, 1, 2, 3, 4, 5]
|
||||
y = [0, -1.5, -2.77, -8.99, -8.99, -50]
|
||||
|
||||
# Check that we got increasing=False and no warnings
|
||||
is_increasing = assert_no_warnings(check_increasing, x, y)
|
||||
assert not is_increasing
|
||||
|
||||
|
||||
def test_check_increasing_down_extreme():
|
||||
x = [0, 1, 2, 3, 4, 5]
|
||||
y = [0, -1, -2, -3, -4, -5]
|
||||
|
||||
# Check that we got increasing=False and no warnings
|
||||
is_increasing = assert_no_warnings(check_increasing, x, y)
|
||||
assert not is_increasing
|
||||
|
||||
|
||||
def test_check_ci_warn():
|
||||
x = [0, 1, 2, 3, 4, 5]
|
||||
y = [0, -1, 2, -3, 4, -5]
|
||||
|
||||
# Check that we got increasing=False and CI interval warning
|
||||
is_increasing = assert_warns_message(UserWarning, "interval",
|
||||
check_increasing,
|
||||
x, y)
|
||||
|
||||
assert not is_increasing
|
||||
|
||||
|
||||
def test_isotonic_regression():
|
||||
y = np.array([3, 7, 5, 9, 8, 7, 10])
|
||||
y_ = np.array([3, 6, 6, 8, 8, 8, 10])
|
||||
assert_array_equal(y_, isotonic_regression(y))
|
||||
|
||||
y = np.array([10, 0, 2])
|
||||
y_ = np.array([4, 4, 4])
|
||||
assert_array_equal(y_, isotonic_regression(y))
|
||||
|
||||
x = np.arange(len(y))
|
||||
ir = IsotonicRegression(y_min=0., y_max=1.)
|
||||
ir.fit(x, y)
|
||||
assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
|
||||
assert_array_equal(ir.transform(x), ir.predict(x))
|
||||
|
||||
# check that it is immune to permutation
|
||||
perm = np.random.permutation(len(y))
|
||||
ir = IsotonicRegression(y_min=0., y_max=1.)
|
||||
assert_array_equal(ir.fit_transform(x[perm], y[perm]),
|
||||
ir.fit_transform(x, y)[perm])
|
||||
assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])
|
||||
|
||||
# check we don't crash when all x are equal:
|
||||
ir = IsotonicRegression()
|
||||
assert_array_equal(ir.fit_transform(np.ones(len(x)), y), np.mean(y))
|
||||
|
||||
|
||||
def test_isotonic_regression_ties_min():
|
||||
# Setup examples with ties on minimum
|
||||
x = [1, 1, 2, 3, 4, 5]
|
||||
y = [1, 2, 3, 4, 5, 6]
|
||||
y_true = [1.5, 1.5, 3, 4, 5, 6]
|
||||
|
||||
# Check that we get identical results for fit/transform and fit_transform
|
||||
ir = IsotonicRegression()
|
||||
ir.fit(x, y)
|
||||
assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
|
||||
assert_array_equal(y_true, ir.fit_transform(x, y))
|
||||
|
||||
|
||||
def test_isotonic_regression_ties_max():
|
||||
# Setup examples with ties on maximum
|
||||
x = [1, 2, 3, 4, 5, 5]
|
||||
y = [1, 2, 3, 4, 5, 6]
|
||||
y_true = [1, 2, 3, 4, 5.5, 5.5]
|
||||
|
||||
# Check that we get identical results for fit/transform and fit_transform
|
||||
ir = IsotonicRegression()
|
||||
ir.fit(x, y)
|
||||
assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
|
||||
assert_array_equal(y_true, ir.fit_transform(x, y))
|
||||
|
||||
|
||||
def test_isotonic_regression_ties_secondary_():
|
||||
"""
|
||||
Test isotonic regression fit, transform and fit_transform
|
||||
against the "secondary" ties method and "pituitary" data from R
|
||||
"isotone" package, as detailed in: J. d. Leeuw, K. Hornik, P. Mair,
|
||||
Isotone Optimization in R: Pool-Adjacent-Violators Algorithm
|
||||
(PAVA) and Active Set Methods
|
||||
|
||||
Set values based on pituitary example and
|
||||
the following R command detailed in the paper above:
|
||||
> library("isotone")
|
||||
> data("pituitary")
|
||||
> res1 <- gpava(pituitary$age, pituitary$size, ties="secondary")
|
||||
> res1$x
|
||||
|
||||
`isotone` version: 1.0-2, 2014-09-07
|
||||
R version: R version 3.1.1 (2014-07-10)
|
||||
"""
|
||||
x = [8, 8, 8, 10, 10, 10, 12, 12, 12, 14, 14]
|
||||
y = [21, 23.5, 23, 24, 21, 25, 21.5, 22, 19, 23.5, 25]
|
||||
y_true = [22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 22.22222,
|
||||
22.22222, 22.22222, 22.22222, 24.25, 24.25]
|
||||
|
||||
# Check fit, transform and fit_transform
|
||||
ir = IsotonicRegression()
|
||||
ir.fit(x, y)
|
||||
assert_array_almost_equal(ir.transform(x), y_true, 4)
|
||||
assert_array_almost_equal(ir.fit_transform(x, y), y_true, 4)
|
||||
|
||||
|
||||
def test_isotonic_regression_with_ties_in_differently_sized_groups():
|
||||
"""
|
||||
Non-regression test to handle issue 9432:
|
||||
https://github.com/scikit-learn/scikit-learn/issues/9432
|
||||
|
||||
Compare against output in R:
|
||||
> library("isotone")
|
||||
> x <- c(0, 1, 1, 2, 3, 4)
|
||||
> y <- c(0, 0, 1, 0, 0, 1)
|
||||
> res1 <- gpava(x, y, ties="secondary")
|
||||
> res1$x
|
||||
|
||||
`isotone` version: 1.1-0, 2015-07-24
|
||||
R version: R version 3.3.2 (2016-10-31)
|
||||
"""
|
||||
x = np.array([0, 1, 1, 2, 3, 4])
|
||||
y = np.array([0, 0, 1, 0, 0, 1])
|
||||
y_true = np.array([0., 0.25, 0.25, 0.25, 0.25, 1.])
|
||||
ir = IsotonicRegression()
|
||||
ir.fit(x, y)
|
||||
assert_array_almost_equal(ir.transform(x), y_true)
|
||||
assert_array_almost_equal(ir.fit_transform(x, y), y_true)
|
||||
|
||||
|
||||
def test_isotonic_regression_reversed():
|
||||
y = np.array([10, 9, 10, 7, 6, 6.1, 5])
|
||||
y_ = IsotonicRegression(increasing=False).fit_transform(
|
||||
np.arange(len(y)), y)
|
||||
assert_array_equal(np.ones(y_[:-1].shape), ((y_[:-1] - y_[1:]) >= 0))
|
||||
|
||||
|
||||
def test_isotonic_regression_auto_decreasing():
|
||||
# Set y and x for decreasing
|
||||
y = np.array([10, 9, 10, 7, 6, 6.1, 5])
|
||||
x = np.arange(len(y))
|
||||
|
||||
# Create model and fit_transform
|
||||
ir = IsotonicRegression(increasing='auto')
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
warnings.simplefilter("always")
|
||||
y_ = ir.fit_transform(x, y)
|
||||
# work-around for pearson divide warnings in scipy <= 0.17.0
|
||||
assert all(["invalid value encountered in "
|
||||
in str(warn.message) for warn in w])
|
||||
|
||||
# Check that relationship decreases
|
||||
is_increasing = y_[0] < y_[-1]
|
||||
assert not is_increasing
|
||||
|
||||
|
||||
def test_isotonic_regression_auto_increasing():
|
||||
    # Set y and x for increasing
|
||||
y = np.array([5, 6.1, 6, 7, 10, 9, 10])
|
||||
x = np.arange(len(y))
|
||||
|
||||
# Create model and fit_transform
|
||||
ir = IsotonicRegression(increasing='auto')
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
warnings.simplefilter("always")
|
||||
y_ = ir.fit_transform(x, y)
|
||||
# work-around for pearson divide warnings in scipy <= 0.17.0
|
||||
assert all(["invalid value encountered in "
|
||||
in str(warn.message) for warn in w])
|
||||
|
||||
# Check that relationship increases
|
||||
is_increasing = y_[0] < y_[-1]
|
||||
assert is_increasing
|
||||
|
||||
|
||||
def test_assert_raises_exceptions():
|
||||
ir = IsotonicRegression()
|
||||
rng = np.random.RandomState(42)
|
||||
assert_raises(ValueError, ir.fit, [0, 1, 2], [5, 7, 3], [0.1, 0.6])
|
||||
assert_raises(ValueError, ir.fit, [0, 1, 2], [5, 7])
|
||||
assert_raises(ValueError, ir.fit, rng.randn(3, 10), [0, 1, 2])
|
||||
assert_raises(ValueError, ir.transform, rng.randn(3, 10))
|
||||
|
||||
|
||||
def test_isotonic_sample_weight_parameter_default_value():
|
||||
    # check that the default sample_weight is equivalent to unit weights
|
||||
ir = IsotonicRegression()
|
||||
# random test data
|
||||
rng = np.random.RandomState(42)
|
||||
n = 100
|
||||
x = np.arange(n)
|
||||
y = rng.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))
|
||||
# check if value is correctly used
|
||||
weights = np.ones(n)
|
||||
y_set_value = ir.fit_transform(x, y, sample_weight=weights)
|
||||
y_default_value = ir.fit_transform(x, y)
|
||||
|
||||
assert_array_equal(y_set_value, y_default_value)
|
||||
|
||||
|
||||
def test_isotonic_min_max_boundaries():
|
||||
    # check that y_min and y_max are used correctly
|
||||
ir = IsotonicRegression(y_min=2, y_max=4)
|
||||
n = 6
|
||||
x = np.arange(n)
|
||||
y = np.arange(n)
|
||||
y_test = [2, 2, 2, 3, 4, 4]
|
||||
y_result = np.round(ir.fit_transform(x, y))
|
||||
assert_array_equal(y_result, y_test)
|
||||
|
||||
|
||||
def test_isotonic_sample_weight():
|
||||
ir = IsotonicRegression()
|
||||
x = [1, 2, 3, 4, 5, 6, 7]
|
||||
y = [1, 41, 51, 1, 2, 5, 24]
|
||||
sample_weight = [1, 2, 3, 4, 5, 6, 7]
|
||||
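    # weighted PAVA pool over y[1:6]: (2*41 + 3*51 + 4*1 + 5*2 + 6*5) / (2 + 3 + 4 + 5 + 6) = 279 / 20 = 13.95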
expected_y = [1, 13.95, 13.95, 13.95, 13.95, 13.95, 24]
|
||||
received_y = ir.fit_transform(x, y, sample_weight=sample_weight)
|
||||
|
||||
assert_array_equal(expected_y, received_y)
|
||||
|
||||
|
||||
def test_isotonic_regression_oob_raise():
|
||||
# Set y and x
|
||||
y = np.array([3, 7, 5, 9, 8, 7, 10])
|
||||
x = np.arange(len(y))
|
||||
|
||||
# Create model and fit
|
||||
ir = IsotonicRegression(increasing='auto', out_of_bounds="raise")
|
||||
ir.fit(x, y)
|
||||
|
||||
# Check that an exception is thrown
|
||||
assert_raises(ValueError, ir.predict, [min(x) - 10, max(x) + 10])
|
||||
|
||||
|
||||
def test_isotonic_regression_oob_clip():
|
||||
# Set y and x
|
||||
y = np.array([3, 7, 5, 9, 8, 7, 10])
|
||||
x = np.arange(len(y))
|
||||
|
||||
# Create model and fit
|
||||
ir = IsotonicRegression(increasing='auto', out_of_bounds="clip")
|
||||
ir.fit(x, y)
|
||||
|
||||
# Predict from training and test x and check that min/max match.
|
||||
y1 = ir.predict([min(x) - 10, max(x) + 10])
|
||||
y2 = ir.predict(x)
|
||||
assert max(y1) == max(y2)
|
||||
assert min(y1) == min(y2)
|
||||
|
||||
|
||||
def test_isotonic_regression_oob_nan():
|
||||
# Set y and x
|
||||
y = np.array([3, 7, 5, 9, 8, 7, 10])
|
||||
x = np.arange(len(y))
|
||||
|
||||
# Create model and fit
|
||||
ir = IsotonicRegression(increasing='auto', out_of_bounds="nan")
|
||||
ir.fit(x, y)
|
||||
|
||||
# Predict from training and test x and check that we have two NaNs.
|
||||
y1 = ir.predict([min(x) - 10, max(x) + 10])
|
||||
assert sum(np.isnan(y1)) == 2
|
||||
|
||||
|
||||
def test_isotonic_regression_oob_bad():
|
||||
# Set y and x
|
||||
y = np.array([3, 7, 5, 9, 8, 7, 10])
|
||||
x = np.arange(len(y))
|
||||
|
||||
# Create model and fit
|
||||
ir = IsotonicRegression(increasing='auto', out_of_bounds="xyz")
|
||||
|
||||
# Make sure that we throw an error for bad out_of_bounds value
|
||||
assert_raises(ValueError, ir.fit, x, y)
|
||||
|
||||
|
||||
def test_isotonic_regression_oob_bad_after():
|
||||
# Set y and x
|
||||
y = np.array([3, 7, 5, 9, 8, 7, 10])
|
||||
x = np.arange(len(y))
|
||||
|
||||
# Create model and fit
|
||||
ir = IsotonicRegression(increasing='auto', out_of_bounds="raise")
|
||||
|
||||
# Make sure that we throw an error for bad out_of_bounds value in transform
|
||||
ir.fit(x, y)
|
||||
ir.out_of_bounds = "xyz"
|
||||
assert_raises(ValueError, ir.transform, x)
|
||||
|
||||
|
||||
def test_isotonic_regression_pickle():
|
||||
y = np.array([3, 7, 5, 9, 8, 7, 10])
|
||||
x = np.arange(len(y))
|
||||
|
||||
# Create model and fit
|
||||
ir = IsotonicRegression(increasing='auto', out_of_bounds="clip")
|
||||
ir.fit(x, y)
|
||||
|
||||
ir_ser = pickle.dumps(ir, pickle.HIGHEST_PROTOCOL)
|
||||
ir2 = pickle.loads(ir_ser)
|
||||
np.testing.assert_array_equal(ir.predict(x), ir2.predict(x))
|
||||
|
||||
|
||||
def test_isotonic_duplicate_min_entry():
|
||||
x = [0, 0, 1]
|
||||
y = [0, 0, 1]
|
||||
|
||||
ir = IsotonicRegression(increasing=True, out_of_bounds="clip")
|
||||
ir.fit(x, y)
|
||||
all_predictions_finite = np.all(np.isfinite(ir.predict(x)))
|
||||
assert all_predictions_finite
|
||||
|
||||
|
||||
def test_isotonic_ymin_ymax():
|
||||
# Test from @NelleV's issue:
|
||||
# https://github.com/scikit-learn/scikit-learn/issues/6921
|
||||
x = np.array([1.263, 1.318, -0.572, 0.307, -0.707, -0.176, -1.599, 1.059,
|
||||
1.396, 1.906, 0.210, 0.028, -0.081, 0.444, 0.018, -0.377,
|
||||
-0.896, -0.377, -1.327, 0.180])
|
||||
y = isotonic_regression(x, y_min=0., y_max=0.1)
|
||||
|
||||
assert np.all(y >= 0)
|
||||
assert np.all(y <= 0.1)
|
||||
|
||||
# Also test decreasing case since the logic there is different
|
||||
y = isotonic_regression(x, y_min=0., y_max=0.1, increasing=False)
|
||||
|
||||
assert np.all(y >= 0)
|
||||
assert np.all(y <= 0.1)
|
||||
|
||||
# Finally, test with only one bound
|
||||
y = isotonic_regression(x, y_min=0., increasing=False)
|
||||
|
||||
assert np.all(y >= 0)
|
||||
|
||||
|
||||
def test_isotonic_zero_weight_loop():
|
||||
# Test from @ogrisel's issue:
|
||||
# https://github.com/scikit-learn/scikit-learn/issues/4297
|
||||
|
||||
# Get deterministic RNG with seed
|
||||
rng = np.random.RandomState(42)
|
||||
|
||||
# Create regression and samples
|
||||
regression = IsotonicRegression()
|
||||
n_samples = 50
|
||||
x = np.linspace(-3, 3, n_samples)
|
||||
y = x + rng.uniform(size=n_samples)
|
||||
|
||||
# Get some random weights and zero out
|
||||
w = rng.uniform(size=n_samples)
|
||||
w[5:8] = 0
|
||||
regression.fit(x, y, sample_weight=w)
|
||||
|
||||
# This will hang in failure case.
|
||||
regression.fit(x, y, sample_weight=w)
|
||||
|
||||
|
||||
def test_fast_predict():
|
||||
# test that the faster prediction change doesn't
|
||||
# affect out-of-sample predictions:
|
||||
# https://github.com/scikit-learn/scikit-learn/pull/6206
|
||||
rng = np.random.RandomState(123)
|
||||
n_samples = 10 ** 3
|
||||
# X values over the -10,10 range
|
||||
X_train = 20.0 * rng.rand(n_samples) - 10
|
||||
y_train = np.less(rng.rand(n_samples),
|
||||
expit(X_train)).astype('int64').astype('float64')
|
||||
|
||||
weights = rng.rand(n_samples)
|
||||
# we also want to test that everything still works when some weights are 0
|
||||
weights[rng.rand(n_samples) < 0.1] = 0
|
||||
|
||||
slow_model = IsotonicRegression(y_min=0, y_max=1, out_of_bounds="clip")
|
||||
fast_model = IsotonicRegression(y_min=0, y_max=1, out_of_bounds="clip")
|
||||
|
||||
# Build interpolation function with ALL input data, not just the
|
||||
# non-redundant subset. The following 2 lines are taken from the
|
||||
# .fit() method, without removing unnecessary points
|
||||
X_train_fit, y_train_fit = slow_model._build_y(X_train, y_train,
|
||||
sample_weight=weights,
|
||||
trim_duplicates=False)
|
||||
slow_model._build_f(X_train_fit, y_train_fit)
|
||||
|
||||
# fit with just the necessary data
|
||||
fast_model.fit(X_train, y_train, sample_weight=weights)
|
||||
|
||||
X_test = 20.0 * rng.rand(n_samples) - 10
|
||||
y_pred_slow = slow_model.predict(X_test)
|
||||
y_pred_fast = fast_model.predict(X_test)
|
||||
|
||||
assert_array_equal(y_pred_slow, y_pred_fast)
|
||||
|
||||
|
||||
def test_isotonic_copy_before_fit():
|
||||
# https://github.com/scikit-learn/scikit-learn/issues/6628
|
||||
ir = IsotonicRegression()
|
||||
copy.copy(ir)
|
||||
|
||||
|
||||
def test_isotonic_dtype():
|
||||
y = [2, 1, 4, 3, 5]
|
||||
weights = np.array([.9, .9, .9, .9, .9], dtype=np.float64)
|
||||
reg = IsotonicRegression()
|
||||
|
||||
for dtype in (np.int32, np.int64, np.float32, np.float64):
|
||||
for sample_weight in (None, weights.astype(np.float32), weights):
|
||||
y_np = np.array(y, dtype=dtype)
|
||||
expected_dtype = \
|
||||
check_array(y_np, dtype=[np.float64, np.float32],
|
||||
ensure_2d=False).dtype
|
||||
|
||||
res = isotonic_regression(y_np, sample_weight=sample_weight)
|
||||
assert res.dtype == expected_dtype
|
||||
|
||||
X = np.arange(len(y)).astype(dtype)
|
||||
reg.fit(X, y_np, sample_weight=sample_weight)
|
||||
res = reg.predict(X)
|
||||
assert res.dtype == expected_dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"y_dtype", [np.int32, np.int64, np.float32, np.float64]
|
||||
)
|
||||
def test_isotonic_mismatched_dtype(y_dtype):
|
||||
# regression test for #15004
|
||||
# check that data are converted when X and y dtype differ
|
||||
reg = IsotonicRegression()
|
||||
y = np.array([2, 1, 4, 3, 5], dtype=y_dtype)
|
||||
X = np.arange(len(y), dtype=np.float32)
|
||||
reg.fit(X, y)
|
||||
assert reg.predict(X).dtype == X.dtype
|
||||
|
||||
|
||||
def test_make_unique_dtype():
|
||||
x_list = [2, 2, 2, 3, 5]
|
||||
for dtype in (np.float32, np.float64):
|
||||
x = np.array(x_list, dtype=dtype)
|
||||
y = x.copy()
|
||||
w = np.ones_like(x)
|
||||
x, y, w = _make_unique(x, y, w)
|
||||
assert_array_equal(x, [2, 3, 5])
|
|
@@ -0,0 +1,288 @@
|
|||
import numpy as np
|
||||
from scipy.sparse import csr_matrix
|
||||
import pytest
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal, assert_raises
|
||||
|
||||
from sklearn.metrics.pairwise import kernel_metrics
|
||||
from sklearn.kernel_approximation import RBFSampler
|
||||
from sklearn.kernel_approximation import AdditiveChi2Sampler
|
||||
from sklearn.kernel_approximation import SkewedChi2Sampler
|
||||
from sklearn.kernel_approximation import Nystroem
|
||||
from sklearn.metrics.pairwise import polynomial_kernel, rbf_kernel, chi2_kernel
|
||||
|
||||
# generate data
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.random_sample(size=(300, 50))
|
||||
Y = rng.random_sample(size=(300, 50))
|
||||
X /= X.sum(axis=1)[:, np.newaxis]
|
||||
Y /= Y.sum(axis=1)[:, np.newaxis]
|
||||
|
||||
|
||||
def _linear_kernel(X, Y):
|
||||
return np.dot(X, Y.T)
|
||||
|
||||
|
||||
def test_additive_chi2_sampler():
|
||||
# test that AdditiveChi2Sampler approximates kernel on random data
|
||||
|
||||
# compute exact kernel
|
||||
# abbreviations for easier formula
|
||||
X_ = X[:, np.newaxis, :]
|
||||
Y_ = Y[np.newaxis, :, :]
|
||||
|
||||
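    # per-feature additive chi2 kernel: k_i(x, y) = 2 * x_i * y_i / (x_i + y_i)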
large_kernel = 2 * X_ * Y_ / (X_ + Y_)
|
||||
|
||||
# reduce to n_samples_x x n_samples_y by summing over features
|
||||
kernel = (large_kernel.sum(axis=2))
|
||||
|
||||
# approximate kernel mapping
|
||||
transform = AdditiveChi2Sampler(sample_steps=3)
|
||||
X_trans = transform.fit_transform(X)
|
||||
Y_trans = transform.transform(Y)
|
||||
|
||||
kernel_approx = np.dot(X_trans, Y_trans.T)
|
||||
|
||||
assert_array_almost_equal(kernel, kernel_approx, 1)
|
||||
|
||||
X_sp_trans = transform.fit_transform(csr_matrix(X))
|
||||
Y_sp_trans = transform.transform(csr_matrix(Y))
|
||||
|
||||
assert_array_equal(X_trans, X_sp_trans.A)
|
||||
assert_array_equal(Y_trans, Y_sp_trans.A)
|
||||
|
||||
# test error is raised on negative input
|
||||
Y_neg = Y.copy()
|
||||
Y_neg[0, 0] = -1
|
||||
assert_raises(ValueError, transform.transform, Y_neg)
|
||||
|
||||
# test error on invalid sample_steps
|
||||
transform = AdditiveChi2Sampler(sample_steps=4)
|
||||
assert_raises(ValueError, transform.fit, X)
|
||||
|
||||
# test that the sample interval is set correctly
|
||||
sample_steps_available = [1, 2, 3]
|
||||
for sample_steps in sample_steps_available:
|
||||
|
||||
# test that the sample_interval is initialized correctly
|
||||
transform = AdditiveChi2Sampler(sample_steps=sample_steps)
|
||||
assert transform.sample_interval is None
|
||||
|
||||
# test that the sample_interval is changed in the fit method
|
||||
transform.fit(X)
|
||||
assert transform.sample_interval_ is not None
|
||||
|
||||
# test that the sample_interval is set correctly
|
||||
sample_interval = 0.3
|
||||
transform = AdditiveChi2Sampler(sample_steps=4,
|
||||
sample_interval=sample_interval)
|
||||
assert transform.sample_interval == sample_interval
|
||||
transform.fit(X)
|
||||
assert transform.sample_interval_ == sample_interval
|
||||
|
||||
|
||||
def test_skewed_chi2_sampler():
|
||||
    # test that SkewedChi2Sampler approximates the kernel on random data
|
||||
|
||||
# compute exact kernel
|
||||
c = 0.03
|
||||
    # set one negative component, but greater than -c, to ensure that the kernel
|
||||
# approximation is valid on the group (-c; +\infty) endowed with the skewed
|
||||
# multiplication.
|
||||
Y[0, 0] = -c / 2.
|
||||
|
||||
# abbreviations for easier formula
|
||||
X_c = (X + c)[:, np.newaxis, :]
|
||||
Y_c = (Y + c)[np.newaxis, :, :]
|
||||
|
||||
# we do it in log-space in the hope that it's more stable
|
||||
# this array is n_samples_x x n_samples_y big x n_features
|
||||
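    # per feature: log k_i = 0.5*log(x_i + c) + 0.5*log(y_i + c) + log(2) - log(x_i + y_i + 2c)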
log_kernel = ((np.log(X_c) / 2.) + (np.log(Y_c) / 2.) + np.log(2.) -
|
||||
np.log(X_c + Y_c))
|
||||
# reduce to n_samples_x x n_samples_y by summing over features in log-space
|
||||
kernel = np.exp(log_kernel.sum(axis=2))
|
||||
|
||||
# approximate kernel mapping
|
||||
transform = SkewedChi2Sampler(skewedness=c, n_components=1000,
|
||||
random_state=42)
|
||||
X_trans = transform.fit_transform(X)
|
||||
Y_trans = transform.transform(Y)
|
||||
|
||||
kernel_approx = np.dot(X_trans, Y_trans.T)
|
||||
assert_array_almost_equal(kernel, kernel_approx, 1)
|
||||
assert np.isfinite(kernel).all(), \
|
||||
'NaNs found in the Gram matrix'
|
||||
assert np.isfinite(kernel_approx).all(), \
|
||||
'NaNs found in the approximate Gram matrix'
|
||||
|
||||
    # test that an error is raised when the input contains values smaller than -c
|
||||
Y_neg = Y.copy()
|
||||
Y_neg[0, 0] = -c * 2.
|
||||
assert_raises(ValueError, transform.transform, Y_neg)
|
||||
|
||||
|
||||
def test_additive_chi2_sampler_exceptions():
|
||||
"""Ensures correct error message"""
|
||||
transformer = AdditiveChi2Sampler()
|
||||
X_neg = X.copy()
|
||||
X_neg[0, 0] = -1
|
||||
with pytest.raises(ValueError, match="X in AdditiveChi2Sampler.fit"):
|
||||
transformer.fit(X_neg)
|
||||
with pytest.raises(ValueError, match="X in AdditiveChi2Sampler.transform"):
|
||||
transformer.fit(X)
|
||||
transformer.transform(X_neg)
|
||||
|
||||
|
||||
def test_rbf_sampler():
|
||||
# test that RBFSampler approximates kernel on random data
|
||||
# compute exact kernel
|
||||
gamma = 10.
|
||||
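    # exact RBF kernel: k(x, y) = exp(-gamma * ||x - y||^2)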
kernel = rbf_kernel(X, Y, gamma=gamma)
|
||||
|
||||
# approximate kernel mapping
|
||||
rbf_transform = RBFSampler(gamma=gamma, n_components=1000, random_state=42)
|
||||
X_trans = rbf_transform.fit_transform(X)
|
||||
Y_trans = rbf_transform.transform(Y)
|
||||
kernel_approx = np.dot(X_trans, Y_trans.T)
|
||||
|
||||
error = kernel - kernel_approx
|
||||
assert np.abs(np.mean(error)) <= 0.01 # close to unbiased
|
||||
np.abs(error, out=error)
|
||||
assert np.max(error) <= 0.1 # nothing too far off
|
||||
assert np.mean(error) <= 0.05 # mean is fairly close
|
||||
|
||||
|
||||
def test_input_validation():
|
||||
# Regression test: kernel approx. transformers should work on lists
|
||||
# No assertions; the old versions would simply crash
|
||||
X = [[1, 2], [3, 4], [5, 6]]
|
||||
AdditiveChi2Sampler().fit(X).transform(X)
|
||||
SkewedChi2Sampler().fit(X).transform(X)
|
||||
RBFSampler().fit(X).transform(X)
|
||||
|
||||
X = csr_matrix(X)
|
||||
RBFSampler().fit(X).transform(X)
|
||||
|
||||
|
||||
def test_nystroem_approximation():
|
||||
# some basic tests
|
||||
rnd = np.random.RandomState(0)
|
||||
X = rnd.uniform(size=(10, 4))
|
||||
|
||||
# With n_components = n_samples this is exact
|
||||
X_transformed = Nystroem(n_components=X.shape[0]).fit_transform(X)
|
||||
K = rbf_kernel(X)
|
||||
assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)
|
||||
|
||||
trans = Nystroem(n_components=2, random_state=rnd)
|
||||
X_transformed = trans.fit(X).transform(X)
|
||||
assert X_transformed.shape == (X.shape[0], 2)
|
||||
|
||||
# test callable kernel
|
||||
trans = Nystroem(n_components=2, kernel=_linear_kernel, random_state=rnd)
|
||||
X_transformed = trans.fit(X).transform(X)
|
||||
assert X_transformed.shape == (X.shape[0], 2)
|
||||
|
||||
# test that available kernels fit and transform
|
||||
kernels_available = kernel_metrics()
|
||||
for kern in kernels_available:
|
||||
trans = Nystroem(n_components=2, kernel=kern, random_state=rnd)
|
||||
X_transformed = trans.fit(X).transform(X)
|
||||
assert X_transformed.shape == (X.shape[0], 2)
|
||||
|
||||
|
||||
def test_nystroem_default_parameters():
|
||||
rnd = np.random.RandomState(42)
|
||||
X = rnd.uniform(size=(10, 4))
|
||||
|
||||
# rbf kernel should behave as gamma=None by default
|
||||
# aka gamma = 1 / n_features
|
||||
nystroem = Nystroem(n_components=10)
|
||||
X_transformed = nystroem.fit_transform(X)
|
||||
K = rbf_kernel(X, gamma=None)
|
||||
K2 = np.dot(X_transformed, X_transformed.T)
|
||||
assert_array_almost_equal(K, K2)
|
||||
|
||||
# chi2 kernel should behave as gamma=1 by default
|
||||
nystroem = Nystroem(kernel='chi2', n_components=10)
|
||||
X_transformed = nystroem.fit_transform(X)
|
||||
K = chi2_kernel(X, gamma=1)
|
||||
K2 = np.dot(X_transformed, X_transformed.T)
|
||||
assert_array_almost_equal(K, K2)
|
||||
|
||||
|
||||
def test_nystroem_singular_kernel():
|
||||
# test that nystroem works with singular kernel matrix
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.rand(10, 20)
|
||||
X = np.vstack([X] * 2) # duplicate samples
|
||||
|
||||
gamma = 100
|
||||
N = Nystroem(gamma=gamma, n_components=X.shape[0]).fit(X)
|
||||
X_transformed = N.transform(X)
|
||||
|
||||
K = rbf_kernel(X, gamma=gamma)
|
||||
|
||||
assert_array_almost_equal(K, np.dot(X_transformed, X_transformed.T))
|
||||
    assert np.all(np.isfinite(X_transformed))
|
||||
|
||||
|
||||
def test_nystroem_poly_kernel_params():
|
||||
    # Non-regression: Nystroem should pass other parameters besides gamma.
|
||||
rnd = np.random.RandomState(37)
|
||||
X = rnd.uniform(size=(10, 4))
|
||||
|
||||
K = polynomial_kernel(X, degree=3.1, coef0=.1)
|
||||
nystroem = Nystroem(kernel="polynomial", n_components=X.shape[0],
|
||||
degree=3.1, coef0=.1)
|
||||
X_transformed = nystroem.fit_transform(X)
|
||||
assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)
|
||||
|
||||
|
||||
def test_nystroem_callable():
|
||||
# Test Nystroem on a callable.
|
||||
rnd = np.random.RandomState(42)
|
||||
n_samples = 10
|
||||
X = rnd.uniform(size=(n_samples, 4))
|
||||
|
||||
def logging_histogram_kernel(x, y, log):
|
||||
"""Histogram kernel that writes to a log."""
|
||||
log.append(1)
|
||||
return np.minimum(x, y).sum()
|
||||
|
||||
kernel_log = []
|
||||
X = list(X) # test input validation
|
||||
Nystroem(kernel=logging_histogram_kernel,
|
||||
n_components=(n_samples - 1),
|
||||
kernel_params={'log': kernel_log}).fit(X)
|
||||
assert len(kernel_log) == n_samples * (n_samples - 1) / 2
|
||||
|
||||
    # if degree, gamma or coef0 is passed, we raise a ValueError
|
||||
msg = "Don't pass gamma, coef0 or degree to Nystroem"
|
||||
params = ({'gamma': 1}, {'coef0': 1}, {'degree': 2})
|
||||
for param in params:
|
||||
ny = Nystroem(kernel=_linear_kernel, **param)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
ny.fit(X)
|
||||
|
||||
|
||||
def test_nystroem_precomputed_kernel():
|
||||
# Non-regression: test Nystroem on precomputed kernel.
|
||||
# PR - 14706
|
||||
rnd = np.random.RandomState(12)
|
||||
X = rnd.uniform(size=(10, 4))
|
||||
|
||||
K = polynomial_kernel(X, degree=2, coef0=.1)
|
||||
nystroem = Nystroem(kernel='precomputed', n_components=X.shape[0])
|
||||
X_transformed = nystroem.fit_transform(K)
|
||||
assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)
|
||||
|
||||
# if degree, gamma or coef0 is passed, we raise a ValueError
|
||||
msg = "Don't pass gamma, coef0 or degree to Nystroem"
|
||||
params = ({'gamma': 1}, {'coef0': 1}, {'degree': 2})
|
||||
for param in params:
|
||||
ny = Nystroem(kernel='precomputed', n_components=X.shape[0],
|
||||
**param)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
ny.fit(K)
|
85
venv/Lib/site-packages/sklearn/tests/test_kernel_ridge.py
Normal file
|
@@ -0,0 +1,85 @@
|
|||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
|
||||
from sklearn.datasets import make_regression
|
||||
from sklearn.linear_model import Ridge
|
||||
from sklearn.kernel_ridge import KernelRidge
|
||||
from sklearn.metrics.pairwise import pairwise_kernels
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
|
||||
|
||||
X, y = make_regression(n_features=10, random_state=0)
|
||||
Xcsr = sp.csr_matrix(X)
|
||||
Xcsc = sp.csc_matrix(X)
|
||||
Y = np.array([y, y]).T
|
||||
|
||||
|
||||
def test_kernel_ridge():
|
||||
pred = Ridge(alpha=1, fit_intercept=False).fit(X, y).predict(X)
|
||||
pred2 = KernelRidge(kernel="linear", alpha=1).fit(X, y).predict(X)
|
||||
assert_array_almost_equal(pred, pred2)
|
||||
|
||||
|
||||
def test_kernel_ridge_csr():
|
||||
pred = Ridge(alpha=1, fit_intercept=False,
|
||||
solver="cholesky").fit(Xcsr, y).predict(Xcsr)
|
||||
pred2 = KernelRidge(kernel="linear", alpha=1).fit(Xcsr, y).predict(Xcsr)
|
||||
assert_array_almost_equal(pred, pred2)
|
||||
|
||||
|
||||
def test_kernel_ridge_csc():
|
||||
pred = Ridge(alpha=1, fit_intercept=False,
|
||||
solver="cholesky").fit(Xcsc, y).predict(Xcsc)
|
||||
pred2 = KernelRidge(kernel="linear", alpha=1).fit(Xcsc, y).predict(Xcsc)
|
||||
assert_array_almost_equal(pred, pred2)
|
||||
|
||||
|
||||
def test_kernel_ridge_singular_kernel():
|
||||
# alpha=0 causes a LinAlgError in computing the dual coefficients,
|
||||
# which causes a fallback to a lstsq solver. This is tested here.
|
||||
pred = Ridge(alpha=0, fit_intercept=False).fit(X, y).predict(X)
|
||||
kr = KernelRidge(kernel="linear", alpha=0)
|
||||
ignore_warnings(kr.fit)(X, y)
|
||||
pred2 = kr.predict(X)
|
||||
assert_array_almost_equal(pred, pred2)
|
||||
|
||||
|
||||
def test_kernel_ridge_precomputed():
|
||||
for kernel in ["linear", "rbf", "poly", "cosine"]:
|
||||
K = pairwise_kernels(X, X, metric=kernel)
|
||||
pred = KernelRidge(kernel=kernel).fit(X, y).predict(X)
|
||||
pred2 = KernelRidge(kernel="precomputed").fit(K, y).predict(K)
|
||||
assert_array_almost_equal(pred, pred2)
|
||||
|
||||
|
||||
def test_kernel_ridge_precomputed_kernel_unchanged():
|
||||
K = np.dot(X, X.T)
|
||||
K2 = K.copy()
|
||||
KernelRidge(kernel="precomputed").fit(K, y)
|
||||
assert_array_almost_equal(K, K2)
|
||||
|
||||
|
||||
def test_kernel_ridge_sample_weights():
|
||||
K = np.dot(X, X.T) # precomputed kernel
|
||||
sw = np.random.RandomState(0).rand(X.shape[0])
|
||||
|
||||
pred = Ridge(alpha=1,
|
||||
fit_intercept=False).fit(X, y, sample_weight=sw).predict(X)
|
||||
pred2 = KernelRidge(kernel="linear",
|
||||
alpha=1).fit(X, y, sample_weight=sw).predict(X)
|
||||
pred3 = KernelRidge(kernel="precomputed",
|
||||
alpha=1).fit(K, y, sample_weight=sw).predict(K)
|
||||
assert_array_almost_equal(pred, pred2)
|
||||
assert_array_almost_equal(pred, pred3)
|
||||
|
||||
|
||||
def test_kernel_ridge_multi_output():
|
||||
pred = Ridge(alpha=1, fit_intercept=False).fit(X, Y).predict(X)
|
||||
pred2 = KernelRidge(kernel="linear", alpha=1).fit(X, Y).predict(X)
|
||||
assert_array_almost_equal(pred, pred2)
|
||||
|
||||
pred3 = KernelRidge(kernel="linear", alpha=1).fit(X, y).predict(X)
|
||||
pred3 = np.array([pred3, pred3]).T
|
||||
assert_array_almost_equal(pred2, pred3)
|
147
venv/Lib/site-packages/sklearn/tests/test_metaestimators.py
Normal file
|
@@ -0,0 +1,147 @@
|
|||
"""Common tests for metaestimators"""
|
||||
import functools
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.datasets import make_classification
|
||||
|
||||
from sklearn.utils._testing import assert_raises
|
||||
from sklearn.utils.validation import check_is_fitted
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
|
||||
from sklearn.feature_selection import RFE, RFECV
|
||||
from sklearn.ensemble import BaggingClassifier
|
||||
from sklearn.exceptions import NotFittedError
|
||||
|
||||
|
||||
class DelegatorData:
|
||||
def __init__(self, name, construct, skip_methods=(),
|
||||
fit_args=make_classification()):
|
||||
self.name = name
|
||||
self.construct = construct
|
||||
self.fit_args = fit_args
|
||||
self.skip_methods = skip_methods
|
||||
|
||||
|
||||
DELEGATING_METAESTIMATORS = [
|
||||
DelegatorData('Pipeline', lambda est: Pipeline([('est', est)])),
|
||||
DelegatorData('GridSearchCV',
|
||||
lambda est: GridSearchCV(
|
||||
est, param_grid={'param': [5]}, cv=2),
|
||||
skip_methods=['score']),
|
||||
DelegatorData('RandomizedSearchCV',
|
||||
lambda est: RandomizedSearchCV(
|
||||
est, param_distributions={'param': [5]}, cv=2, n_iter=1),
|
||||
skip_methods=['score']),
|
||||
DelegatorData('RFE', RFE,
|
||||
skip_methods=['transform', 'inverse_transform']),
|
||||
DelegatorData('RFECV', RFECV,
|
||||
skip_methods=['transform', 'inverse_transform']),
|
||||
DelegatorData('BaggingClassifier', BaggingClassifier,
|
||||
skip_methods=['transform', 'inverse_transform', 'score',
|
||||
'predict_proba', 'predict_log_proba',
|
||||
'predict'])
|
||||
]
|
||||
|
||||
|
||||
def test_metaestimator_delegation():
|
||||
# Ensures specified metaestimators have methods iff subestimator does
|
||||
def hides(method):
|
||||
@property
|
||||
def wrapper(obj):
|
||||
if obj.hidden_method == method.__name__:
|
||||
raise AttributeError('%r is hidden' % obj.hidden_method)
|
||||
return functools.partial(method, obj)
|
||||
return wrapper
|
||||
|
||||
class SubEstimator(BaseEstimator):
|
||||
def __init__(self, param=1, hidden_method=None):
|
||||
self.param = param
|
||||
self.hidden_method = hidden_method
|
||||
|
||||
def fit(self, X, y=None, *args, **kwargs):
|
||||
self.coef_ = np.arange(X.shape[1])
|
||||
return True
|
||||
|
||||
def _check_fit(self):
|
||||
check_is_fitted(self)
|
||||
|
||||
@hides
|
||||
def inverse_transform(self, X, *args, **kwargs):
|
||||
self._check_fit()
|
||||
return X
|
||||
|
||||
@hides
|
||||
def transform(self, X, *args, **kwargs):
|
||||
self._check_fit()
|
||||
return X
|
||||
|
||||
@hides
|
||||
def predict(self, X, *args, **kwargs):
|
||||
self._check_fit()
|
||||
return np.ones(X.shape[0])
|
||||
|
||||
@hides
|
||||
def predict_proba(self, X, *args, **kwargs):
|
||||
self._check_fit()
|
||||
return np.ones(X.shape[0])
|
||||
|
||||
@hides
|
||||
def predict_log_proba(self, X, *args, **kwargs):
|
||||
self._check_fit()
|
||||
return np.ones(X.shape[0])
|
||||
|
||||
@hides
|
||||
def decision_function(self, X, *args, **kwargs):
|
||||
self._check_fit()
|
||||
return np.ones(X.shape[0])
|
||||
|
||||
@hides
|
||||
def score(self, X, y, *args, **kwargs):
|
||||
self._check_fit()
|
||||
return 1.0
|
||||
|
||||
methods = [k for k in SubEstimator.__dict__.keys()
|
||||
if not k.startswith('_') and not k.startswith('fit')]
|
||||
methods.sort()
|
||||
|
||||
for delegator_data in DELEGATING_METAESTIMATORS:
|
||||
delegate = SubEstimator()
|
||||
delegator = delegator_data.construct(delegate)
|
||||
for method in methods:
|
||||
if method in delegator_data.skip_methods:
|
||||
continue
|
||||
assert hasattr(delegate, method)
|
||||
assert hasattr(delegator, method), (
|
||||
"%s does not have method %r when its delegate does"
|
||||
% (delegator_data.name, method))
|
||||
# delegation before fit raises a NotFittedError
|
||||
if method == 'score':
|
||||
assert_raises(NotFittedError, getattr(delegator, method),
|
||||
delegator_data.fit_args[0],
|
||||
delegator_data.fit_args[1])
|
||||
else:
|
||||
assert_raises(NotFittedError, getattr(delegator, method),
|
||||
delegator_data.fit_args[0])
|
||||
|
||||
delegator.fit(*delegator_data.fit_args)
|
||||
for method in methods:
|
||||
if method in delegator_data.skip_methods:
|
||||
continue
|
||||
# smoke test delegation
|
||||
if method == 'score':
|
||||
getattr(delegator, method)(delegator_data.fit_args[0],
|
||||
delegator_data.fit_args[1])
|
||||
else:
|
||||
getattr(delegator, method)(delegator_data.fit_args[0])
|
||||
|
||||
for method in methods:
|
||||
if method in delegator_data.skip_methods:
|
||||
continue
|
||||
delegate = SubEstimator(hidden_method=method)
|
||||
delegator = delegator_data.construct(delegate)
|
||||
assert not hasattr(delegate, method)
|
||||
assert not hasattr(delegator, method), (
|
||||
"%s has method %r when its delegate does not"
|
||||
% (delegator_data.name, method))
|
749
venv/Lib/site-packages/sklearn/tests/test_multiclass.py
Normal file
|
@@ -0,0 +1,749 @@
|
|||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
|
||||
from re import escape
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_almost_equal
|
||||
from sklearn.utils._testing import assert_raises
|
||||
from sklearn.utils._testing import assert_warns
|
||||
from sklearn.utils._testing import assert_raise_message
|
||||
from sklearn.utils._testing import assert_raises_regexp
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.multiclass import OneVsOneClassifier
|
||||
from sklearn.multiclass import OutputCodeClassifier
|
||||
from sklearn.utils.multiclass import (check_classification_targets,
|
||||
type_of_target)
|
||||
from sklearn.utils import shuffle
|
||||
|
||||
from sklearn.metrics import precision_score
|
||||
from sklearn.metrics import recall_score
|
||||
|
||||
from sklearn.svm import LinearSVC, SVC
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
from sklearn.linear_model import (LinearRegression, Lasso, ElasticNet, Ridge,
|
||||
Perceptron, LogisticRegression,
|
||||
SGDClassifier)
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
from sklearn.model_selection import GridSearchCV, cross_val_score
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn import svm
|
||||
from sklearn import datasets
|
||||
|
||||
iris = datasets.load_iris()
|
||||
rng = np.random.RandomState(0)
|
||||
perm = rng.permutation(iris.target.size)
|
||||
iris.data = iris.data[perm]
|
||||
iris.target = iris.target[perm]
|
||||
n_classes = 3
|
||||
|
||||
|
||||
def test_ovr_exceptions():
|
||||
ovr = OneVsRestClassifier(LinearSVC(random_state=0))
|
||||
assert_raises(ValueError, ovr.predict, [])
|
||||
|
||||
# Fail on multioutput data
|
||||
assert_raises(ValueError, OneVsRestClassifier(MultinomialNB()).fit,
|
||||
np.array([[1, 0], [0, 1]]),
|
||||
np.array([[1, 2], [3, 1]]))
|
||||
assert_raises(ValueError, OneVsRestClassifier(MultinomialNB()).fit,
|
||||
np.array([[1, 0], [0, 1]]),
|
||||
np.array([[1.5, 2.4], [3.1, 0.8]]))
|
||||
|
||||
|
||||
def test_check_classification_targets():
|
||||
    # Test that check_classification_targets raises a ValueError naming the offending target type. #5782
|
||||
y = np.array([0.0, 1.1, 2.0, 3.0])
|
||||
msg = type_of_target(y)
|
||||
assert_raise_message(ValueError, msg, check_classification_targets, y)
|
||||
|
||||
|
||||
def test_ovr_fit_predict():
|
||||
# A classifier which implements decision_function.
|
||||
ovr = OneVsRestClassifier(LinearSVC(random_state=0))
|
||||
pred = ovr.fit(iris.data, iris.target).predict(iris.data)
|
||||
assert len(ovr.estimators_) == n_classes
|
||||
|
||||
clf = LinearSVC(random_state=0)
|
||||
pred2 = clf.fit(iris.data, iris.target).predict(iris.data)
|
||||
assert np.mean(iris.target == pred) == np.mean(iris.target == pred2)
|
||||
|
||||
# A classifier which implements predict_proba.
|
||||
ovr = OneVsRestClassifier(MultinomialNB())
|
||||
pred = ovr.fit(iris.data, iris.target).predict(iris.data)
|
||||
assert np.mean(iris.target == pred) > 0.65
|
||||
|
||||
|
||||
def test_ovr_partial_fit():
|
||||
# Test if partial_fit is working as intended
|
||||
X, y = shuffle(iris.data, iris.target, random_state=0)
|
||||
ovr = OneVsRestClassifier(MultinomialNB())
|
||||
ovr.partial_fit(X[:100], y[:100], np.unique(y))
|
||||
ovr.partial_fit(X[100:], y[100:])
|
||||
pred = ovr.predict(X)
|
||||
ovr2 = OneVsRestClassifier(MultinomialNB())
|
||||
pred2 = ovr2.fit(X, y).predict(X)
|
||||
|
||||
assert_almost_equal(pred, pred2)
|
||||
assert len(ovr.estimators_) == len(np.unique(y))
|
||||
assert np.mean(y == pred) > 0.65
|
||||
|
||||
    # Test when mini-batches don't have all classes
|
||||
# with SGDClassifier
|
||||
X = np.abs(np.random.randn(14, 2))
|
||||
y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]
|
||||
|
||||
ovr = OneVsRestClassifier(SGDClassifier(max_iter=1, tol=None,
|
||||
shuffle=False, random_state=0))
|
||||
ovr.partial_fit(X[:7], y[:7], np.unique(y))
|
||||
ovr.partial_fit(X[7:], y[7:])
|
||||
pred = ovr.predict(X)
|
||||
ovr1 = OneVsRestClassifier(SGDClassifier(max_iter=1, tol=None,
|
||||
shuffle=False, random_state=0))
|
||||
pred1 = ovr1.fit(X, y).predict(X)
|
||||
assert np.mean(pred == y) == np.mean(pred1 == y)
|
||||
|
||||
# test partial_fit only exists if estimator has it:
|
||||
ovr = OneVsRestClassifier(SVC())
|
||||
assert not hasattr(ovr, "partial_fit")
|
||||
|
||||
|
||||
def test_ovr_partial_fit_exceptions():
|
||||
ovr = OneVsRestClassifier(MultinomialNB())
|
||||
X = np.abs(np.random.randn(14, 2))
|
||||
y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]
|
||||
ovr.partial_fit(X[:7], y[:7], np.unique(y))
|
||||
# A new class value which was not in the first call of partial_fit
|
||||
# It should raise ValueError
|
||||
y1 = [5] + y[7:-1]
|
||||
assert_raises_regexp(ValueError, r"Mini-batch contains \[.+\] while "
|
||||
r"classes must be subset of \[.+\]",
|
||||
ovr.partial_fit, X=X[7:], y=y1)
|
||||
|
||||
|
||||
def test_ovr_ovo_regressor():
|
||||
# test that ovr and ovo work on regressors which don't have a decision_
|
||||
# function
|
||||
ovr = OneVsRestClassifier(DecisionTreeRegressor())
|
||||
pred = ovr.fit(iris.data, iris.target).predict(iris.data)
|
||||
assert len(ovr.estimators_) == n_classes
|
||||
assert_array_equal(np.unique(pred), [0, 1, 2])
|
||||
# we are doing something sensible
|
||||
assert np.mean(pred == iris.target) > .9
|
||||
|
||||
ovr = OneVsOneClassifier(DecisionTreeRegressor())
|
||||
pred = ovr.fit(iris.data, iris.target).predict(iris.data)
|
||||
assert len(ovr.estimators_) == n_classes * (n_classes - 1) / 2
|
||||
assert_array_equal(np.unique(pred), [0, 1, 2])
|
||||
# we are doing something sensible
|
||||
assert np.mean(pred == iris.target) > .9
|
||||
|
||||
|
||||
def test_ovr_fit_predict_sparse():
|
||||
for sparse in [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix,
|
||||
sp.lil_matrix]:
|
||||
base_clf = MultinomialNB(alpha=1)
|
||||
|
||||
X, Y = datasets.make_multilabel_classification(n_samples=100,
|
||||
n_features=20,
|
||||
n_classes=5,
|
||||
n_labels=3,
|
||||
length=50,
|
||||
allow_unlabeled=True,
|
||||
random_state=0)
|
||||
|
||||
X_train, Y_train = X[:80], Y[:80]
|
||||
X_test = X[80:]
|
||||
|
||||
clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
|
||||
Y_pred = clf.predict(X_test)
|
||||
|
||||
clf_sprs = OneVsRestClassifier(base_clf).fit(X_train, sparse(Y_train))
|
||||
Y_pred_sprs = clf_sprs.predict(X_test)
|
||||
|
||||
assert clf.multilabel_
|
||||
assert sp.issparse(Y_pred_sprs)
|
||||
assert_array_equal(Y_pred_sprs.toarray(), Y_pred)
|
||||
|
||||
# Test predict_proba
|
||||
Y_proba = clf_sprs.predict_proba(X_test)
|
||||
|
||||
# predict assigns a label if the probability that the
|
||||
# sample has the label is greater than 0.5.
|
||||
pred = Y_proba > .5
|
||||
assert_array_equal(pred, Y_pred_sprs.toarray())
|
||||
|
||||
# Test decision_function
|
||||
clf = svm.SVC()
|
||||
clf_sprs = OneVsRestClassifier(clf).fit(X_train, sparse(Y_train))
|
||||
dec_pred = (clf_sprs.decision_function(X_test) > 0).astype(int)
|
||||
assert_array_equal(dec_pred, clf_sprs.predict(X_test).toarray())
|
||||
|
||||
|
||||
def test_ovr_always_present():
|
||||
# Test that ovr works with classes that are always present or absent.
|
||||
    # Note: this tests the case where _ConstantPredictor is utilised
|
||||
X = np.ones((10, 2))
|
||||
X[:5, :] = 0
|
||||
|
||||
# Build an indicator matrix where two features are always on.
|
||||
# As list of lists, it would be: [[int(i >= 5), 2, 3] for i in range(10)]
|
||||
y = np.zeros((10, 3))
|
||||
y[5:, 0] = 1
|
||||
y[:, 1] = 1
|
||||
y[:, 2] = 1
|
||||
|
||||
ovr = OneVsRestClassifier(LogisticRegression())
|
||||
assert_warns(UserWarning, ovr.fit, X, y)
|
||||
y_pred = ovr.predict(X)
|
||||
assert_array_equal(np.array(y_pred), np.array(y))
|
||||
y_pred = ovr.decision_function(X)
|
||||
assert np.unique(y_pred[:, -2:]) == 1
|
||||
y_pred = ovr.predict_proba(X)
|
||||
assert_array_equal(y_pred[:, -1], np.ones(X.shape[0]))
|
||||
|
||||
# y has a constantly absent label
|
||||
y = np.zeros((10, 2))
|
||||
y[5:, 0] = 1 # variable label
|
||||
ovr = OneVsRestClassifier(LogisticRegression())
|
||||
assert_warns(UserWarning, ovr.fit, X, y)
|
||||
y_pred = ovr.predict_proba(X)
|
||||
assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0]))
|
||||
|
||||
|
||||
def test_ovr_multiclass():
|
||||
# Toy dataset where features correspond directly to labels.
|
||||
X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]])
|
||||
y = ["eggs", "spam", "ham", "eggs", "ham"]
|
||||
Y = np.array([[0, 0, 1],
|
||||
[0, 1, 0],
|
||||
[1, 0, 0],
|
||||
[0, 0, 1],
|
||||
[1, 0, 0]])
|
||||
|
||||
classes = set("ham eggs spam".split())
|
||||
|
||||
for base_clf in (MultinomialNB(), LinearSVC(random_state=0),
|
||||
LinearRegression(), Ridge(),
|
||||
ElasticNet()):
|
||||
clf = OneVsRestClassifier(base_clf).fit(X, y)
|
||||
assert set(clf.classes_) == classes
|
||||
y_pred = clf.predict(np.array([[0, 0, 4]]))[0]
|
||||
assert_array_equal(y_pred, ["eggs"])
|
||||
|
||||
# test input as label indicator matrix
|
||||
clf = OneVsRestClassifier(base_clf).fit(X, Y)
|
||||
y_pred = clf.predict([[0, 0, 4]])[0]
|
||||
assert_array_equal(y_pred, [0, 0, 1])
|
||||
|
||||
|
||||
def test_ovr_binary():
|
||||
# Toy dataset where features correspond directly to labels.
|
||||
X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]])
|
||||
y = ["eggs", "spam", "spam", "eggs", "spam"]
|
||||
Y = np.array([[0, 1, 1, 0, 1]]).T
|
||||
|
||||
classes = set("eggs spam".split())
|
||||
|
||||
def conduct_test(base_clf, test_predict_proba=False):
|
||||
clf = OneVsRestClassifier(base_clf).fit(X, y)
|
||||
assert set(clf.classes_) == classes
|
||||
y_pred = clf.predict(np.array([[0, 0, 4]]))[0]
|
||||
assert_array_equal(y_pred, ["eggs"])
|
||||
if hasattr(base_clf, 'decision_function'):
|
||||
dec = clf.decision_function(X)
|
||||
assert dec.shape == (5,)
|
||||
|
||||
if test_predict_proba:
|
||||
X_test = np.array([[0, 0, 4]])
|
||||
probabilities = clf.predict_proba(X_test)
|
||||
assert 2 == len(probabilities[0])
|
||||
assert (clf.classes_[np.argmax(probabilities, axis=1)] ==
|
||||
clf.predict(X_test))
|
||||
|
||||
# test input as label indicator matrix
|
||||
clf = OneVsRestClassifier(base_clf).fit(X, Y)
|
||||
y_pred = clf.predict([[3, 0, 0]])[0]
|
||||
assert y_pred == 1
|
||||
|
||||
for base_clf in (LinearSVC(random_state=0), LinearRegression(),
|
||||
Ridge(), ElasticNet()):
|
||||
conduct_test(base_clf)
|
||||
|
||||
for base_clf in (MultinomialNB(), SVC(probability=True),
|
||||
LogisticRegression()):
|
||||
conduct_test(base_clf, test_predict_proba=True)
|
||||
|
||||
|
||||
def test_ovr_multilabel():
|
||||
# Toy dataset where features correspond directly to labels.
|
||||
X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]])
|
||||
y = np.array([[0, 1, 1],
|
||||
[0, 1, 0],
|
||||
[1, 1, 1],
|
||||
[1, 0, 1],
|
||||
[1, 0, 0]])
|
||||
|
||||
for base_clf in (MultinomialNB(), LinearSVC(random_state=0),
|
||||
LinearRegression(), Ridge(),
|
||||
ElasticNet(), Lasso(alpha=0.5)):
|
||||
clf = OneVsRestClassifier(base_clf).fit(X, y)
|
||||
y_pred = clf.predict([[0, 4, 4]])[0]
|
||||
assert_array_equal(y_pred, [0, 1, 1])
|
||||
assert clf.multilabel_
|
||||
|
||||
|
||||
def test_ovr_fit_predict_svc():
|
||||
ovr = OneVsRestClassifier(svm.SVC())
|
||||
ovr.fit(iris.data, iris.target)
|
||||
assert len(ovr.estimators_) == 3
|
||||
assert ovr.score(iris.data, iris.target) > .9
|
||||
|
||||
|
||||
def test_ovr_multilabel_dataset():
|
||||
base_clf = MultinomialNB(alpha=1)
|
||||
for au, prec, recall in zip((True, False), (0.51, 0.66), (0.51, 0.80)):
|
||||
X, Y = datasets.make_multilabel_classification(n_samples=100,
|
||||
n_features=20,
|
||||
n_classes=5,
|
||||
n_labels=2,
|
||||
length=50,
|
||||
allow_unlabeled=au,
|
||||
random_state=0)
|
||||
X_train, Y_train = X[:80], Y[:80]
|
||||
X_test, Y_test = X[80:], Y[80:]
|
||||
clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
|
||||
Y_pred = clf.predict(X_test)
|
||||
|
||||
assert clf.multilabel_
|
||||
assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"),
|
||||
prec,
|
||||
decimal=2)
|
||||
assert_almost_equal(recall_score(Y_test, Y_pred, average="micro"),
|
||||
recall,
|
||||
decimal=2)
|
||||
|
||||
|
||||
def test_ovr_multilabel_predict_proba():
|
||||
base_clf = MultinomialNB(alpha=1)
|
||||
for au in (False, True):
|
||||
X, Y = datasets.make_multilabel_classification(n_samples=100,
|
||||
n_features=20,
|
||||
n_classes=5,
|
||||
n_labels=3,
|
||||
length=50,
|
||||
allow_unlabeled=au,
|
||||
random_state=0)
|
||||
X_train, Y_train = X[:80], Y[:80]
|
||||
X_test = X[80:]
|
||||
clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
|
||||
|
||||
# Decision function only estimator.
|
||||
decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
|
||||
assert not hasattr(decision_only, 'predict_proba')
|
||||
|
||||
# Estimator with predict_proba disabled, depending on parameters.
|
||||
decision_only = OneVsRestClassifier(svm.SVC(probability=False))
|
||||
assert not hasattr(decision_only, 'predict_proba')
|
||||
decision_only.fit(X_train, Y_train)
|
||||
assert not hasattr(decision_only, 'predict_proba')
|
||||
assert hasattr(decision_only, 'decision_function')
|
||||
|
||||
# Estimator which can get predict_proba enabled after fitting
|
||||
gs = GridSearchCV(svm.SVC(probability=False),
|
||||
param_grid={'probability': [True]})
|
||||
proba_after_fit = OneVsRestClassifier(gs)
|
||||
assert not hasattr(proba_after_fit, 'predict_proba')
|
||||
proba_after_fit.fit(X_train, Y_train)
|
||||
assert hasattr(proba_after_fit, 'predict_proba')
|
||||
|
||||
Y_pred = clf.predict(X_test)
|
||||
Y_proba = clf.predict_proba(X_test)
|
||||
|
||||
# predict assigns a label if the probability that the
|
||||
# sample has the label is greater than 0.5.
|
||||
pred = Y_proba > .5
|
||||
assert_array_equal(pred, Y_pred)
|
||||
|
||||
|
||||
def test_ovr_single_label_predict_proba():
|
||||
base_clf = MultinomialNB(alpha=1)
|
||||
X, Y = iris.data, iris.target
|
||||
X_train, Y_train = X[:80], Y[:80]
|
||||
X_test = X[80:]
|
||||
clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
|
||||
|
||||
# Decision function only estimator.
|
||||
decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
|
||||
assert not hasattr(decision_only, 'predict_proba')
|
||||
|
||||
Y_pred = clf.predict(X_test)
|
||||
Y_proba = clf.predict_proba(X_test)
|
||||
|
||||
assert_almost_equal(Y_proba.sum(axis=1), 1.0)
|
||||
# predict assigns a label if the probability that the
|
||||
# sample has the label is greater than 0.5.
|
||||
pred = np.array([l.argmax() for l in Y_proba])
|
||||
assert not (pred - Y_pred).any()
|
||||
|
||||
|
||||
def test_ovr_multilabel_decision_function():
|
||||
X, Y = datasets.make_multilabel_classification(n_samples=100,
|
||||
n_features=20,
|
||||
n_classes=5,
|
||||
n_labels=3,
|
||||
length=50,
|
||||
allow_unlabeled=True,
|
||||
random_state=0)
|
||||
X_train, Y_train = X[:80], Y[:80]
|
||||
X_test = X[80:]
|
||||
clf = OneVsRestClassifier(svm.SVC()).fit(X_train, Y_train)
|
||||
assert_array_equal((clf.decision_function(X_test) > 0).astype(int),
|
||||
clf.predict(X_test))
|
||||
|
||||
|
||||
def test_ovr_single_label_decision_function():
|
||||
X, Y = datasets.make_classification(n_samples=100,
|
||||
n_features=20,
|
||||
random_state=0)
|
||||
X_train, Y_train = X[:80], Y[:80]
|
||||
X_test = X[80:]
|
||||
clf = OneVsRestClassifier(svm.SVC()).fit(X_train, Y_train)
|
||||
assert_array_equal(clf.decision_function(X_test).ravel() > 0,
|
||||
clf.predict(X_test))
|
||||
|
||||
|
||||
def test_ovr_gridsearch():
|
||||
ovr = OneVsRestClassifier(LinearSVC(random_state=0))
|
||||
Cs = [0.1, 0.5, 0.8]
|
||||
cv = GridSearchCV(ovr, {'estimator__C': Cs})
|
||||
cv.fit(iris.data, iris.target)
|
||||
best_C = cv.best_estimator_.estimators_[0].C
|
||||
assert best_C in Cs
|
||||
|
||||
|
||||
def test_ovr_pipeline():
|
||||
# Test with pipeline of length one
|
||||
# This test is needed because the multiclass estimators may fail to detect
|
||||
# the presence of predict_proba or decision_function.
|
||||
clf = Pipeline([("tree", DecisionTreeClassifier())])
|
||||
ovr_pipe = OneVsRestClassifier(clf)
|
||||
ovr_pipe.fit(iris.data, iris.target)
|
||||
ovr = OneVsRestClassifier(DecisionTreeClassifier())
|
||||
ovr.fit(iris.data, iris.target)
|
||||
assert_array_equal(ovr.predict(iris.data), ovr_pipe.predict(iris.data))
|
||||
|
||||
|
||||
def test_ovr_coef_():
|
||||
for base_classifier in [SVC(kernel='linear', random_state=0),
|
||||
LinearSVC(random_state=0)]:
|
||||
# SVC has sparse coef with sparse input data
|
||||
|
||||
ovr = OneVsRestClassifier(base_classifier)
|
||||
for X in [iris.data, sp.csr_matrix(iris.data)]:
|
||||
# test with dense and sparse coef
|
||||
ovr.fit(X, iris.target)
|
||||
shape = ovr.coef_.shape
|
||||
assert shape[0] == n_classes
|
||||
assert shape[1] == iris.data.shape[1]
|
||||
# don't densify sparse coefficients
|
||||
assert (sp.issparse(ovr.estimators_[0].coef_) ==
|
||||
sp.issparse(ovr.coef_))
|
||||
|
||||
|
||||
def test_ovr_coef_exceptions():
|
||||
# Not fitted exception!
|
||||
ovr = OneVsRestClassifier(LinearSVC(random_state=0))
|
||||
# lambda is needed because we don't want coef_ to be evaluated right away
|
||||
assert_raises(ValueError, lambda x: ovr.coef_, None)
|
||||
|
||||
# Doesn't have coef_ exception!
|
||||
ovr = OneVsRestClassifier(DecisionTreeClassifier())
|
||||
ovr.fit(iris.data, iris.target)
|
||||
assert_raises(AttributeError, lambda x: ovr.coef_, None)
|
||||
|
||||
|
||||
def test_ovo_exceptions():
|
||||
ovo = OneVsOneClassifier(LinearSVC(random_state=0))
|
||||
assert_raises(ValueError, ovo.predict, [])
|
||||
|
||||
|
||||
def test_ovo_fit_on_list():
|
||||
# Test that OneVsOne fitting works with a list of targets and yields the
|
||||
# same output as predict from an array
|
||||
ovo = OneVsOneClassifier(LinearSVC(random_state=0))
|
||||
prediction_from_array = ovo.fit(iris.data, iris.target).predict(iris.data)
|
||||
iris_data_list = [list(a) for a in iris.data]
|
||||
prediction_from_list = ovo.fit(iris_data_list,
|
||||
list(iris.target)).predict(iris_data_list)
|
||||
assert_array_equal(prediction_from_array, prediction_from_list)
|
||||
|
||||
|
||||
def test_ovo_fit_predict():
|
||||
# A classifier which implements decision_function.
|
||||
ovo = OneVsOneClassifier(LinearSVC(random_state=0))
|
||||
ovo.fit(iris.data, iris.target).predict(iris.data)
|
||||
assert len(ovo.estimators_) == n_classes * (n_classes - 1) / 2
|
||||
|
||||
# A classifier which implements predict_proba.
|
||||
ovo = OneVsOneClassifier(MultinomialNB())
|
||||
ovo.fit(iris.data, iris.target).predict(iris.data)
|
||||
assert len(ovo.estimators_) == n_classes * (n_classes - 1) / 2
|
||||
|
||||
|
||||
def test_ovo_partial_fit_predict():
|
||||
temp = datasets.load_iris()
|
||||
X, y = temp.data, temp.target
|
||||
ovo1 = OneVsOneClassifier(MultinomialNB())
|
||||
ovo1.partial_fit(X[:100], y[:100], np.unique(y))
|
||||
ovo1.partial_fit(X[100:], y[100:])
|
||||
pred1 = ovo1.predict(X)
|
||||
|
||||
ovo2 = OneVsOneClassifier(MultinomialNB())
|
||||
ovo2.fit(X, y)
|
||||
pred2 = ovo2.predict(X)
|
||||
assert len(ovo1.estimators_) == n_classes * (n_classes - 1) / 2
|
||||
assert np.mean(y == pred1) > 0.65
|
||||
assert_almost_equal(pred1, pred2)
|
||||
|
||||
# Test when mini-batches have binary target classes
|
||||
ovo1 = OneVsOneClassifier(MultinomialNB())
|
||||
ovo1.partial_fit(X[:60], y[:60], np.unique(y))
|
||||
ovo1.partial_fit(X[60:], y[60:])
|
||||
pred1 = ovo1.predict(X)
|
||||
ovo2 = OneVsOneClassifier(MultinomialNB())
|
||||
pred2 = ovo2.fit(X, y).predict(X)
|
||||
|
||||
assert_almost_equal(pred1, pred2)
|
||||
assert len(ovo1.estimators_) == len(np.unique(y))
|
||||
assert np.mean(y == pred1) > 0.65
|
||||
|
||||
ovo = OneVsOneClassifier(MultinomialNB())
|
||||
X = np.random.rand(14, 2)
|
||||
y = [1, 1, 2, 3, 3, 0, 0, 4, 4, 4, 4, 4, 2, 2]
|
||||
ovo.partial_fit(X[:7], y[:7], [0, 1, 2, 3, 4])
|
||||
ovo.partial_fit(X[7:], y[7:])
|
||||
pred = ovo.predict(X)
|
||||
ovo2 = OneVsOneClassifier(MultinomialNB())
|
||||
pred2 = ovo2.fit(X, y).predict(X)
|
||||
assert_almost_equal(pred, pred2)
|
||||
|
||||
# raises error when mini-batch does not have classes from all_classes
|
||||
ovo = OneVsOneClassifier(MultinomialNB())
|
||||
error_y = [0, 1, 2, 3, 4, 5, 2]
|
||||
message_re = escape("Mini-batch contains {0} while "
|
||||
"it must be subset of {1}".format(np.unique(error_y),
|
||||
np.unique(y)))
|
||||
assert_raises_regexp(ValueError, message_re, ovo.partial_fit, X[:7],
|
||||
error_y, np.unique(y))
|
||||
|
||||
# test partial_fit only exists if estimator has it:
|
||||
ovr = OneVsOneClassifier(SVC())
|
||||
assert not hasattr(ovr, "partial_fit")
|
||||
|
||||
|
||||
def test_ovo_decision_function():
|
||||
n_samples = iris.data.shape[0]
|
||||
|
||||
ovo_clf = OneVsOneClassifier(LinearSVC(random_state=0))
|
||||
# first binary
|
||||
ovo_clf.fit(iris.data, iris.target == 0)
|
||||
decisions = ovo_clf.decision_function(iris.data)
|
||||
assert decisions.shape == (n_samples,)
|
||||
|
||||
# then multi-class
|
||||
ovo_clf.fit(iris.data, iris.target)
|
||||
decisions = ovo_clf.decision_function(iris.data)
|
||||
|
||||
assert decisions.shape == (n_samples, n_classes)
|
||||
assert_array_equal(decisions.argmax(axis=1), ovo_clf.predict(iris.data))
|
||||
|
||||
# Compute the votes
|
||||
votes = np.zeros((n_samples, n_classes))
|
||||
|
||||
k = 0
|
||||
for i in range(n_classes):
|
||||
for j in range(i + 1, n_classes):
|
||||
pred = ovo_clf.estimators_[k].predict(iris.data)
|
||||
votes[pred == 0, i] += 1
|
||||
votes[pred == 1, j] += 1
|
||||
k += 1
|
||||
|
||||
# Extract votes and verify
|
||||
assert_array_equal(votes, np.round(decisions))
|
||||
|
||||
for class_idx in range(n_classes):
|
||||
        # For each sample and each class, there are only 3 possible vote levels
|
||||
        # because there are only 3 distinct class pairs and thus 3 distinct
|
||||
# binary classifiers.
|
||||
# Therefore, sorting predictions based on votes would yield
|
||||
# mostly tied predictions:
|
||||
assert set(votes[:, class_idx]).issubset(set([0., 1., 2.]))
|
||||
|
||||
# The OVO decision function on the other hand is able to resolve
|
||||
# most of the ties on this data as it combines both the vote counts
|
||||
# and the aggregated confidence levels of the binary classifiers
|
||||
# to compute the aggregate decision function. The iris dataset
|
||||
# has 150 samples with a couple of duplicates. The OvO decisions
|
||||
# can resolve most of the ties:
|
||||
assert len(np.unique(decisions[:, class_idx])) > 146
|
||||
|
||||
|
||||
def test_ovo_gridsearch():
|
||||
ovo = OneVsOneClassifier(LinearSVC(random_state=0))
|
||||
Cs = [0.1, 0.5, 0.8]
|
||||
cv = GridSearchCV(ovo, {'estimator__C': Cs})
|
||||
cv.fit(iris.data, iris.target)
|
||||
best_C = cv.best_estimator_.estimators_[0].C
|
||||
assert best_C in Cs
|
||||
|
||||
|
||||
def test_ovo_ties():
|
||||
# Test that ties are broken using the decision function,
|
||||
# not defaulting to the smallest label
|
||||
X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
|
||||
y = np.array([2, 0, 1, 2])
|
||||
multi_clf = OneVsOneClassifier(Perceptron(shuffle=False, max_iter=4,
|
||||
tol=None))
|
||||
ovo_prediction = multi_clf.fit(X, y).predict(X)
|
||||
ovo_decision = multi_clf.decision_function(X)
|
||||
|
||||
# Classifiers are in order 0-1, 0-2, 1-2
|
||||
# Use decision_function to compute the votes and the normalized
|
||||
# sum_of_confidences, which is used to disambiguate when there is a tie in
|
||||
# votes.
|
||||
votes = np.round(ovo_decision)
|
||||
normalized_confidences = ovo_decision - votes
|
||||
|
||||
# For the first point, there is one vote per class
|
||||
assert_array_equal(votes[0, :], 1)
|
||||
# For the rest, there is no tie and the prediction is the argmax
|
||||
assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
|
||||
# For the tie, the prediction is the class with the highest score
|
||||
assert ovo_prediction[0] == normalized_confidences[0].argmax()
|
||||
|
||||
|
||||
def test_ovo_ties2():
|
||||
    # test that ties can be won not only by the first two labels
|
||||
X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
|
||||
y_ref = np.array([2, 0, 1, 2])
|
||||
|
||||
# cycle through labels so that each label wins once
|
||||
for i in range(3):
|
||||
y = (y_ref + i) % 3
|
||||
multi_clf = OneVsOneClassifier(Perceptron(shuffle=False, max_iter=4,
|
||||
tol=None))
|
||||
ovo_prediction = multi_clf.fit(X, y).predict(X)
|
||||
assert ovo_prediction[0] == i % 3
|
||||
|
||||
|
||||
def test_ovo_string_y():
|
||||
# Test that the OvO doesn't mess up the encoding of string labels
|
||||
X = np.eye(4)
|
||||
y = np.array(['a', 'b', 'c', 'd'])
|
||||
|
||||
ovo = OneVsOneClassifier(LinearSVC())
|
||||
ovo.fit(X, y)
|
||||
assert_array_equal(y, ovo.predict(X))
|
||||
|
||||
|
||||
def test_ovo_one_class():
|
||||
# Test error for OvO with one class
|
||||
X = np.eye(4)
|
||||
y = np.array(['a'] * 4)
|
||||
|
||||
ovo = OneVsOneClassifier(LinearSVC())
|
||||
assert_raise_message(ValueError, "when only one class", ovo.fit, X, y)
|
||||
|
||||
|
||||
def test_ovo_float_y():
|
||||
# Test that the OvO errors on float targets
|
||||
X = iris.data
|
||||
y = iris.data[:, 0]
|
||||
|
||||
ovo = OneVsOneClassifier(LinearSVC())
|
||||
assert_raise_message(ValueError, "Unknown label type", ovo.fit, X, y)
|
||||
|
||||
|
||||
def test_ecoc_exceptions():
|
||||
ecoc = OutputCodeClassifier(LinearSVC(random_state=0))
|
||||
assert_raises(ValueError, ecoc.predict, [])
|
||||
|
||||
|
||||
def test_ecoc_fit_predict():
|
||||
# A classifier which implements decision_function.
|
||||
ecoc = OutputCodeClassifier(LinearSVC(random_state=0),
|
||||
code_size=2, random_state=0)
|
||||
ecoc.fit(iris.data, iris.target).predict(iris.data)
|
||||
assert len(ecoc.estimators_) == n_classes * 2
|
||||
|
||||
# A classifier which implements predict_proba.
|
||||
ecoc = OutputCodeClassifier(MultinomialNB(), code_size=2, random_state=0)
|
||||
ecoc.fit(iris.data, iris.target).predict(iris.data)
|
||||
assert len(ecoc.estimators_) == n_classes * 2
|
||||
|
||||
|
||||
def test_ecoc_gridsearch():
|
||||
ecoc = OutputCodeClassifier(LinearSVC(random_state=0),
|
||||
random_state=0)
|
||||
Cs = [0.1, 0.5, 0.8]
|
||||
cv = GridSearchCV(ecoc, {'estimator__C': Cs})
|
||||
cv.fit(iris.data, iris.target)
|
||||
best_C = cv.best_estimator_.estimators_[0].C
|
||||
assert best_C in Cs
|
||||
|
||||
|
||||
def test_ecoc_float_y():
|
||||
# Test that the OCC errors on float targets
|
||||
X = iris.data
|
||||
y = iris.data[:, 0]
|
||||
|
||||
ovo = OutputCodeClassifier(LinearSVC())
|
||||
assert_raise_message(ValueError, "Unknown label type", ovo.fit, X, y)
|
||||
ovo = OutputCodeClassifier(LinearSVC(), code_size=-1)
|
||||
assert_raise_message(ValueError, "code_size should be greater than 0,"
|
||||
" got -1", ovo.fit, X, y)
|
||||
|
||||
|
||||
def test_pairwise_indices():
|
||||
clf_precomputed = svm.SVC(kernel='precomputed')
|
||||
X, y = iris.data, iris.target
|
||||
|
||||
ovr_false = OneVsOneClassifier(clf_precomputed)
|
||||
linear_kernel = np.dot(X, X.T)
|
||||
ovr_false.fit(linear_kernel, y)
|
||||
|
||||
n_estimators = len(ovr_false.estimators_)
|
||||
precomputed_indices = ovr_false.pairwise_indices_
|
||||
|
||||
for idx in precomputed_indices:
|
||||
assert (idx.shape[0] * n_estimators / (n_estimators - 1) ==
|
||||
linear_kernel.shape[0])
|
||||
|
||||
|
||||
def test_pairwise_attribute():
|
||||
clf_precomputed = svm.SVC(kernel='precomputed')
|
||||
clf_notprecomputed = svm.SVC()
|
||||
|
||||
for MultiClassClassifier in [OneVsRestClassifier, OneVsOneClassifier]:
|
||||
ovr_false = MultiClassClassifier(clf_notprecomputed)
|
||||
assert not ovr_false._pairwise
|
||||
|
||||
ovr_true = MultiClassClassifier(clf_precomputed)
|
||||
assert ovr_true._pairwise
|
||||
|
||||
|
||||
def test_pairwise_cross_val_score():
|
||||
clf_precomputed = svm.SVC(kernel='precomputed')
|
||||
clf_notprecomputed = svm.SVC(kernel='linear')
|
||||
|
||||
X, y = iris.data, iris.target
|
||||
|
||||
for MultiClassClassifier in [OneVsRestClassifier, OneVsOneClassifier]:
|
||||
ovr_false = MultiClassClassifier(clf_notprecomputed)
|
||||
ovr_true = MultiClassClassifier(clf_precomputed)
|
||||
|
||||
linear_kernel = np.dot(X, X.T)
|
||||
score_precomputed = cross_val_score(ovr_true, linear_kernel, y)
|
||||
score_linear = cross_val_score(ovr_false, X, y)
|
||||
assert_array_equal(score_precomputed, score_linear)
|
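
# Illustrative sketch (not part of the original test file): what the pairwise
# tests above rely on. With kernel='precomputed' the estimator is given the
# Gram matrix K = X @ X.T instead of X, and a meta-estimator exposing
# _pairwise signals that cross-validation must slice K on both axes. The
# helper name is hypothetical.
def _ovo_predict_with_precomputed_kernel(X, y):
    gram = np.dot(X, X.T)
    ovo = OneVsOneClassifier(svm.SVC(kernel='precomputed'))
    ovo.fit(gram, y)
    # predicting on the training set reuses the same square Gram matrix
    return ovo.predict(gram)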
613
venv/Lib/site-packages/sklearn/tests/test_multioutput.py
Normal file
@@ -0,0 +1,613 @@
import pytest
import numpy as np
import scipy.sparse as sp
from joblib import cpu_count

from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_raises
from sklearn.utils._testing import assert_raises_regex
from sklearn.utils._testing import assert_raise_message
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn import datasets
from sklearn.base import clone
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import jaccard_score, mean_squared_error
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import ClassifierChain, RegressorChain
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.multioutput import MultiOutputEstimator
from sklearn.svm import LinearSVC
from sklearn.base import ClassifierMixin
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyRegressor, DummyClassifier

def test_multi_target_regression():
    X, y = datasets.make_regression(n_targets=3)
    X_train, y_train = X[:50], y[:50]
    X_test, y_test = X[50:], y[50:]

    references = np.zeros_like(y_test)
    for n in range(3):
        rgr = GradientBoostingRegressor(random_state=0)
        rgr.fit(X_train, y_train[:, n])
        references[:, n] = rgr.predict(X_test)

    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr.fit(X_train, y_train)
    y_pred = rgr.predict(X_test)

    assert_almost_equal(references, y_pred)

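
# Illustrative sketch (not part of the original test file): the behaviour the
# test above checks is that MultiOutputRegressor simply fits one clone of the
# base estimator per output column and stacks the per-column predictions.
# The helper name is hypothetical; it mirrors the reference loop above.
def _fit_one_estimator_per_target(base_estimator, X, Y):
    fitted = [clone(base_estimator).fit(X, Y[:, k]) for k in range(Y.shape[1])]
    return np.column_stack([est.predict(X) for est in fitted])
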
||||
def test_multi_target_regression_partial_fit():
|
||||
X, y = datasets.make_regression(n_targets=3)
|
||||
X_train, y_train = X[:50], y[:50]
|
||||
X_test, y_test = X[50:], y[50:]
|
||||
|
||||
references = np.zeros_like(y_test)
|
||||
half_index = 25
|
||||
for n in range(3):
|
||||
sgr = SGDRegressor(random_state=0, max_iter=5)
|
||||
sgr.partial_fit(X_train[:half_index], y_train[:half_index, n])
|
||||
sgr.partial_fit(X_train[half_index:], y_train[half_index:, n])
|
||||
references[:, n] = sgr.predict(X_test)
|
||||
|
||||
sgr = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
|
||||
|
||||
sgr.partial_fit(X_train[:half_index], y_train[:half_index])
|
||||
sgr.partial_fit(X_train[half_index:], y_train[half_index:])
|
||||
|
||||
y_pred = sgr.predict(X_test)
|
||||
assert_almost_equal(references, y_pred)
|
||||
assert not hasattr(MultiOutputRegressor(Lasso), 'partial_fit')
|
||||
|
||||
|
||||
def test_multi_target_regression_one_target():
|
||||
# Test multi target regression raises
|
||||
X, y = datasets.make_regression(n_targets=1)
|
||||
rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
|
||||
assert_raises(ValueError, rgr.fit, X, y)
|
||||
|
||||
|
||||
def test_multi_target_sparse_regression():
|
||||
X, y = datasets.make_regression(n_targets=3)
|
||||
X_train, y_train = X[:50], y[:50]
|
||||
X_test = X[50:]
|
||||
|
||||
for sparse in [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix,
|
||||
sp.lil_matrix]:
|
||||
rgr = MultiOutputRegressor(Lasso(random_state=0))
|
||||
rgr_sparse = MultiOutputRegressor(Lasso(random_state=0))
|
||||
|
||||
rgr.fit(X_train, y_train)
|
||||
rgr_sparse.fit(sparse(X_train), y_train)
|
||||
|
||||
assert_almost_equal(rgr.predict(X_test),
|
||||
rgr_sparse.predict(sparse(X_test)))
|
||||
|
||||
|
||||
def test_multi_target_sample_weights_api():
|
||||
X = [[1, 2, 3], [4, 5, 6]]
|
||||
y = [[3.141, 2.718], [2.718, 3.141]]
|
||||
w = [0.8, 0.6]
|
||||
|
||||
rgr = MultiOutputRegressor(OrthogonalMatchingPursuit())
|
||||
assert_raises_regex(ValueError, "does not support sample weights",
|
||||
rgr.fit, X, y, w)
|
||||
|
||||
# no exception should be raised if the base estimator supports weights
|
||||
rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
|
||||
rgr.fit(X, y, w)
|
||||
|
||||
|
||||
def test_multi_target_sample_weight_partial_fit():
|
||||
# weighted regressor
|
||||
X = [[1, 2, 3], [4, 5, 6]]
|
||||
y = [[3.141, 2.718], [2.718, 3.141]]
|
||||
w = [2., 1.]
|
||||
rgr_w = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
|
||||
rgr_w.partial_fit(X, y, w)
|
||||
|
||||
# weighted with different weights
|
||||
w = [2., 2.]
|
||||
rgr = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
|
||||
rgr.partial_fit(X, y, w)
|
||||
|
||||
assert rgr.predict(X)[0][0] != rgr_w.predict(X)[0][0]
|
||||
|
||||
|
||||
def test_multi_target_sample_weights():
|
||||
# weighted regressor
|
||||
Xw = [[1, 2, 3], [4, 5, 6]]
|
||||
yw = [[3.141, 2.718], [2.718, 3.141]]
|
||||
w = [2., 1.]
|
||||
rgr_w = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
|
||||
rgr_w.fit(Xw, yw, w)
|
||||
|
||||
# unweighted, but with repeated samples
|
||||
X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
|
||||
y = [[3.141, 2.718], [3.141, 2.718], [2.718, 3.141]]
|
||||
rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
|
||||
rgr.fit(X, y)
|
||||
|
||||
X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
|
||||
assert_almost_equal(rgr.predict(X_test), rgr_w.predict(X_test))
|
||||
|
||||
|
||||
# Import the data
|
||||
iris = datasets.load_iris()
|
||||
# Create multiple targets by shuffling y and concatenating the results.
|
||||
X = iris.data
|
||||
y1 = iris.target
|
||||
y2 = shuffle(y1, random_state=1)
|
||||
y3 = shuffle(y1, random_state=2)
|
||||
y = np.column_stack((y1, y2, y3))
|
||||
n_samples, n_features = X.shape
|
||||
n_outputs = y.shape[1]
|
||||
n_classes = len(np.unique(y1))
|
||||
classes = list(map(np.unique, (y1, y2, y3)))
|
||||
|
||||
|
||||
def test_multi_output_classification_partial_fit_parallelism():
|
||||
sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
|
||||
mor = MultiOutputClassifier(sgd_linear_clf, n_jobs=4)
|
||||
mor.partial_fit(X, y, classes)
|
||||
est1 = mor.estimators_[0]
|
||||
mor.partial_fit(X, y)
|
||||
est2 = mor.estimators_[0]
|
||||
if cpu_count() > 1:
|
||||
# parallelism requires this to be the case for a sane implementation
|
||||
assert est1 is not est2
|
||||
|
||||
|
||||
# check multioutput has predict_proba
|
||||
def test_hasattr_multi_output_predict_proba():
|
||||
# default SGDClassifier has loss='hinge'
|
||||
# which does not expose a predict_proba method
|
||||
sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
|
||||
multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
|
||||
multi_target_linear.fit(X, y)
|
||||
assert not hasattr(multi_target_linear, "predict_proba")
|
||||
|
||||
# case where predict_proba attribute exists
|
||||
sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
|
||||
multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
|
||||
multi_target_linear.fit(X, y)
|
||||
assert hasattr(multi_target_linear, "predict_proba")
|
||||
|
||||
|
||||
# check predict_proba passes
|
||||
def test_multi_output_predict_proba():
|
||||
sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
|
||||
param = {'loss': ('hinge', 'log', 'modified_huber')}
|
||||
|
||||
# inner function for custom scoring
|
||||
def custom_scorer(estimator, X, y):
|
||||
if hasattr(estimator, "predict_proba"):
|
||||
return 1.0
|
||||
else:
|
||||
return 0.0
|
||||
grid_clf = GridSearchCV(sgd_linear_clf, param_grid=param,
|
||||
scoring=custom_scorer, cv=3)
|
||||
multi_target_linear = MultiOutputClassifier(grid_clf)
|
||||
multi_target_linear.fit(X, y)
|
||||
|
||||
multi_target_linear.predict_proba(X)
|
||||
|
||||
# SGDClassifier defaults to loss='hinge' which is not a probabilistic
|
||||
# loss function; therefore it does not expose a predict_proba method
|
||||
sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
|
||||
multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
|
||||
multi_target_linear.fit(X, y)
|
||||
err_msg = "The base estimator should implement predict_proba method"
|
||||
with pytest.raises(AttributeError, match=err_msg):
|
||||
multi_target_linear.predict_proba(X)
|
||||
|
||||
|
||||
def test_multi_output_classification_partial_fit():
|
||||
# test if multi_target initializes correctly with base estimator and fit
|
||||
# assert predictions work as expected for predict
|
||||
|
||||
sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
|
||||
multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
|
||||
|
||||
# train the multi_target_linear and also get the predictions.
|
||||
half_index = X.shape[0] // 2
|
||||
multi_target_linear.partial_fit(
|
||||
X[:half_index], y[:half_index], classes=classes)
|
||||
|
||||
first_predictions = multi_target_linear.predict(X)
|
||||
assert (n_samples, n_outputs) == first_predictions.shape
|
||||
|
||||
multi_target_linear.partial_fit(X[half_index:], y[half_index:])
|
||||
second_predictions = multi_target_linear.predict(X)
|
||||
assert (n_samples, n_outputs) == second_predictions.shape
|
||||
|
||||
# train the linear classification with each column and assert that
|
||||
# predictions are equal after first partial_fit and second partial_fit
|
||||
for i in range(3):
|
||||
# create a clone with the same state
|
||||
sgd_linear_clf = clone(sgd_linear_clf)
|
||||
sgd_linear_clf.partial_fit(
|
||||
X[:half_index], y[:half_index, i], classes=classes[i])
|
||||
assert_array_equal(sgd_linear_clf.predict(X), first_predictions[:, i])
|
||||
sgd_linear_clf.partial_fit(X[half_index:], y[half_index:, i])
|
||||
assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, i])
|
||||
|
||||
|
||||
def test_multi_output_classification_partial_fit_no_first_classes_exception():
|
||||
sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
|
||||
multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
|
||||
assert_raises_regex(ValueError, "classes must be passed on the first call "
|
||||
"to partial_fit.",
|
||||
multi_target_linear.partial_fit, X, y)
|
||||
|
||||
|
||||
def test_multi_output_classification():
|
||||
# test if multi_target initializes correctly with base estimator and fit
|
||||
# assert predictions work as expected for predict, predict_proba and score
|
||||
|
||||
forest = RandomForestClassifier(n_estimators=10, random_state=1)
|
||||
multi_target_forest = MultiOutputClassifier(forest)
|
||||
|
||||
# train the multi_target_forest and also get the predictions.
|
||||
multi_target_forest.fit(X, y)
|
||||
|
||||
predictions = multi_target_forest.predict(X)
|
||||
assert (n_samples, n_outputs) == predictions.shape
|
||||
|
||||
predict_proba = multi_target_forest.predict_proba(X)
|
||||
|
||||
assert len(predict_proba) == n_outputs
|
||||
for class_probabilities in predict_proba:
|
||||
assert (n_samples, n_classes) == class_probabilities.shape
|
||||
|
||||
assert_array_equal(np.argmax(np.dstack(predict_proba), axis=1),
|
||||
predictions)
|
||||
|
||||
# train the forest with each column and assert that predictions are equal
|
||||
for i in range(3):
|
||||
forest_ = clone(forest) # create a clone with the same state
|
||||
forest_.fit(X, y[:, i])
|
||||
assert list(forest_.predict(X)) == list(predictions[:, i])
|
||||
assert_array_equal(list(forest_.predict_proba(X)),
|
||||
list(predict_proba[i]))
|
||||
|
||||
|
||||
def test_multiclass_multioutput_estimator():
|
||||
# test to check meta of meta estimators
|
||||
svc = LinearSVC(random_state=0)
|
||||
multi_class_svc = OneVsRestClassifier(svc)
|
||||
multi_target_svc = MultiOutputClassifier(multi_class_svc)
|
||||
|
||||
multi_target_svc.fit(X, y)
|
||||
|
||||
predictions = multi_target_svc.predict(X)
|
||||
assert (n_samples, n_outputs) == predictions.shape
|
||||
|
||||
# train the forest with each column and assert that predictions are equal
|
||||
for i in range(3):
|
||||
multi_class_svc_ = clone(multi_class_svc) # create a clone
|
||||
multi_class_svc_.fit(X, y[:, i])
|
||||
assert (list(multi_class_svc_.predict(X)) ==
|
||||
list(predictions[:, i]))
|
||||
|
||||
|
||||
def test_multiclass_multioutput_estimator_predict_proba():
|
||||
seed = 542
|
||||
|
||||
# make test deterministic
|
||||
rng = np.random.RandomState(seed)
|
||||
|
||||
# random features
|
||||
X = rng.normal(size=(5, 5))
|
||||
|
||||
# random labels
|
||||
y1 = np.array(['b', 'a', 'a', 'b', 'a']).reshape(5, 1) # 2 classes
|
||||
y2 = np.array(['d', 'e', 'f', 'e', 'd']).reshape(5, 1) # 3 classes
|
||||
|
||||
Y = np.concatenate([y1, y2], axis=1)
|
||||
|
||||
clf = MultiOutputClassifier(LogisticRegression(
|
||||
solver='liblinear', random_state=seed))
|
||||
|
||||
clf.fit(X, Y)
|
||||
|
||||
y_result = clf.predict_proba(X)
|
||||
y_actual = [np.array([[0.23481764, 0.76518236],
|
||||
[0.67196072, 0.32803928],
|
||||
[0.54681448, 0.45318552],
|
||||
[0.34883923, 0.65116077],
|
||||
[0.73687069, 0.26312931]]),
|
||||
np.array([[0.5171785, 0.23878628, 0.24403522],
|
||||
[0.22141451, 0.64102704, 0.13755846],
|
||||
[0.16751315, 0.18256843, 0.64991843],
|
||||
[0.27357372, 0.55201592, 0.17441036],
|
||||
[0.65745193, 0.26062899, 0.08191907]])]
|
||||
|
||||
for i in range(len(y_actual)):
|
||||
assert_almost_equal(y_result[i], y_actual[i])
|
||||
|
||||
|
||||
def test_multi_output_classification_sample_weights():
|
||||
# weighted classifier
|
||||
Xw = [[1, 2, 3], [4, 5, 6]]
|
||||
yw = [[3, 2], [2, 3]]
|
||||
w = np.asarray([2., 1.])
|
||||
forest = RandomForestClassifier(n_estimators=10, random_state=1)
|
||||
clf_w = MultiOutputClassifier(forest)
|
||||
clf_w.fit(Xw, yw, w)
|
||||
|
||||
# unweighted, but with repeated samples
|
||||
X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
|
||||
y = [[3, 2], [3, 2], [2, 3]]
|
||||
forest = RandomForestClassifier(n_estimators=10, random_state=1)
|
||||
clf = MultiOutputClassifier(forest)
|
||||
clf.fit(X, y)
|
||||
|
||||
X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
|
||||
assert_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
|
||||
|
||||
|
||||
def test_multi_output_classification_partial_fit_sample_weights():
|
||||
# weighted classifier
|
||||
Xw = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
|
||||
yw = [[3, 2], [2, 3], [3, 2]]
|
||||
w = np.asarray([2., 1., 1.])
|
||||
sgd_linear_clf = SGDClassifier(random_state=1, max_iter=20)
|
||||
clf_w = MultiOutputClassifier(sgd_linear_clf)
|
||||
clf_w.fit(Xw, yw, w)
|
||||
|
||||
# unweighted, but with repeated samples
|
||||
X = [[1, 2, 3], [1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
|
||||
y = [[3, 2], [3, 2], [2, 3], [3, 2]]
|
||||
sgd_linear_clf = SGDClassifier(random_state=1, max_iter=20)
|
||||
clf = MultiOutputClassifier(sgd_linear_clf)
|
||||
clf.fit(X, y)
|
||||
X_test = [[1.5, 2.5, 3.5]]
|
||||
assert_array_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
|
||||
|
||||
|
||||
def test_multi_output_exceptions():
|
||||
# NotFittedError when fit is not done but score, predict and
|
||||
# predict_proba are called
|
||||
moc = MultiOutputClassifier(LinearSVC(random_state=0))
|
||||
assert_raises(NotFittedError, moc.predict, y)
|
||||
with pytest.raises(NotFittedError):
|
||||
moc.predict_proba
|
||||
assert_raises(NotFittedError, moc.score, X, y)
|
||||
# ValueError when number of outputs is different
|
||||
# for fit and score
|
||||
y_new = np.column_stack((y1, y2))
|
||||
moc.fit(X, y)
|
||||
assert_raises(ValueError, moc.score, X, y_new)
|
||||
# ValueError when y is continuous
|
||||
assert_raise_message(ValueError, "Unknown label type", moc.fit, X, X[:, 1])
|
||||
|
||||
|
def generate_multilabel_dataset_with_correlations():
    # Generate a multilabel data set from a multiclass dataset by
    # representing the integer label of each original class with a binary
    # encoding, so that the output labels are correlated.
    X, y = make_classification(n_samples=1000,
                               n_features=100,
                               n_classes=16,
                               n_informative=10,
                               random_state=0)

    Y_multi = np.array([[int(yyy) for yyy in format(yy, '#06b')[2:]]
                        for yy in y])
    return X, Y_multi

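
# Illustrative note (not part of the original test file): format(yy, '#06b')
# zero-pads the class integer to a 4-bit binary string ('#06b' gives e.g.
# '0b0101' for class 5, and slicing off the '0b' prefix leaves '0101'), so
# each of the 16 classes maps to 4 correlated binary labels. A minimal
# stand-alone sketch of the same encoding, with a hypothetical helper name:
def _binary_encode_labels(y, n_bits=4):
    # y contains integers in [0, 2 ** n_bits); each row of the result is the
    # n_bits-wide binary representation of the corresponding label
    return np.array([[int(bit) for bit in format(label, '0{}b'.format(n_bits))]
                     for label in y])
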
||||
def test_classifier_chain_fit_and_predict_with_linear_svc():
|
||||
# Fit classifier chain and verify predict performance using LinearSVC
|
||||
X, Y = generate_multilabel_dataset_with_correlations()
|
||||
classifier_chain = ClassifierChain(LinearSVC())
|
||||
classifier_chain.fit(X, Y)
|
||||
|
||||
Y_pred = classifier_chain.predict(X)
|
||||
assert Y_pred.shape == Y.shape
|
||||
|
||||
Y_decision = classifier_chain.decision_function(X)
|
||||
|
||||
Y_binary = (Y_decision >= 0)
|
||||
assert_array_equal(Y_binary, Y_pred)
|
||||
assert not hasattr(classifier_chain, 'predict_proba')
|
||||
|
||||
|
||||
def test_classifier_chain_fit_and_predict_with_sparse_data():
|
||||
# Fit classifier chain with sparse data
|
||||
X, Y = generate_multilabel_dataset_with_correlations()
|
||||
X_sparse = sp.csr_matrix(X)
|
||||
|
||||
classifier_chain = ClassifierChain(LogisticRegression())
|
||||
classifier_chain.fit(X_sparse, Y)
|
||||
Y_pred_sparse = classifier_chain.predict(X_sparse)
|
||||
|
||||
classifier_chain = ClassifierChain(LogisticRegression())
|
||||
classifier_chain.fit(X, Y)
|
||||
Y_pred_dense = classifier_chain.predict(X)
|
||||
|
||||
assert_array_equal(Y_pred_sparse, Y_pred_dense)
|
||||
|
||||
|
||||
def test_classifier_chain_vs_independent_models():
|
||||
# Verify that an ensemble of classifier chains (each of length
|
||||
# N) can achieve a higher Jaccard similarity score than N independent
|
||||
# models
|
||||
X, Y = generate_multilabel_dataset_with_correlations()
|
||||
X_train = X[:600, :]
|
||||
X_test = X[600:, :]
|
||||
Y_train = Y[:600, :]
|
||||
Y_test = Y[600:, :]
|
||||
|
||||
ovr = OneVsRestClassifier(LogisticRegression())
|
||||
ovr.fit(X_train, Y_train)
|
||||
Y_pred_ovr = ovr.predict(X_test)
|
||||
|
||||
chain = ClassifierChain(LogisticRegression())
|
||||
chain.fit(X_train, Y_train)
|
||||
Y_pred_chain = chain.predict(X_test)
|
||||
|
||||
assert (jaccard_score(Y_test, Y_pred_chain, average='samples') >
|
||||
jaccard_score(Y_test, Y_pred_ovr, average='samples'))
|
||||
|
||||
|
||||
def test_base_chain_fit_and_predict():
|
||||
# Fit base chain and verify predict performance
|
||||
X, Y = generate_multilabel_dataset_with_correlations()
|
||||
chains = [RegressorChain(Ridge()),
|
||||
ClassifierChain(LogisticRegression())]
|
||||
for chain in chains:
|
||||
chain.fit(X, Y)
|
||||
Y_pred = chain.predict(X)
|
||||
assert Y_pred.shape == Y.shape
|
||||
assert ([c.coef_.size for c in chain.estimators_] ==
|
||||
list(range(X.shape[1], X.shape[1] + Y.shape[1])))
|
||||
|
||||
Y_prob = chains[1].predict_proba(X)
|
||||
Y_binary = (Y_prob >= .5)
|
||||
assert_array_equal(Y_binary, Y_pred)
|
||||
|
||||
assert isinstance(chains[1], ClassifierMixin)
|
||||
|
||||
|
||||
def test_base_chain_fit_and_predict_with_sparse_data_and_cv():
|
||||
# Fit base chain with sparse data cross_val_predict
|
||||
X, Y = generate_multilabel_dataset_with_correlations()
|
||||
X_sparse = sp.csr_matrix(X)
|
||||
base_chains = [ClassifierChain(LogisticRegression(), cv=3),
|
||||
RegressorChain(Ridge(), cv=3)]
|
||||
for chain in base_chains:
|
||||
chain.fit(X_sparse, Y)
|
||||
Y_pred = chain.predict(X_sparse)
|
||||
assert Y_pred.shape == Y.shape
|
||||
|
||||
|
||||
def test_base_chain_random_order():
|
||||
# Fit base chain with random order
|
||||
X, Y = generate_multilabel_dataset_with_correlations()
|
||||
for chain in [ClassifierChain(LogisticRegression()),
|
||||
RegressorChain(Ridge())]:
|
||||
chain_random = clone(chain).set_params(order='random', random_state=42)
|
||||
chain_random.fit(X, Y)
|
||||
chain_fixed = clone(chain).set_params(order=chain_random.order_)
|
||||
chain_fixed.fit(X, Y)
|
||||
assert_array_equal(chain_fixed.order_, chain_random.order_)
|
||||
assert list(chain_random.order) != list(range(4))
|
||||
assert len(chain_random.order_) == 4
|
||||
assert len(set(chain_random.order_)) == 4
|
||||
# Randomly ordered chain should behave identically to a fixed order
|
||||
# chain with the same order.
|
||||
for est1, est2 in zip(chain_random.estimators_,
|
||||
chain_fixed.estimators_):
|
||||
assert_array_almost_equal(est1.coef_, est2.coef_)
|
||||
|
||||
|
||||
def test_base_chain_crossval_fit_and_predict():
|
||||
# Fit chain with cross_val_predict and verify predict
|
||||
# performance
|
||||
X, Y = generate_multilabel_dataset_with_correlations()
|
||||
|
||||
for chain in [ClassifierChain(LogisticRegression()),
|
||||
RegressorChain(Ridge())]:
|
||||
chain.fit(X, Y)
|
||||
chain_cv = clone(chain).set_params(cv=3)
|
||||
chain_cv.fit(X, Y)
|
||||
Y_pred_cv = chain_cv.predict(X)
|
||||
Y_pred = chain.predict(X)
|
||||
|
||||
assert Y_pred_cv.shape == Y_pred.shape
|
||||
assert not np.all(Y_pred == Y_pred_cv)
|
||||
if isinstance(chain, ClassifierChain):
|
||||
assert jaccard_score(Y, Y_pred_cv, average='samples') > .4
|
||||
else:
|
||||
assert mean_squared_error(Y, Y_pred_cv) < .25
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'estimator',
|
||||
[RandomForestClassifier(n_estimators=2),
|
||||
MultiOutputClassifier(RandomForestClassifier(n_estimators=2)),
|
||||
ClassifierChain(RandomForestClassifier(n_estimators=2))]
|
||||
)
|
||||
def test_multi_output_classes_(estimator):
|
||||
# Tests classes_ attribute of multioutput classifiers
|
||||
# RandomForestClassifier supports multioutput out-of-the-box
|
||||
estimator.fit(X, y)
|
||||
assert isinstance(estimator.classes_, list)
|
||||
assert len(estimator.classes_) == n_outputs
|
||||
for estimator_classes, expected_classes in zip(classes,
|
||||
estimator.classes_):
|
||||
assert_array_equal(estimator_classes, expected_classes)
|
||||
|
||||
|
||||
# TODO: remove in 0.24
|
||||
def test_deprecation():
|
||||
class A(MultiOutputEstimator, MultiOutputRegressor):
|
||||
pass
|
||||
|
||||
with pytest.warns(FutureWarning, match="is deprecated in version 0.22"):
|
||||
A(SGDRegressor(random_state=0, max_iter=5))
|
||||
|
||||
|
||||
class DummyRegressorWithFitParams(DummyRegressor):
|
||||
def fit(self, X, y, sample_weight=None, **fit_params):
|
||||
self._fit_params = fit_params
|
||||
return super().fit(X, y, sample_weight)
|
||||
|
||||
|
||||
class DummyClassifierWithFitParams(DummyClassifier):
|
||||
def fit(self, X, y, sample_weight=None, **fit_params):
|
||||
self._fit_params = fit_params
|
||||
return super().fit(X, y, sample_weight)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"estimator, dataset",
|
||||
[(MultiOutputClassifier(DummyClassifierWithFitParams(strategy="prior")),
|
||||
datasets.make_multilabel_classification()),
|
||||
(MultiOutputRegressor(DummyRegressorWithFitParams()),
|
||||
datasets.make_regression(n_targets=3))])
|
||||
def test_multioutput_estimator_with_fit_params(estimator, dataset):
|
||||
X, y = dataset
|
||||
some_param = np.zeros_like(X)
|
||||
estimator.fit(X, y, some_param=some_param)
|
||||
for dummy_estimator in estimator.estimators_:
|
||||
assert 'some_param' in dummy_estimator._fit_params
|
||||
|
||||
|
||||
def test_regressor_chain_w_fit_params():
|
||||
# Make sure fit_params are properly propagated to the sub-estimators
|
||||
rng = np.random.RandomState(0)
|
||||
X, y = datasets.make_regression(n_targets=3)
|
||||
weight = rng.rand(y.shape[0])
|
||||
|
||||
class MySGD(SGDRegressor):
|
||||
|
||||
def fit(self, X, y, **fit_params):
|
||||
self.sample_weight_ = fit_params['sample_weight']
|
||||
super().fit(X, y, **fit_params)
|
||||
|
||||
model = RegressorChain(MySGD())
|
||||
|
||||
# Fitting with params
|
||||
fit_param = {'sample_weight': weight}
|
||||
model.fit(X, y, **fit_param)
|
||||
|
||||
for est in model.estimators_:
|
||||
assert est.sample_weight_ is weight
|
844
venv/Lib/site-packages/sklearn/tests/test_naive_bayes.py
Normal file
@@ -0,0 +1,844 @@

import pickle
from io import BytesIO
import numpy as np
import scipy.sparse
import pytest

from sklearn.datasets import load_digits, load_iris

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_raises
from sklearn.utils._testing import assert_raise_message
from sklearn.utils._testing import assert_warns
from sklearn.utils._testing import assert_no_warnings

from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import BaseNB, BaseDiscreteNB


# Data is just 6 separable points in the plane
X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
y = np.array([1, 1, 1, 2, 2, 2])

# A bit more random tests
rng = np.random.RandomState(0)
X1 = rng.normal(size=(10, 3))
y1 = (rng.normal(size=(10)) > 0).astype(np.int)

# Data is 6 random integer points in a 100 dimensional space classified to
# three classes.
X2 = rng.randint(5, size=(6, 100))
y2 = np.array([1, 1, 2, 2, 3, 3])


def test_gnb():
    # Gaussian Naive Bayes classification.
    # This checks that GaussianNB implements fit and predict and returns
    # correct values for a simple toy dataset.

    clf = GaussianNB()
    y_pred = clf.fit(X, y).predict(X)
    assert_array_equal(y_pred, y)

    y_pred_proba = clf.predict_proba(X)
    y_pred_log_proba = clf.predict_log_proba(X)
    assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)

    # Test whether label mismatch between target y and classes raises
    # an Error
    # FIXME Remove this test once the more general partial_fit tests are merged
    assert_raises(ValueError, GaussianNB().partial_fit, X, y, classes=[0, 1])

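
# Illustrative sketch (not part of the original test file): the quantity that
# GaussianNB maximises in test_gnb above. The predicted class is the argmax
# over classes of the log prior plus the sum of per-feature Gaussian
# log-likelihoods; the function below recomputes that score from the fitted
# attributes (theta_, sigma_, class_prior_). The helper name is hypothetical.
def _gnb_joint_log_likelihood(clf, X):
    scores = []
    for i in range(len(clf.classes_)):
        log_prior = np.log(clf.class_prior_[i])
        # sum over features j of log N(x_j; theta_[i, j], sigma_[i, j])
        log_lik = -0.5 * np.sum(np.log(2. * np.pi * clf.sigma_[i])
                                + (X - clf.theta_[i]) ** 2 / clf.sigma_[i],
                                axis=1)
        scores.append(log_prior + log_lik)
    return np.array(scores).T  # shape (n_samples, n_classes)
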
||||
def test_gnb_prior():
|
||||
# Test whether class priors are properly set.
|
||||
clf = GaussianNB().fit(X, y)
|
||||
assert_array_almost_equal(np.array([3, 3]) / 6.0,
|
||||
clf.class_prior_, 8)
|
||||
clf.fit(X1, y1)
|
||||
# Check that the class priors sum to 1
|
||||
assert_array_almost_equal(clf.class_prior_.sum(), 1)
|
||||
|
||||
|
||||
def test_gnb_sample_weight():
|
||||
"""Test whether sample weights are properly used in GNB. """
|
||||
# Sample weights all being 1 should not change results
|
||||
sw = np.ones(6)
|
||||
clf = GaussianNB().fit(X, y)
|
||||
clf_sw = GaussianNB().fit(X, y, sw)
|
||||
|
||||
assert_array_almost_equal(clf.theta_, clf_sw.theta_)
|
||||
assert_array_almost_equal(clf.sigma_, clf_sw.sigma_)
|
||||
|
||||
# Fitting twice with half sample-weights should result
|
||||
# in same result as fitting once with full weights
|
||||
sw = rng.rand(y.shape[0])
|
||||
clf1 = GaussianNB().fit(X, y, sample_weight=sw)
|
||||
clf2 = GaussianNB().partial_fit(X, y, classes=[1, 2], sample_weight=sw / 2)
|
||||
clf2.partial_fit(X, y, sample_weight=sw / 2)
|
||||
|
||||
assert_array_almost_equal(clf1.theta_, clf2.theta_)
|
||||
assert_array_almost_equal(clf1.sigma_, clf2.sigma_)
|
||||
|
||||
# Check that duplicate entries and correspondingly increased sample
|
||||
# weights yield the same result
|
||||
ind = rng.randint(0, X.shape[0], 20)
|
||||
sample_weight = np.bincount(ind, minlength=X.shape[0])
|
||||
|
||||
clf_dupl = GaussianNB().fit(X[ind], y[ind])
|
||||
clf_sw = GaussianNB().fit(X, y, sample_weight)
|
||||
|
||||
assert_array_almost_equal(clf_dupl.theta_, clf_sw.theta_)
|
||||
assert_array_almost_equal(clf_dupl.sigma_, clf_sw.sigma_)
|
||||
|
||||
|
||||
def test_gnb_neg_priors():
|
||||
"""Test whether an error is raised in case of negative priors"""
|
||||
clf = GaussianNB(priors=np.array([-1., 2.]))
|
||||
assert_raises(ValueError, clf.fit, X, y)
|
||||
|
||||
|
||||
def test_gnb_priors():
|
||||
"""Test whether the class prior override is properly used"""
|
||||
clf = GaussianNB(priors=np.array([0.3, 0.7])).fit(X, y)
|
||||
assert_array_almost_equal(clf.predict_proba([[-0.1, -0.1]]),
|
||||
np.array([[0.825303662161683,
|
||||
0.174696337838317]]), 8)
|
||||
assert_array_almost_equal(clf.class_prior_, np.array([0.3, 0.7]))
|
||||
|
||||
|
||||
def test_gnb_priors_sum_isclose():
|
||||
# test whether the class prior sum is properly validated
|
||||
X = np.array([[-1, -1], [-2, -1], [-3, -2], [-4, -5], [-5, -4],
|
||||
[1, 1], [2, 1], [3, 2], [4, 4], [5, 5]])
|
||||
priors = np.array([0.08, 0.14, 0.03, 0.16, 0.11, 0.16, 0.07, 0.14,
|
||||
0.11, 0.0])
|
||||
Y = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
|
||||
clf = GaussianNB(priors=priors)
|
||||
# smoke test for issue #9633
|
||||
clf.fit(X, Y)
|
||||
|
||||
|
||||
def test_gnb_wrong_nb_priors():
|
||||
""" Test whether an error is raised if the number of prior is different
|
||||
from the number of class"""
|
||||
clf = GaussianNB(priors=np.array([.25, .25, .25, .25]))
|
||||
assert_raises(ValueError, clf.fit, X, y)
|
||||
|
||||
|
||||
def test_gnb_prior_greater_one():
|
||||
"""Test if an error is raised if the sum of prior greater than one"""
|
||||
clf = GaussianNB(priors=np.array([2., 1.]))
|
||||
assert_raises(ValueError, clf.fit, X, y)
|
||||
|
||||
|
||||
def test_gnb_prior_large_bias():
|
||||
"""Test if good prediction when class prior favor largely one class"""
|
||||
clf = GaussianNB(priors=np.array([0.01, 0.99]))
|
||||
clf.fit(X, y)
|
||||
assert clf.predict([[-0.1, -0.1]]) == np.array([2])
|
||||
|
||||
|
||||
def test_gnb_check_update_with_no_data():
|
||||
""" Test when the partial fit is called without any data"""
|
||||
# Create an empty array
|
||||
prev_points = 100
|
||||
mean = 0.
|
||||
var = 1.
|
||||
x_empty = np.empty((0, X.shape[1]))
|
||||
tmean, tvar = GaussianNB._update_mean_variance(prev_points, mean,
|
||||
var, x_empty)
|
||||
assert tmean == mean
|
||||
assert tvar == var
|
||||
|
||||
|
||||
def test_gnb_pfit_wrong_nb_features():
|
||||
"""Test whether an error is raised when the number of feature changes
|
||||
between two partial fit"""
|
||||
clf = GaussianNB()
|
||||
# Fit for the first time the GNB
|
||||
clf.fit(X, y)
|
||||
# Partial fit a second time with an incoherent X
|
||||
assert_raises(ValueError, clf.partial_fit, np.hstack((X, X)), y)
|
||||
|
||||
|
||||
def test_gnb_partial_fit():
|
||||
clf = GaussianNB().fit(X, y)
|
||||
clf_pf = GaussianNB().partial_fit(X, y, np.unique(y))
|
||||
assert_array_almost_equal(clf.theta_, clf_pf.theta_)
|
||||
assert_array_almost_equal(clf.sigma_, clf_pf.sigma_)
|
||||
assert_array_almost_equal(clf.class_prior_, clf_pf.class_prior_)
|
||||
|
||||
clf_pf2 = GaussianNB().partial_fit(X[0::2, :], y[0::2], np.unique(y))
|
||||
clf_pf2.partial_fit(X[1::2], y[1::2])
|
||||
assert_array_almost_equal(clf.theta_, clf_pf2.theta_)
|
||||
assert_array_almost_equal(clf.sigma_, clf_pf2.sigma_)
|
||||
assert_array_almost_equal(clf.class_prior_, clf_pf2.class_prior_)
|
||||
|
||||
|
||||
def test_gnb_naive_bayes_scale_invariance():
|
||||
# Scaling the data should not change the prediction results
|
||||
iris = load_iris()
|
||||
X, y = iris.data, iris.target
|
||||
labels = [GaussianNB().fit(f * X, y).predict(f * X)
|
||||
for f in [1E-10, 1, 1E10]]
|
||||
assert_array_equal(labels[0], labels[1])
|
||||
assert_array_equal(labels[1], labels[2])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("cls", [MultinomialNB, BernoulliNB, CategoricalNB])
|
||||
def test_discretenb_prior(cls):
|
||||
# Test whether class priors are properly set.
|
||||
clf = cls().fit(X2, y2)
|
||||
assert_array_almost_equal(np.log(np.array([2, 2, 2]) / 6.0),
|
||||
clf.class_log_prior_, 8)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("cls", [MultinomialNB, BernoulliNB, CategoricalNB])
|
||||
def test_discretenb_partial_fit(cls):
|
||||
clf1 = cls()
|
||||
clf1.fit([[0, 1], [1, 0], [1, 1]], [0, 1, 1])
|
||||
|
||||
clf2 = cls()
|
||||
clf2.partial_fit([[0, 1], [1, 0], [1, 1]], [0, 1, 1], classes=[0, 1])
|
||||
assert_array_equal(clf1.class_count_, clf2.class_count_)
|
||||
if cls is CategoricalNB:
|
||||
for i in range(len(clf1.category_count_)):
|
||||
assert_array_equal(clf1.category_count_[i],
|
||||
clf2.category_count_[i])
|
||||
else:
|
||||
assert_array_equal(clf1.feature_count_, clf2.feature_count_)
|
||||
|
||||
clf3 = cls()
|
||||
# all categories have to appear in the first partial fit
|
||||
clf3.partial_fit([[0, 1]], [0], classes=[0, 1])
|
||||
clf3.partial_fit([[1, 0]], [1])
|
||||
clf3.partial_fit([[1, 1]], [1])
|
||||
assert_array_equal(clf1.class_count_, clf3.class_count_)
|
||||
if cls is CategoricalNB:
|
||||
# the categories for each feature of CategoricalNB are mapped to an
|
||||
# index chronologically with each call of partial fit and therefore
|
||||
# the category_count matrices cannot be compared for equality
|
||||
for i in range(len(clf1.category_count_)):
|
||||
assert_array_equal(clf1.category_count_[i].shape,
|
||||
clf3.category_count_[i].shape)
|
||||
assert_array_equal(np.sum(clf1.category_count_[i], axis=1),
|
||||
np.sum(clf3.category_count_[i], axis=1))
|
||||
|
||||
# assert category 0 occurs 1x in the first class and 0x in the 2nd
|
||||
# class
|
||||
assert_array_equal(clf1.category_count_[0][0], np.array([1, 0]))
|
||||
# assert category 1 occurs 0x in the first class and 2x in the 2nd
|
||||
# class
|
||||
assert_array_equal(clf1.category_count_[0][1], np.array([0, 2]))
|
||||
|
||||
# assert category 0 occurs 0x in the first class and 1x in the 2nd
|
||||
# class
|
||||
assert_array_equal(clf1.category_count_[1][0], np.array([0, 1]))
|
||||
# assert category 1 occurs 1x in the first class and 1x in the 2nd
|
||||
# class
|
||||
assert_array_equal(clf1.category_count_[1][1], np.array([1, 1]))
|
||||
else:
|
||||
assert_array_equal(clf1.feature_count_, clf3.feature_count_)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB, GaussianNB,
|
||||
CategoricalNB])
|
||||
def test_discretenb_pickle(cls):
|
||||
# Test picklability of discrete naive Bayes classifiers
|
||||
|
||||
clf = cls().fit(X2, y2)
|
||||
y_pred = clf.predict(X2)
|
||||
|
||||
store = BytesIO()
|
||||
pickle.dump(clf, store)
|
||||
clf = pickle.load(BytesIO(store.getvalue()))
|
||||
|
||||
assert_array_equal(y_pred, clf.predict(X2))
|
||||
|
||||
# Test pickling of estimator trained with partial_fit
|
||||
clf2 = cls().partial_fit(X2[:3], y2[:3], classes=np.unique(y2))
|
||||
clf2.partial_fit(X2[3:], y2[3:])
|
||||
store = BytesIO()
|
||||
pickle.dump(clf2, store)
|
||||
clf2 = pickle.load(BytesIO(store.getvalue()))
|
||||
assert_array_equal(y_pred, clf2.predict(X2))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB, GaussianNB,
|
||||
CategoricalNB])
|
||||
def test_discretenb_input_check_fit(cls):
|
||||
# Test input checks for the fit method
|
||||
|
||||
# check shape consistency for number of samples at fit time
|
||||
assert_raises(ValueError, cls().fit, X2, y2[:-1])
|
||||
|
||||
# check shape consistency for number of input features at predict time
|
||||
clf = cls().fit(X2, y2)
|
||||
assert_raises(ValueError, clf.predict, X2[:, :-1])
|
||||
|
||||
|
||||
@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB, CategoricalNB])
|
||||
def test_discretenb_input_check_partial_fit(cls):
|
||||
# check shape consistency
|
||||
assert_raises(ValueError, cls().partial_fit, X2, y2[:-1],
|
||||
classes=np.unique(y2))
|
||||
|
||||
# classes is required for first call to partial fit
|
||||
assert_raises(ValueError, cls().partial_fit, X2, y2)
|
||||
|
||||
# check consistency of consecutive classes values
|
||||
clf = cls()
|
||||
clf.partial_fit(X2, y2, classes=np.unique(y2))
|
||||
assert_raises(ValueError, clf.partial_fit, X2, y2,
|
||||
classes=np.arange(42))
|
||||
|
||||
# check consistency of input shape for partial_fit
|
||||
assert_raises(ValueError, clf.partial_fit, X2[:, :-1], y2)
|
||||
|
||||
# check consistency of input shape for predict
|
||||
assert_raises(ValueError, clf.predict, X2[:, :-1])
|
||||
|
||||
|
||||
def test_discretenb_predict_proba():
|
||||
# Test discrete NB classes' probability scores
|
||||
|
||||
# The 100s below distinguish Bernoulli from multinomial.
|
||||
# FIXME: write a test to show this.
|
||||
X_bernoulli = [[1, 100, 0], [0, 1, 0], [0, 100, 1]]
|
||||
X_multinomial = [[0, 1], [1, 3], [4, 0]]
|
||||
|
||||
# test binary case (1-d output)
|
||||
y = [0, 0, 2] # 2 is regression test for binary case, 02e673
|
||||
for cls, X in zip([BernoulliNB, MultinomialNB],
|
||||
[X_bernoulli, X_multinomial]):
|
||||
clf = cls().fit(X, y)
|
||||
assert clf.predict(X[-1:]) == 2
|
||||
assert clf.predict_proba([X[0]]).shape == (1, 2)
|
||||
assert_array_almost_equal(clf.predict_proba(X[:2]).sum(axis=1),
|
||||
np.array([1., 1.]), 6)
|
||||
|
||||
# test multiclass case (2-d output, must sum to one)
|
||||
y = [0, 1, 2]
|
||||
for cls, X in zip([BernoulliNB, MultinomialNB],
|
||||
[X_bernoulli, X_multinomial]):
|
||||
clf = cls().fit(X, y)
|
||||
assert clf.predict_proba(X[0:1]).shape == (1, 3)
|
||||
assert clf.predict_proba(X[:2]).shape == (2, 3)
|
||||
assert_almost_equal(np.sum(clf.predict_proba([X[1]])), 1)
|
||||
assert_almost_equal(np.sum(clf.predict_proba([X[-1]])), 1)
|
||||
assert_almost_equal(np.sum(np.exp(clf.class_log_prior_)), 1)
|
||||
assert_almost_equal(np.sum(np.exp(clf.intercept_)), 1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB, CategoricalNB])
|
||||
def test_discretenb_uniform_prior(cls):
|
||||
# Test whether discrete NB classes fit a uniform prior
|
||||
# when fit_prior=False and class_prior=None
|
||||
|
||||
clf = cls()
|
||||
clf.set_params(fit_prior=False)
|
||||
clf.fit([[0], [0], [1]], [0, 0, 1])
|
||||
prior = np.exp(clf.class_log_prior_)
|
||||
assert_array_almost_equal(prior, np.array([.5, .5]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB, CategoricalNB])
|
||||
def test_discretenb_provide_prior(cls):
|
||||
# Test whether discrete NB classes use provided prior
|
||||
|
||||
clf = cls(class_prior=[0.5, 0.5])
|
||||
clf.fit([[0], [0], [1]], [0, 0, 1])
|
||||
prior = np.exp(clf.class_log_prior_)
|
||||
assert_array_almost_equal(prior, np.array([.5, .5]))
|
||||
|
||||
# Inconsistent number of classes with prior
|
||||
assert_raises(ValueError, clf.fit, [[0], [1], [2]], [0, 1, 2])
|
||||
assert_raises(ValueError, clf.partial_fit, [[0], [1]], [0, 1],
|
||||
classes=[0, 1, 1])
|
||||
|
||||
|
||||
@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB, CategoricalNB])
|
||||
def test_discretenb_provide_prior_with_partial_fit(cls):
|
||||
# Test whether discrete NB classes use provided prior
|
||||
# when using partial_fit
|
||||
|
||||
iris = load_iris()
|
||||
iris_data1, iris_data2, iris_target1, iris_target2 = train_test_split(
|
||||
iris.data, iris.target, test_size=0.4, random_state=415)
|
||||
|
||||
for prior in [None, [0.3, 0.3, 0.4]]:
|
||||
clf_full = cls(class_prior=prior)
|
||||
clf_full.fit(iris.data, iris.target)
|
||||
clf_partial = cls(class_prior=prior)
|
||||
clf_partial.partial_fit(iris_data1, iris_target1,
|
||||
classes=[0, 1, 2])
|
||||
clf_partial.partial_fit(iris_data2, iris_target2)
|
||||
assert_array_almost_equal(clf_full.class_log_prior_,
|
||||
clf_partial.class_log_prior_)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB, CategoricalNB])
|
||||
def test_discretenb_sample_weight_multiclass(cls):
|
||||
# check shape consistency for number of samples at fit time
|
||||
X = [
|
||||
[0, 0, 1],
|
||||
[0, 1, 1],
|
||||
[0, 1, 1],
|
||||
[1, 0, 0],
|
||||
]
|
||||
y = [0, 0, 1, 2]
|
||||
sample_weight = np.array([1, 1, 2, 2], dtype=np.float64)
|
||||
sample_weight /= sample_weight.sum()
|
||||
clf = cls().fit(X, y, sample_weight=sample_weight)
|
||||
assert_array_equal(clf.predict(X), [0, 1, 1, 2])
|
||||
|
||||
# Check sample weight using the partial_fit method
|
||||
clf = cls()
|
||||
clf.partial_fit(X[:2], y[:2], classes=[0, 1, 2],
|
||||
sample_weight=sample_weight[:2])
|
||||
clf.partial_fit(X[2:3], y[2:3], sample_weight=sample_weight[2:3])
|
||||
clf.partial_fit(X[3:], y[3:], sample_weight=sample_weight[3:])
|
||||
assert_array_equal(clf.predict(X), [0, 1, 1, 2])
|
||||
|
||||
|
||||
@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB])
|
||||
def test_discretenb_coef_intercept_shape(cls):
|
||||
# coef_ and intercept_ should have shapes as in other linear models.
|
||||
# Non-regression test for issue #2127.
|
||||
X = [[1, 0, 0], [1, 1, 1]]
|
||||
y = [1, 2] # binary classification
|
||||
clf = cls()
|
||||
|
||||
clf.fit(X, y)
|
||||
assert clf.coef_.shape == (1, 3)
|
||||
assert clf.intercept_.shape == (1,)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('kind', ('dense', 'sparse'))
|
||||
def test_mnnb(kind):
|
||||
# Test Multinomial Naive Bayes classification.
|
||||
# This checks that MultinomialNB implements fit and predict and returns
|
||||
# correct values for a simple toy dataset.
|
||||
|
||||
if kind == 'dense':
|
||||
X = X2
|
||||
elif kind == 'sparse':
|
||||
X = scipy.sparse.csr_matrix(X2)
|
||||
|
||||
# Check the ability to predict the learning set.
|
||||
clf = MultinomialNB()
|
||||
assert_raises(ValueError, clf.fit, -X, y2)
|
||||
y_pred = clf.fit(X, y2).predict(X)
|
||||
|
||||
assert_array_equal(y_pred, y2)
|
||||
|
||||
# Verify that np.log(clf.predict_proba(X)) gives the same results as
|
||||
# clf.predict_log_proba(X)
|
||||
y_pred_proba = clf.predict_proba(X)
|
||||
y_pred_log_proba = clf.predict_log_proba(X)
|
||||
assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)
|
||||
|
||||
# Check that incremental fitting yields the same results
|
||||
clf2 = MultinomialNB()
|
||||
clf2.partial_fit(X[:2], y2[:2], classes=np.unique(y2))
|
||||
clf2.partial_fit(X[2:5], y2[2:5])
|
||||
clf2.partial_fit(X[5:], y2[5:])
|
||||
|
||||
y_pred2 = clf2.predict(X)
|
||||
assert_array_equal(y_pred2, y2)
|
||||
|
||||
y_pred_proba2 = clf2.predict_proba(X)
|
||||
y_pred_log_proba2 = clf2.predict_log_proba(X)
|
||||
assert_array_almost_equal(np.log(y_pred_proba2), y_pred_log_proba2, 8)
|
||||
assert_array_almost_equal(y_pred_proba2, y_pred_proba)
|
||||
assert_array_almost_equal(y_pred_log_proba2, y_pred_log_proba)
|
||||
|
||||
# Partial fit on the whole data at once should be the same as fit too
|
||||
clf3 = MultinomialNB()
|
||||
clf3.partial_fit(X, y2, classes=np.unique(y2))
|
||||
|
||||
y_pred3 = clf3.predict(X)
|
||||
assert_array_equal(y_pred3, y2)
|
||||
y_pred_proba3 = clf3.predict_proba(X)
|
||||
y_pred_log_proba3 = clf3.predict_log_proba(X)
|
||||
assert_array_almost_equal(np.log(y_pred_proba3), y_pred_log_proba3, 8)
|
||||
assert_array_almost_equal(y_pred_proba3, y_pred_proba)
|
||||
assert_array_almost_equal(y_pred_log_proba3, y_pred_log_proba)
|
||||
|
||||
|
||||
def test_mnb_prior_unobserved_targets():
|
||||
# test smoothing of prior for yet unobserved targets
|
||||
|
||||
# Create toy training data
|
||||
X = np.array([[0, 1], [1, 0]])
|
||||
y = np.array([0, 1])
|
||||
|
||||
clf = MultinomialNB()
|
||||
|
||||
assert_no_warnings(
|
||||
clf.partial_fit, X, y, classes=[0, 1, 2]
|
||||
)
|
||||
|
||||
assert clf.predict([[0, 1]]) == 0
|
||||
assert clf.predict([[1, 0]]) == 1
|
||||
assert clf.predict([[1, 1]]) == 0
|
||||
|
||||
# add a training example with previously unobserved class
|
||||
assert_no_warnings(
|
||||
clf.partial_fit, [[1, 1]], [2]
|
||||
)
|
||||
|
||||
assert clf.predict([[0, 1]]) == 0
|
||||
assert clf.predict([[1, 0]]) == 1
|
||||
assert clf.predict([[1, 1]]) == 2
|
||||
|
||||
|
||||
def test_mnb_sample_weight():
|
||||
clf = MultinomialNB()
|
||||
clf.fit([[1, 2], [1, 2], [1, 0]],
|
||||
[0, 0, 1],
|
||||
sample_weight=[1, 1, 4])
|
||||
assert_array_equal(clf.predict([[1, 0]]), [1])
|
||||
positive_prior = np.exp(clf.intercept_[0])
|
||||
assert_array_almost_equal([1 - positive_prior, positive_prior],
|
||||
[1 / 3., 2 / 3.])
|
||||
|
||||
|
||||
def test_bnb():
|
||||
# Tests that BernoulliNB when alpha=1.0 gives the same values as
|
||||
# those given for the toy example in Manning, Raghavan, and
|
||||
# Schuetze's "Introduction to Information Retrieval" book:
|
||||
# https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
|
||||
|
||||
# Training data points are:
|
||||
# Chinese Beijing Chinese (class: China)
|
||||
# Chinese Chinese Shanghai (class: China)
|
||||
# Chinese Macao (class: China)
|
||||
# Tokyo Japan Chinese (class: Japan)
|
||||
|
||||
# Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo
|
||||
X = np.array([[1, 1, 0, 0, 0, 0],
|
||||
[0, 1, 0, 0, 1, 0],
|
||||
[0, 1, 0, 1, 0, 0],
|
||||
[0, 1, 1, 0, 0, 1]])
|
||||
|
||||
# Classes are China (0), Japan (1)
|
||||
Y = np.array([0, 0, 0, 1])
|
||||
|
||||
# Fit BernoulliNB w/ alpha = 1.0
|
||||
clf = BernoulliNB(alpha=1.0)
|
||||
clf.fit(X, Y)
|
||||
|
||||
# Check the class prior is correct
|
||||
class_prior = np.array([0.75, 0.25])
|
||||
assert_array_almost_equal(np.exp(clf.class_log_prior_), class_prior)
|
||||
|
||||
# Check the feature probabilities are correct
|
||||
feature_prob = np.array([[0.4, 0.8, 0.2, 0.4, 0.4, 0.2],
|
||||
[1 / 3.0, 2 / 3.0, 2 / 3.0, 1 / 3.0, 1 / 3.0,
|
||||
2 / 3.0]])
|
||||
assert_array_almost_equal(np.exp(clf.feature_log_prob_), feature_prob)
|
||||
|
||||
# Testing data point is:
|
||||
# Chinese Chinese Chinese Tokyo Japan
|
||||
X_test = np.array([[0, 1, 1, 0, 0, 1]])
|
||||
|
||||
# Check the predictive probabilities are correct
|
||||
unnorm_predict_proba = np.array([[0.005183999999999999,
|
||||
0.02194787379972565]])
|
||||
predict_proba = unnorm_predict_proba / np.sum(unnorm_predict_proba)
|
||||
assert_array_almost_equal(clf.predict_proba(X_test), predict_proba)
|
||||
|
||||
|
||||
def test_bnb_feature_log_prob():
|
||||
# Test for issue #4268.
|
||||
# Tests that the feature log prob value computed by BernoulliNB when
|
||||
# alpha=1.0 is equal to the expression given in Manning, Raghavan,
|
||||
# and Schuetze's "Introduction to Information Retrieval" book:
|
||||
# http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
|
||||
|
||||
X = np.array([[0, 0, 0], [1, 1, 0], [0, 1, 0], [1, 0, 1], [0, 1, 0]])
|
||||
Y = np.array([0, 0, 1, 2, 2])
|
||||
|
||||
# Fit Bernoulli NB w/ alpha = 1.0
|
||||
clf = BernoulliNB(alpha=1.0)
|
||||
clf.fit(X, Y)
|
||||
|
||||
# Manually form the (log) numerator and denominator that
|
||||
# constitute P(feature presence | class)
|
||||
num = np.log(clf.feature_count_ + 1.0)
|
||||
denom = np.tile(np.log(clf.class_count_ + 2.0), (X.shape[1], 1)).T
|
||||
|
||||
# Check manual estimate matches
|
||||
assert_array_almost_equal(clf.feature_log_prob_, (num - denom))
|
||||
|
||||
|
||||
def test_cnb():
|
||||
# Tests ComplementNB when alpha=1.0 for the toy example in Manning,
|
||||
# Raghavan, and Schuetze's "Introduction to Information Retrieval" book:
|
||||
# https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
|
||||
|
||||
# Training data points are:
|
||||
# Chinese Beijing Chinese (class: China)
|
||||
# Chinese Chinese Shanghai (class: China)
|
||||
# Chinese Macao (class: China)
|
||||
# Tokyo Japan Chinese (class: Japan)
|
||||
|
||||
# Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo.
|
||||
X = np.array([[1, 1, 0, 0, 0, 0],
|
||||
[0, 1, 0, 0, 1, 0],
|
||||
[0, 1, 0, 1, 0, 0],
|
||||
[0, 1, 1, 0, 0, 1]])
|
||||
|
||||
# Classes are China (0), Japan (1).
|
||||
Y = np.array([0, 0, 0, 1])
|
||||
|
||||
# Check that weights are correct. See steps 4-6 in Table 4 of
|
||||
# Rennie et al. (2003).
|
||||
theta = np.array([
|
||||
[
|
||||
(0 + 1) / (3 + 6),
|
||||
(1 + 1) / (3 + 6),
|
||||
(1 + 1) / (3 + 6),
|
||||
(0 + 1) / (3 + 6),
|
||||
(0 + 1) / (3 + 6),
|
||||
(1 + 1) / (3 + 6)
|
||||
],
|
||||
[
|
||||
(1 + 1) / (6 + 6),
|
||||
(3 + 1) / (6 + 6),
|
||||
(0 + 1) / (6 + 6),
|
||||
(1 + 1) / (6 + 6),
|
||||
(1 + 1) / (6 + 6),
|
||||
(0 + 1) / (6 + 6)
|
||||
]])
|
||||
|
||||
weights = np.zeros(theta.shape)
|
||||
normed_weights = np.zeros(theta.shape)
|
||||
for i in range(2):
|
||||
weights[i] = -np.log(theta[i])
|
||||
normed_weights[i] = weights[i] / weights[i].sum()
|
||||
|
||||
# Verify inputs are nonnegative.
|
||||
clf = ComplementNB(alpha=1.0)
|
||||
assert_raises(ValueError, clf.fit, -X, Y)
|
||||
|
||||
clf.fit(X, Y)
|
||||
|
||||
# Check that counts/weights are correct.
|
||||
feature_count = np.array([[1, 3, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1]])
|
||||
assert_array_equal(clf.feature_count_, feature_count)
|
||||
class_count = np.array([3, 1])
|
||||
assert_array_equal(clf.class_count_, class_count)
|
||||
feature_all = np.array([1, 4, 1, 1, 1, 1])
|
||||
assert_array_equal(clf.feature_all_, feature_all)
|
||||
assert_array_almost_equal(clf.feature_log_prob_, weights)
|
||||
|
||||
clf = ComplementNB(alpha=1.0, norm=True)
|
||||
clf.fit(X, Y)
|
||||
assert_array_almost_equal(clf.feature_log_prob_, normed_weights)
|
||||
|
||||
|
||||
def test_categoricalnb():
|
||||
# Check the ability to predict the training set.
|
||||
clf = CategoricalNB()
|
||||
y_pred = clf.fit(X2, y2).predict(X2)
|
||||
assert_array_equal(y_pred, y2)
|
||||
|
||||
X3 = np.array([[1, 4], [2, 5]])
|
||||
y3 = np.array([1, 2])
|
||||
clf = CategoricalNB(alpha=1, fit_prior=False)
|
||||
|
||||
clf.fit(X3, y3)
|
||||
|
||||
# Check error is raised for X with negative entries
|
||||
X = np.array([[0, -1]])
|
||||
y = np.array([1])
|
||||
error_msg = "Negative values in data passed to CategoricalNB (input X)"
|
||||
assert_raise_message(ValueError, error_msg, clf.predict, X)
|
||||
assert_raise_message(ValueError, error_msg, clf.fit, X, y)
|
||||
|
||||
# Check error is raised for incorrect X
|
||||
X = np.array([[1, 4, 1], [2, 5, 6]])
|
||||
msg = "Expected input with 2 features, got 3 instead"
|
||||
assert_raise_message(ValueError, msg, clf.predict, X)
|
||||
|
||||
# Test alpha
|
||||
X3_test = np.array([[2, 5]])
|
||||
# alpha=1 increases the count of all categories by one so the final
|
||||
# probability for each category is not 50/50 but 1/3 to 2/3
|
||||
bayes_numerator = np.array([[1/3*1/3, 2/3*2/3]])
|
||||
bayes_denominator = bayes_numerator.sum()
|
||||
assert_array_almost_equal(clf.predict_proba(X3_test),
|
||||
bayes_numerator / bayes_denominator)
|
||||
|
||||
# Assert category_count has counted all features
|
||||
assert len(clf.category_count_) == X3.shape[1]
|
||||
|
||||
# Check sample_weight
|
||||
X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
|
||||
y = np.array([1, 1, 2, 2])
|
||||
clf = CategoricalNB(alpha=1, fit_prior=False)
|
||||
clf.fit(X, y)
|
||||
assert_array_equal(clf.predict(np.array([[0, 0]])), np.array([1]))
|
||||
|
||||
for factor in [1., 0.3, 5, 0.0001]:
|
||||
X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
|
||||
y = np.array([1, 1, 2, 2])
|
||||
sample_weight = np.array([1, 1, 10, 0.1]) * factor
|
||||
clf = CategoricalNB(alpha=1, fit_prior=False)
|
||||
clf.fit(X, y, sample_weight=sample_weight)
|
||||
assert_array_equal(clf.predict(np.array([[0, 0]])), np.array([2]))
|
||||
|
||||
|
||||
def test_alpha():
|
||||
# Setting alpha=0 should not output nan results when p(x_i|y_j)=0 is a case
|
||||
X = np.array([[1, 0], [1, 1]])
|
||||
y = np.array([0, 1])
|
||||
nb = BernoulliNB(alpha=0.)
|
||||
assert_warns(UserWarning, nb.partial_fit, X, y, classes=[0, 1])
|
||||
assert_warns(UserWarning, nb.fit, X, y)
|
||||
prob = np.array([[1, 0], [0, 1]])
|
||||
assert_array_almost_equal(nb.predict_proba(X), prob)
|
||||
|
||||
nb = MultinomialNB(alpha=0.)
|
||||
assert_warns(UserWarning, nb.partial_fit, X, y, classes=[0, 1])
|
||||
assert_warns(UserWarning, nb.fit, X, y)
|
||||
prob = np.array([[2. / 3, 1. / 3], [0, 1]])
|
||||
assert_array_almost_equal(nb.predict_proba(X), prob)
|
||||
|
||||
nb = CategoricalNB(alpha=0.)
|
||||
assert_warns(UserWarning, nb.fit, X, y)
|
||||
prob = np.array([[1., 0.], [0., 1.]])
|
||||
assert_array_almost_equal(nb.predict_proba(X), prob)
|
||||
|
||||
# Test sparse X
|
||||
X = scipy.sparse.csr_matrix(X)
|
||||
nb = BernoulliNB(alpha=0.)
|
||||
assert_warns(UserWarning, nb.fit, X, y)
|
||||
prob = np.array([[1, 0], [0, 1]])
|
||||
assert_array_almost_equal(nb.predict_proba(X), prob)
|
||||
|
||||
nb = MultinomialNB(alpha=0.)
|
||||
assert_warns(UserWarning, nb.fit, X, y)
|
||||
prob = np.array([[2. / 3, 1. / 3], [0, 1]])
|
||||
assert_array_almost_equal(nb.predict_proba(X), prob)
|
||||
|
||||
# Test for alpha < 0
|
||||
X = np.array([[1, 0], [1, 1]])
|
||||
y = np.array([0, 1])
|
||||
expected_msg = ('Smoothing parameter alpha = -1.0e-01. '
|
||||
'alpha should be > 0.')
|
||||
b_nb = BernoulliNB(alpha=-0.1)
|
||||
m_nb = MultinomialNB(alpha=-0.1)
|
||||
c_nb = CategoricalNB(alpha=-0.1)
|
||||
assert_raise_message(ValueError, expected_msg, b_nb.fit, X, y)
|
||||
assert_raise_message(ValueError, expected_msg, m_nb.fit, X, y)
|
||||
assert_raise_message(ValueError, expected_msg, c_nb.fit, X, y)
|
||||
|
||||
b_nb = BernoulliNB(alpha=-0.1)
|
||||
m_nb = MultinomialNB(alpha=-0.1)
|
||||
assert_raise_message(ValueError, expected_msg, b_nb.partial_fit,
|
||||
X, y, classes=[0, 1])
|
||||
assert_raise_message(ValueError, expected_msg, m_nb.partial_fit,
|
||||
X, y, classes=[0, 1])
|
||||
|
||||
|
||||
def test_alpha_vector():
    X = np.array([[1, 0], [1, 1]])
    y = np.array([0, 1])

    # Setting alpha=np.array with same length
    # as number of features should be fine
    alpha = np.array([1, 2])
    nb = MultinomialNB(alpha=alpha)
    nb.partial_fit(X, y, classes=[0, 1])

    # Test that feature probabilities use the pseudo-counts (alpha)
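    # class 0 has raw counts [1, 0] -> smoothed [1 + 1, 0 + 2] = [2, 2] -> [1/2, 1/2]
    # class 1 has raw counts [1, 1] -> smoothed [1 + 1, 1 + 2] = [2, 3] -> [2/5, 3/5]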
    feature_prob = np.array([[1 / 2, 1 / 2], [2 / 5, 3 / 5]])
    assert_array_almost_equal(nb.feature_log_prob_, np.log(feature_prob))

    # Test predictions
    prob = np.array([[5 / 9, 4 / 9], [25 / 49, 24 / 49]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    # Test alpha non-negative
    alpha = np.array([1., -0.1])
    expected_msg = ('Smoothing parameter alpha = -1.0e-01. '
                    'alpha should be > 0.')
    m_nb = MultinomialNB(alpha=alpha)
    assert_raise_message(ValueError, expected_msg, m_nb.fit, X, y)

    # Test that too small pseudo-counts are replaced
    ALPHA_MIN = 1e-10
    alpha = np.array([ALPHA_MIN / 2, 0.5])
    m_nb = MultinomialNB(alpha=alpha)
    m_nb.partial_fit(X, y, classes=[0, 1])
    assert_array_almost_equal(m_nb._check_alpha(),
                              [ALPHA_MIN, 0.5],
                              decimal=12)

    # Test correct dimensions
    alpha = np.array([1., 2., 3.])
    m_nb = MultinomialNB(alpha=alpha)
    expected_msg = ('alpha should be a scalar or a numpy array '
                    'with shape [n_features]')
    assert_raise_message(ValueError, expected_msg, m_nb.fit, X, y)


def test_check_accuracy_on_digits():
    # Non-regression test to make sure that any further refactoring /
    # optimization of the NB models does not harm the performance on a
    # slightly non-linearly separable dataset
    X, y = load_digits(return_X_y=True)
    binary_3v8 = np.logical_or(y == 3, y == 8)
    X_3v8, y_3v8 = X[binary_3v8], y[binary_3v8]

    # Multinomial NB
    scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10)
    assert scores.mean() > 0.86

    scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10)
    assert scores.mean() > 0.94

    # Bernoulli NB
    scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10)
    assert scores.mean() > 0.83

    scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10)
    assert scores.mean() > 0.92

    # Gaussian NB
    scores = cross_val_score(GaussianNB(), X, y, cv=10)
    assert scores.mean() > 0.77

    scores = cross_val_score(GaussianNB(var_smoothing=0.1), X, y, cv=10)
    assert scores.mean() > 0.89

    scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10)
    assert scores.mean() > 0.86


# TODO: remove in 0.24
def test_deprecations():

    class A(BaseNB, GaussianNB):
        pass

    class B(BaseDiscreteNB, CategoricalNB):
        pass

    with pytest.warns(FutureWarning, match="is deprecated in version 0.22"):
        A()

    with pytest.warns(FutureWarning, match="is deprecated in version 0.22"):
        B()
1245
venv/Lib/site-packages/sklearn/tests/test_pipeline.py
Normal file
1245
venv/Lib/site-packages/sklearn/tests/test_pipeline.py
Normal file
File diff suppressed because it is too large
366
venv/Lib/site-packages/sklearn/tests/test_random_projection.py
Normal file
366
venv/Lib/site-packages/sklearn/tests/test_random_projection.py
Normal file
@ -0,0 +1,366 @@
import functools
from typing import List, Any

import numpy as np
import scipy.sparse as sp
import pytest

from sklearn.metrics import euclidean_distances

from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn.random_projection import _gaussian_random_matrix
from sklearn.random_projection import gaussian_random_matrix
from sklearn.random_projection import _sparse_random_matrix
from sklearn.random_projection import sparse_random_matrix
from sklearn.random_projection import SparseRandomProjection
from sklearn.random_projection import GaussianRandomProjection

from sklearn.utils._testing import assert_raises
from sklearn.utils._testing import assert_raise_message
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_warns
from sklearn.exceptions import DataDimensionalityWarning

all_sparse_random_matrix: List[Any] = [_sparse_random_matrix]
all_dense_random_matrix: List[Any] = [_gaussian_random_matrix]
all_random_matrix = all_sparse_random_matrix + all_dense_random_matrix

all_SparseRandomProjection: List[Any] = [SparseRandomProjection]
all_DenseRandomProjection: List[Any] = [GaussianRandomProjection]
all_RandomProjection = set(all_SparseRandomProjection +
                           all_DenseRandomProjection)


# Make some random data with uniformly located non zero entries with
# Gaussian distributed values
def make_sparse_random_data(n_samples, n_features, n_nonzeros):
    rng = np.random.RandomState(0)
    data_coo = sp.coo_matrix(
        (rng.randn(n_nonzeros),
         (rng.randint(n_samples, size=n_nonzeros),
          rng.randint(n_features, size=n_nonzeros))),
        shape=(n_samples, n_features))
    return data_coo.toarray(), data_coo.tocsr()


def densify(matrix):
    if not sp.issparse(matrix):
        return matrix
    else:
        return matrix.toarray()


n_samples, n_features = (10, 1000)
n_nonzeros = int(n_samples * n_features / 100.)
data, data_csr = make_sparse_random_data(n_samples, n_features, n_nonzeros)


###############################################################################
# test on JL lemma
###############################################################################
def test_invalid_jl_domain():
    assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, eps=1.1)
    assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, eps=0.0)
    assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, eps=-0.1)
    assert_raises(ValueError, johnson_lindenstrauss_min_dim, 0, eps=0.5)


def test_input_size_jl_min_dim():
    assert_raises(ValueError, johnson_lindenstrauss_min_dim,
                  3 * [100], eps=2 * [0.9])

    assert_raises(ValueError, johnson_lindenstrauss_min_dim, 3 * [100],
                  eps=2 * [0.9])

    johnson_lindenstrauss_min_dim(np.random.randint(1, 10, size=(10, 10)),
                                  eps=np.full((10, 10), 0.5))


###############################################################################
# tests random matrix generation
###############################################################################
def check_input_size_random_matrix(random_matrix):
    assert_raises(ValueError, random_matrix, 0, 0)
    assert_raises(ValueError, random_matrix, -1, 1)
    assert_raises(ValueError, random_matrix, 1, -1)
    assert_raises(ValueError, random_matrix, 1, 0)
    assert_raises(ValueError, random_matrix, -1, 0)


def check_size_generated(random_matrix):
    assert random_matrix(1, 5).shape == (1, 5)
    assert random_matrix(5, 1).shape == (5, 1)
    assert random_matrix(5, 5).shape == (5, 5)
    assert random_matrix(1, 1).shape == (1, 1)


def check_zero_mean_and_unit_norm(random_matrix):
    # All random matrices should produce a transformation matrix
    # with zero mean and unit norm for each column

    A = densify(random_matrix(10000, 1, random_state=0))

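    # every entry of the generated matrix has variance 1 / n_components, so
    # the single column of this 10000 x 1 matrix has squared norm
    # concentrating around 10000 * (1 / 10000) = 1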
    assert_array_almost_equal(0, np.mean(A), 3)
    assert_array_almost_equal(1.0, np.linalg.norm(A), 1)


def check_input_with_sparse_random_matrix(random_matrix):
    n_components, n_features = 5, 10

    for density in [-1., 0.0, 1.1]:
        assert_raises(ValueError,
                      random_matrix, n_components, n_features, density=density)


@pytest.mark.parametrize("random_matrix", all_random_matrix)
def test_basic_property_of_random_matrix(random_matrix):
    # Check basic properties of random matrix generation
    check_input_size_random_matrix(random_matrix)
    check_size_generated(random_matrix)
    check_zero_mean_and_unit_norm(random_matrix)


@pytest.mark.parametrize("random_matrix", all_sparse_random_matrix)
def test_basic_property_of_sparse_random_matrix(random_matrix):
    check_input_with_sparse_random_matrix(random_matrix)

    random_matrix_dense = functools.partial(random_matrix, density=1.0)

    check_zero_mean_and_unit_norm(random_matrix_dense)


def test_gaussian_random_matrix():
    # Check some statistical properties of the Gaussian random matrix.
    # Check that the random matrix follows the proper distribution.
    # Let's say that each element a_{ij} of A is taken from
    # a_ij ~ N(0.0, 1 / n_components).
    #
    n_components = 100
    n_features = 1000
    A = _gaussian_random_matrix(n_components, n_features, random_state=0)

    assert_array_almost_equal(0.0, np.mean(A), 2)
    assert_array_almost_equal(np.var(A, ddof=1), 1 / n_components, 1)


def test_sparse_random_matrix():
    # Check some statistical properties of the sparse random matrix
    n_components = 100
    n_features = 500

    for density in [0.3, 1.]:
        s = 1 / density

        A = _sparse_random_matrix(n_components,
                                  n_features,
                                  density=density,
                                  random_state=0)
        A = densify(A)

        # Check possible values
        values = np.unique(A)
        assert np.sqrt(s) / np.sqrt(n_components) in values
        assert - np.sqrt(s) / np.sqrt(n_components) in values

        if density == 1.0:
            assert np.size(values) == 2
        else:
            assert 0. in values
            assert np.size(values) == 3

        # Check that the random matrix follows the proper distribution.
        # Let's say that each element a_{ij} of A is taken from
        #
        # - -sqrt(s) / sqrt(n_components)   with probability 1 / 2s
        # -  0                              with probability 1 - 1 / s
        # - +sqrt(s) / sqrt(n_components)   with probability 1 / 2s
        #
        assert_almost_equal(np.mean(A == 0.0),
                            1 - 1 / s, decimal=2)
        assert_almost_equal(np.mean(A == np.sqrt(s) / np.sqrt(n_components)),
                            1 / (2 * s), decimal=2)
        assert_almost_equal(np.mean(A == - np.sqrt(s) / np.sqrt(n_components)),
                            1 / (2 * s), decimal=2)

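        # each comparison below yields a Bernoulli indicator with success
        # probability p, whose variance is p * (1 - p)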
        assert_almost_equal(np.var(A == 0.0, ddof=1),
                            (1 - 1 / s) * 1 / s, decimal=2)
        assert_almost_equal(np.var(A == np.sqrt(s) / np.sqrt(n_components),
                                   ddof=1),
                            (1 - 1 / (2 * s)) * 1 / (2 * s), decimal=2)
        assert_almost_equal(np.var(A == - np.sqrt(s) / np.sqrt(n_components),
                                   ddof=1),
                            (1 - 1 / (2 * s)) * 1 / (2 * s), decimal=2)


###############################################################################
# tests on random projection transformer
###############################################################################
def test_sparse_random_projection_transformer_invalid_density():
    for RandomProjection in all_SparseRandomProjection:
        assert_raises(ValueError,
                      RandomProjection(density=1.1).fit, data)

        assert_raises(ValueError,
                      RandomProjection(density=0).fit, data)

        assert_raises(ValueError,
                      RandomProjection(density=-0.1).fit, data)


def test_random_projection_transformer_invalid_input():
    for RandomProjection in all_RandomProjection:
        assert_raises(ValueError,
                      RandomProjection(n_components='auto').fit, [[0, 1, 2]])

        assert_raises(ValueError,
                      RandomProjection(n_components=-10).fit, data)


def test_try_to_transform_before_fit():
    for RandomProjection in all_RandomProjection:
        assert_raises(ValueError,
                      RandomProjection(n_components='auto').transform, data)


def test_too_many_samples_to_find_a_safe_embedding():
    data, _ = make_sparse_random_data(1000, 100, 1000)

    for RandomProjection in all_RandomProjection:
        rp = RandomProjection(n_components='auto', eps=0.1)
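        # the JL bound 4 * log(n_samples) / (eps ** 2 / 2 - eps ** 3 / 3)
        # gives roughly 5920 components for n_samples=1000 and eps=0.1,
        # far more than the 100 original features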
        expected_msg = (
            'eps=0.100000 and n_samples=1000 lead to a target dimension'
            ' of 5920 which is larger than the original space with'
            ' n_features=100')
        assert_raise_message(ValueError, expected_msg, rp.fit, data)


def test_random_projection_embedding_quality():
    data, _ = make_sparse_random_data(8, 5000, 15000)
    eps = 0.2

    original_distances = euclidean_distances(data, squared=True)
    original_distances = original_distances.ravel()
    non_identical = original_distances != 0.0

    # remove 0 distances to avoid division by 0
    original_distances = original_distances[non_identical]

    for RandomProjection in all_RandomProjection:
        rp = RandomProjection(n_components='auto', eps=eps, random_state=0)
        projected = rp.fit_transform(data)

        projected_distances = euclidean_distances(projected, squared=True)
        projected_distances = projected_distances.ravel()

        # remove 0 distances to avoid division by 0
        projected_distances = projected_distances[non_identical]

        distances_ratio = projected_distances / original_distances

        # check that the automatically tuned values for the density respect
        # the contract for eps: pairwise distances are preserved according
        # to the Johnson-Lindenstrauss lemma
        assert distances_ratio.max() < 1 + eps
        assert 1 - eps < distances_ratio.min()


def test_SparseRandomProjection_output_representation():
    for SparseRandomProjection in all_SparseRandomProjection:
        # when using sparse input, the projected data can be forced to be a
        # dense numpy array
        rp = SparseRandomProjection(n_components=10, dense_output=True,
                                    random_state=0)
        rp.fit(data)
        assert isinstance(rp.transform(data), np.ndarray)

        sparse_data = sp.csr_matrix(data)
        assert isinstance(rp.transform(sparse_data), np.ndarray)

        # the output can be left as a sparse matrix instead
        rp = SparseRandomProjection(n_components=10, dense_output=False,
                                    random_state=0)
        rp = rp.fit(data)
        # output for dense input will stay dense:
        assert isinstance(rp.transform(data), np.ndarray)

        # output for sparse input will be sparse:
        assert sp.issparse(rp.transform(sparse_data))


def test_correct_RandomProjection_dimensions_embedding():
    for RandomProjection in all_RandomProjection:
        rp = RandomProjection(n_components='auto',
                              random_state=0,
                              eps=0.5).fit(data)

        # the number of components is adjusted from the shape of the training
        # set
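        # (johnson_lindenstrauss_min_dim with n_samples=10 and eps=0.5
        #  evaluates to 110, which is what n_components_ should report)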
        assert rp.n_components == 'auto'
        assert rp.n_components_ == 110

        if RandomProjection in all_SparseRandomProjection:
            assert rp.density == 'auto'
            assert_almost_equal(rp.density_, 0.03, 2)

        assert rp.components_.shape == (110, n_features)

        projected_1 = rp.transform(data)
        assert projected_1.shape == (n_samples, 110)

        # once the RP is 'fitted' the projection is always the same
        projected_2 = rp.transform(data)
        assert_array_equal(projected_1, projected_2)

        # fit transform with same random seed will lead to the same results
        rp2 = RandomProjection(random_state=0, eps=0.5)
        projected_3 = rp2.fit_transform(data)
        assert_array_equal(projected_1, projected_3)

        # Try to transform with an input X of size different from fitted.
        assert_raises(ValueError, rp.transform, data[:, 1:5])

        # it is also possible to fix the number of components and the density
        # level
        if RandomProjection in all_SparseRandomProjection:
            rp = RandomProjection(n_components=100, density=0.001,
                                  random_state=0)
            projected = rp.fit_transform(data)
            assert projected.shape == (n_samples, 100)
            assert rp.components_.shape == (100, n_features)
            assert rp.components_.nnz < 115  # close to 0.1% density
            assert 85 < rp.components_.nnz  # close to 0.1% density


def test_warning_n_components_greater_than_n_features():
    n_features = 20
    data, _ = make_sparse_random_data(5, n_features, int(n_features / 4))

    for RandomProjection in all_RandomProjection:
        assert_warns(DataDimensionalityWarning,
                     RandomProjection(n_components=n_features + 1).fit, data)


def test_works_with_sparse_data():
    n_features = 20
    data, _ = make_sparse_random_data(5, n_features, int(n_features / 4))

    for RandomProjection in all_RandomProjection:
        rp_dense = RandomProjection(n_components=3,
                                    random_state=1).fit(data)
        rp_sparse = RandomProjection(n_components=3,
                                     random_state=1).fit(sp.csr_matrix(data))
        assert_array_almost_equal(densify(rp_dense.components_),
                                  densify(rp_sparse.components_))


# TODO remove in 0.24
def test_deprecations():

    with pytest.warns(FutureWarning, match="deprecated in 0.22"):
        gaussian_random_matrix(10, 100)

    with pytest.warns(FutureWarning, match="deprecated in 0.22"):
        sparse_random_matrix(10, 100)