Uploaded Test files
This commit is contained in:
parent
f584ad9d97
commit
2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
902
venv/Lib/site-packages/sklearn/ensemble/tests/test_bagging.py
Normal file
902
venv/Lib/site-packages/sklearn/ensemble/tests/test_bagging.py
Normal file
|
@ -0,0 +1,902 @@
|
|||
"""
|
||||
Testing for the bagging ensemble module (sklearn.ensemble.bagging).
|
||||
"""
|
||||
|
||||
# Author: Gilles Louppe
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numpy as np
|
||||
import joblib
|
||||
|
||||
from sklearn.base import BaseEstimator
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import assert_raises
|
||||
from sklearn.utils._testing import assert_warns
|
||||
from sklearn.utils._testing import assert_warns_message
|
||||
from sklearn.utils._testing import assert_raise_message
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
|
||||
from sklearn.dummy import DummyClassifier, DummyRegressor
|
||||
from sklearn.model_selection import GridSearchCV, ParameterGrid
|
||||
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
|
||||
from sklearn.linear_model import Perceptron, LogisticRegression
|
||||
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
from sklearn.svm import SVC, SVR
|
||||
from sklearn.random_projection import SparseRandomProjection
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.feature_selection import SelectKBest
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.preprocessing import FunctionTransformer
|
||||
|
||||
from scipy.sparse import csc_matrix, csr_matrix
|
||||
|
||||
rng = check_random_state(0)
|
||||
|
||||
# also load the iris dataset
|
||||
# and randomly permute it
|
||||
iris = load_iris()
|
||||
perm = rng.permutation(iris.target.size)
|
||||
iris.data = iris.data[perm]
|
||||
iris.target = iris.target[perm]
|
||||
|
||||
# also load the diabetes dataset
|
||||
# and randomly permute it
|
||||
diabetes = load_diabetes()
|
||||
perm = rng.permutation(diabetes.target.size)
|
||||
diabetes.data = diabetes.data[perm]
|
||||
diabetes.target = diabetes.target[perm]
|
||||
|
||||
|
||||
# TODO: Remove in 0.24 when DummyClassifier's `strategy` default updates
|
||||
@ignore_warnings(category=FutureWarning)
|
||||
def test_classification():
|
||||
# Check classification for various parameter settings.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(iris.data,
|
||||
iris.target,
|
||||
random_state=rng)
|
||||
grid = ParameterGrid({"max_samples": [0.5, 1.0],
|
||||
"max_features": [1, 2, 4],
|
||||
"bootstrap": [True, False],
|
||||
"bootstrap_features": [True, False]})
|
||||
|
||||
for base_estimator in [None,
|
||||
DummyClassifier(),
|
||||
Perceptron(),
|
||||
DecisionTreeClassifier(),
|
||||
KNeighborsClassifier(),
|
||||
SVC()]:
|
||||
for params in grid:
|
||||
BaggingClassifier(base_estimator=base_estimator,
|
||||
random_state=rng,
|
||||
**params).fit(X_train, y_train).predict(X_test)
|
||||
|
||||
|
||||
def test_sparse_classification():
|
||||
# Check classification for various parameter settings on sparse input.
|
||||
|
||||
class CustomSVC(SVC):
|
||||
"""SVC variant that records the nature of the training set"""
|
||||
|
||||
def fit(self, X, y):
|
||||
super().fit(X, y)
|
||||
self.data_type_ = type(X)
|
||||
return self
|
||||
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(iris.data,
|
||||
iris.target,
|
||||
random_state=rng)
|
||||
parameter_sets = [
|
||||
{"max_samples": 0.5,
|
||||
"max_features": 2,
|
||||
"bootstrap": True,
|
||||
"bootstrap_features": True},
|
||||
{"max_samples": 1.0,
|
||||
"max_features": 4,
|
||||
"bootstrap": True,
|
||||
"bootstrap_features": True},
|
||||
{"max_features": 2,
|
||||
"bootstrap": False,
|
||||
"bootstrap_features": True},
|
||||
{"max_samples": 0.5,
|
||||
"bootstrap": True,
|
||||
"bootstrap_features": False},
|
||||
]
|
||||
|
||||
for sparse_format in [csc_matrix, csr_matrix]:
|
||||
X_train_sparse = sparse_format(X_train)
|
||||
X_test_sparse = sparse_format(X_test)
|
||||
for params in parameter_sets:
|
||||
for f in ['predict', 'predict_proba', 'predict_log_proba', 'decision_function']:
|
||||
# Trained on sparse format
|
||||
sparse_classifier = BaggingClassifier(
|
||||
base_estimator=CustomSVC(decision_function_shape='ovr'),
|
||||
random_state=1,
|
||||
**params
|
||||
).fit(X_train_sparse, y_train)
|
||||
sparse_results = getattr(sparse_classifier, f)(X_test_sparse)
|
||||
|
||||
# Trained on dense format
|
||||
dense_classifier = BaggingClassifier(
|
||||
base_estimator=CustomSVC(decision_function_shape='ovr'),
|
||||
random_state=1,
|
||||
**params
|
||||
).fit(X_train, y_train)
|
||||
dense_results = getattr(dense_classifier, f)(X_test)
|
||||
assert_array_almost_equal(sparse_results, dense_results)
|
||||
|
||||
sparse_type = type(X_train_sparse)
|
||||
types = [i.data_type_ for i in sparse_classifier.estimators_]
|
||||
|
||||
assert all([t == sparse_type for t in types])
|
||||
|
||||
|
||||
def test_regression():
|
||||
# Check regression for various parameter settings.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50],
|
||||
diabetes.target[:50],
|
||||
random_state=rng)
|
||||
grid = ParameterGrid({"max_samples": [0.5, 1.0],
|
||||
"max_features": [0.5, 1.0],
|
||||
"bootstrap": [True, False],
|
||||
"bootstrap_features": [True, False]})
|
||||
|
||||
for base_estimator in [None,
|
||||
DummyRegressor(),
|
||||
DecisionTreeRegressor(),
|
||||
KNeighborsRegressor(),
|
||||
SVR()]:
|
||||
for params in grid:
|
||||
BaggingRegressor(base_estimator=base_estimator,
|
||||
random_state=rng,
|
||||
**params).fit(X_train, y_train).predict(X_test)
|
||||
|
||||
|
||||
def test_sparse_regression():
|
||||
# Check regression for various parameter settings on sparse input.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50],
|
||||
diabetes.target[:50],
|
||||
random_state=rng)
|
||||
|
||||
class CustomSVR(SVR):
|
||||
"""SVC variant that records the nature of the training set"""
|
||||
|
||||
def fit(self, X, y):
|
||||
super().fit(X, y)
|
||||
self.data_type_ = type(X)
|
||||
return self
|
||||
|
||||
parameter_sets = [
|
||||
{"max_samples": 0.5,
|
||||
"max_features": 2,
|
||||
"bootstrap": True,
|
||||
"bootstrap_features": True},
|
||||
{"max_samples": 1.0,
|
||||
"max_features": 4,
|
||||
"bootstrap": True,
|
||||
"bootstrap_features": True},
|
||||
{"max_features": 2,
|
||||
"bootstrap": False,
|
||||
"bootstrap_features": True},
|
||||
{"max_samples": 0.5,
|
||||
"bootstrap": True,
|
||||
"bootstrap_features": False},
|
||||
]
|
||||
|
||||
for sparse_format in [csc_matrix, csr_matrix]:
|
||||
X_train_sparse = sparse_format(X_train)
|
||||
X_test_sparse = sparse_format(X_test)
|
||||
for params in parameter_sets:
|
||||
|
||||
# Trained on sparse format
|
||||
sparse_classifier = BaggingRegressor(
|
||||
base_estimator=CustomSVR(),
|
||||
random_state=1,
|
||||
**params
|
||||
).fit(X_train_sparse, y_train)
|
||||
sparse_results = sparse_classifier.predict(X_test_sparse)
|
||||
|
||||
# Trained on dense format
|
||||
dense_results = BaggingRegressor(
|
||||
base_estimator=CustomSVR(),
|
||||
random_state=1,
|
||||
**params
|
||||
).fit(X_train, y_train).predict(X_test)
|
||||
|
||||
sparse_type = type(X_train_sparse)
|
||||
types = [i.data_type_ for i in sparse_classifier.estimators_]
|
||||
|
||||
assert_array_almost_equal(sparse_results, dense_results)
|
||||
assert all([t == sparse_type for t in types])
|
||||
assert_array_almost_equal(sparse_results, dense_results)
|
||||
|
||||
|
||||
class DummySizeEstimator(BaseEstimator):
|
||||
|
||||
def fit(self, X, y):
|
||||
self.training_size_ = X.shape[0]
|
||||
self.training_hash_ = joblib.hash(X)
|
||||
|
||||
|
||||
def test_bootstrap_samples():
|
||||
# Test that bootstrapping samples generate non-perfect base estimators.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
|
||||
diabetes.target,
|
||||
random_state=rng)
|
||||
|
||||
base_estimator = DecisionTreeRegressor().fit(X_train, y_train)
|
||||
|
||||
# without bootstrap, all trees are perfect on the training set
|
||||
ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
|
||||
max_samples=1.0,
|
||||
bootstrap=False,
|
||||
random_state=rng).fit(X_train, y_train)
|
||||
|
||||
assert (base_estimator.score(X_train, y_train) ==
|
||||
ensemble.score(X_train, y_train))
|
||||
|
||||
# with bootstrap, trees are no longer perfect on the training set
|
||||
ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
|
||||
max_samples=1.0,
|
||||
bootstrap=True,
|
||||
random_state=rng).fit(X_train, y_train)
|
||||
|
||||
assert (base_estimator.score(X_train, y_train) >
|
||||
ensemble.score(X_train, y_train))
|
||||
|
||||
# check that each sampling correspond to a complete bootstrap resample.
|
||||
# the size of each bootstrap should be the same as the input data but
|
||||
# the data should be different (checked using the hash of the data).
|
||||
ensemble = BaggingRegressor(base_estimator=DummySizeEstimator(),
|
||||
bootstrap=True).fit(X_train, y_train)
|
||||
training_hash = []
|
||||
for estimator in ensemble.estimators_:
|
||||
assert estimator.training_size_ == X_train.shape[0]
|
||||
training_hash.append(estimator.training_hash_)
|
||||
assert len(set(training_hash)) == len(training_hash)
|
||||
|
||||
|
||||
def test_bootstrap_features():
|
||||
# Test that bootstrapping features may generate duplicate features.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
|
||||
diabetes.target,
|
||||
random_state=rng)
|
||||
|
||||
ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
|
||||
max_features=1.0,
|
||||
bootstrap_features=False,
|
||||
random_state=rng).fit(X_train, y_train)
|
||||
|
||||
for features in ensemble.estimators_features_:
|
||||
assert diabetes.data.shape[1] == np.unique(features).shape[0]
|
||||
|
||||
ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
|
||||
max_features=1.0,
|
||||
bootstrap_features=True,
|
||||
random_state=rng).fit(X_train, y_train)
|
||||
|
||||
for features in ensemble.estimators_features_:
|
||||
assert diabetes.data.shape[1] > np.unique(features).shape[0]
|
||||
|
||||
|
||||
def test_probability():
|
||||
# Predict probabilities.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(iris.data,
|
||||
iris.target,
|
||||
random_state=rng)
|
||||
|
||||
with np.errstate(divide="ignore", invalid="ignore"):
|
||||
# Normal case
|
||||
ensemble = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
|
||||
random_state=rng).fit(X_train, y_train)
|
||||
|
||||
assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
|
||||
axis=1),
|
||||
np.ones(len(X_test)))
|
||||
|
||||
assert_array_almost_equal(ensemble.predict_proba(X_test),
|
||||
np.exp(ensemble.predict_log_proba(X_test)))
|
||||
|
||||
# Degenerate case, where some classes are missing
|
||||
ensemble = BaggingClassifier(base_estimator=LogisticRegression(),
|
||||
random_state=rng,
|
||||
max_samples=5).fit(X_train, y_train)
|
||||
|
||||
assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
|
||||
axis=1),
|
||||
np.ones(len(X_test)))
|
||||
|
||||
assert_array_almost_equal(ensemble.predict_proba(X_test),
|
||||
np.exp(ensemble.predict_log_proba(X_test)))
|
||||
|
||||
|
||||
def test_oob_score_classification():
|
||||
# Check that oob prediction is a good estimation of the generalization
|
||||
# error.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(iris.data,
|
||||
iris.target,
|
||||
random_state=rng)
|
||||
|
||||
for base_estimator in [DecisionTreeClassifier(), SVC()]:
|
||||
clf = BaggingClassifier(base_estimator=base_estimator,
|
||||
n_estimators=100,
|
||||
bootstrap=True,
|
||||
oob_score=True,
|
||||
random_state=rng).fit(X_train, y_train)
|
||||
|
||||
test_score = clf.score(X_test, y_test)
|
||||
|
||||
assert abs(test_score - clf.oob_score_) < 0.1
|
||||
|
||||
# Test with few estimators
|
||||
assert_warns(UserWarning,
|
||||
BaggingClassifier(base_estimator=base_estimator,
|
||||
n_estimators=1,
|
||||
bootstrap=True,
|
||||
oob_score=True,
|
||||
random_state=rng).fit,
|
||||
X_train,
|
||||
y_train)
|
||||
|
||||
|
||||
def test_oob_score_regression():
|
||||
# Check that oob prediction is a good estimation of the generalization
|
||||
# error.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
|
||||
diabetes.target,
|
||||
random_state=rng)
|
||||
|
||||
clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
|
||||
n_estimators=50,
|
||||
bootstrap=True,
|
||||
oob_score=True,
|
||||
random_state=rng).fit(X_train, y_train)
|
||||
|
||||
test_score = clf.score(X_test, y_test)
|
||||
|
||||
assert abs(test_score - clf.oob_score_) < 0.1
|
||||
|
||||
# Test with few estimators
|
||||
assert_warns(UserWarning,
|
||||
BaggingRegressor(base_estimator=DecisionTreeRegressor(),
|
||||
n_estimators=1,
|
||||
bootstrap=True,
|
||||
oob_score=True,
|
||||
random_state=rng).fit,
|
||||
X_train,
|
||||
y_train)
|
||||
|
||||
|
||||
def test_single_estimator():
|
||||
# Check singleton ensembles.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
|
||||
diabetes.target,
|
||||
random_state=rng)
|
||||
|
||||
clf1 = BaggingRegressor(base_estimator=KNeighborsRegressor(),
|
||||
n_estimators=1,
|
||||
bootstrap=False,
|
||||
bootstrap_features=False,
|
||||
random_state=rng).fit(X_train, y_train)
|
||||
|
||||
clf2 = KNeighborsRegressor().fit(X_train, y_train)
|
||||
|
||||
assert_array_almost_equal(clf1.predict(X_test), clf2.predict(X_test))
|
||||
|
||||
|
||||
def test_error():
|
||||
# Test that it gives proper exception on deficient input.
|
||||
X, y = iris.data, iris.target
|
||||
base = DecisionTreeClassifier()
|
||||
|
||||
# Test max_samples
|
||||
assert_raises(ValueError,
|
||||
BaggingClassifier(base, max_samples=-1).fit, X, y)
|
||||
assert_raises(ValueError,
|
||||
BaggingClassifier(base, max_samples=0.0).fit, X, y)
|
||||
assert_raises(ValueError,
|
||||
BaggingClassifier(base, max_samples=2.0).fit, X, y)
|
||||
assert_raises(ValueError,
|
||||
BaggingClassifier(base, max_samples=1000).fit, X, y)
|
||||
assert_raises(ValueError,
|
||||
BaggingClassifier(base, max_samples="foobar").fit, X, y)
|
||||
|
||||
# Test max_features
|
||||
assert_raises(ValueError,
|
||||
BaggingClassifier(base, max_features=-1).fit, X, y)
|
||||
assert_raises(ValueError,
|
||||
BaggingClassifier(base, max_features=0.0).fit, X, y)
|
||||
assert_raises(ValueError,
|
||||
BaggingClassifier(base, max_features=2.0).fit, X, y)
|
||||
assert_raises(ValueError,
|
||||
BaggingClassifier(base, max_features=5).fit, X, y)
|
||||
assert_raises(ValueError,
|
||||
BaggingClassifier(base, max_features="foobar").fit, X, y)
|
||||
|
||||
# Test support of decision_function
|
||||
assert not hasattr(BaggingClassifier(base).fit(X, y), 'decision_function')
|
||||
|
||||
|
||||
def test_parallel_classification():
|
||||
# Check parallel classification.
|
||||
rng = check_random_state(0)
|
||||
|
||||
# Classification
|
||||
X_train, X_test, y_train, y_test = train_test_split(iris.data,
|
||||
iris.target,
|
||||
random_state=rng)
|
||||
|
||||
ensemble = BaggingClassifier(DecisionTreeClassifier(),
|
||||
n_jobs=3,
|
||||
random_state=0).fit(X_train, y_train)
|
||||
|
||||
# predict_proba
|
||||
ensemble.set_params(n_jobs=1)
|
||||
y1 = ensemble.predict_proba(X_test)
|
||||
ensemble.set_params(n_jobs=2)
|
||||
y2 = ensemble.predict_proba(X_test)
|
||||
assert_array_almost_equal(y1, y2)
|
||||
|
||||
ensemble = BaggingClassifier(DecisionTreeClassifier(),
|
||||
n_jobs=1,
|
||||
random_state=0).fit(X_train, y_train)
|
||||
|
||||
y3 = ensemble.predict_proba(X_test)
|
||||
assert_array_almost_equal(y1, y3)
|
||||
|
||||
# decision_function
|
||||
ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'),
|
||||
n_jobs=3,
|
||||
random_state=0).fit(X_train, y_train)
|
||||
|
||||
ensemble.set_params(n_jobs=1)
|
||||
decisions1 = ensemble.decision_function(X_test)
|
||||
ensemble.set_params(n_jobs=2)
|
||||
decisions2 = ensemble.decision_function(X_test)
|
||||
assert_array_almost_equal(decisions1, decisions2)
|
||||
|
||||
X_err = np.hstack((X_test, np.zeros((X_test.shape[0], 1))))
|
||||
assert_raise_message(ValueError, "Number of features of the model "
|
||||
"must match the input. Model n_features is {0} "
|
||||
"and input n_features is {1} "
|
||||
"".format(X_test.shape[1], X_err.shape[1]),
|
||||
ensemble.decision_function, X_err)
|
||||
|
||||
ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'),
|
||||
n_jobs=1,
|
||||
random_state=0).fit(X_train, y_train)
|
||||
|
||||
decisions3 = ensemble.decision_function(X_test)
|
||||
assert_array_almost_equal(decisions1, decisions3)
|
||||
|
||||
|
||||
def test_parallel_regression():
|
||||
# Check parallel regression.
|
||||
rng = check_random_state(0)
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
|
||||
diabetes.target,
|
||||
random_state=rng)
|
||||
|
||||
ensemble = BaggingRegressor(DecisionTreeRegressor(),
|
||||
n_jobs=3,
|
||||
random_state=0).fit(X_train, y_train)
|
||||
|
||||
ensemble.set_params(n_jobs=1)
|
||||
y1 = ensemble.predict(X_test)
|
||||
ensemble.set_params(n_jobs=2)
|
||||
y2 = ensemble.predict(X_test)
|
||||
assert_array_almost_equal(y1, y2)
|
||||
|
||||
ensemble = BaggingRegressor(DecisionTreeRegressor(),
|
||||
n_jobs=1,
|
||||
random_state=0).fit(X_train, y_train)
|
||||
|
||||
y3 = ensemble.predict(X_test)
|
||||
assert_array_almost_equal(y1, y3)
|
||||
|
||||
|
||||
def test_gridsearch():
|
||||
# Check that bagging ensembles can be grid-searched.
|
||||
# Transform iris into a binary classification task
|
||||
X, y = iris.data, iris.target
|
||||
y[y == 2] = 1
|
||||
|
||||
# Grid search with scoring based on decision_function
|
||||
parameters = {'n_estimators': (1, 2),
|
||||
'base_estimator__C': (1, 2)}
|
||||
|
||||
GridSearchCV(BaggingClassifier(SVC()),
|
||||
parameters,
|
||||
scoring="roc_auc").fit(X, y)
|
||||
|
||||
|
||||
def test_base_estimator():
|
||||
# Check base_estimator and its default values.
|
||||
rng = check_random_state(0)
|
||||
|
||||
# Classification
|
||||
X_train, X_test, y_train, y_test = train_test_split(iris.data,
|
||||
iris.target,
|
||||
random_state=rng)
|
||||
|
||||
ensemble = BaggingClassifier(None,
|
||||
n_jobs=3,
|
||||
random_state=0).fit(X_train, y_train)
|
||||
|
||||
assert isinstance(ensemble.base_estimator_, DecisionTreeClassifier)
|
||||
|
||||
ensemble = BaggingClassifier(DecisionTreeClassifier(),
|
||||
n_jobs=3,
|
||||
random_state=0).fit(X_train, y_train)
|
||||
|
||||
assert isinstance(ensemble.base_estimator_, DecisionTreeClassifier)
|
||||
|
||||
ensemble = BaggingClassifier(Perceptron(),
|
||||
n_jobs=3,
|
||||
random_state=0).fit(X_train, y_train)
|
||||
|
||||
assert isinstance(ensemble.base_estimator_, Perceptron)
|
||||
|
||||
# Regression
|
||||
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
|
||||
diabetes.target,
|
||||
random_state=rng)
|
||||
|
||||
ensemble = BaggingRegressor(None,
|
||||
n_jobs=3,
|
||||
random_state=0).fit(X_train, y_train)
|
||||
|
||||
assert isinstance(ensemble.base_estimator_, DecisionTreeRegressor)
|
||||
|
||||
ensemble = BaggingRegressor(DecisionTreeRegressor(),
|
||||
n_jobs=3,
|
||||
random_state=0).fit(X_train, y_train)
|
||||
|
||||
assert isinstance(ensemble.base_estimator_, DecisionTreeRegressor)
|
||||
|
||||
ensemble = BaggingRegressor(SVR(),
|
||||
n_jobs=3,
|
||||
random_state=0).fit(X_train, y_train)
|
||||
assert isinstance(ensemble.base_estimator_, SVR)
|
||||
|
||||
|
||||
def test_bagging_with_pipeline():
|
||||
estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
|
||||
DecisionTreeClassifier()),
|
||||
max_features=2)
|
||||
estimator.fit(iris.data, iris.target)
|
||||
assert isinstance(estimator[0].steps[-1][1].random_state, int)
|
||||
|
||||
|
||||
class DummyZeroEstimator(BaseEstimator):
|
||||
|
||||
def fit(self, X, y):
|
||||
self.classes_ = np.unique(y)
|
||||
return self
|
||||
|
||||
def predict(self, X):
|
||||
return self.classes_[np.zeros(X.shape[0], dtype=int)]
|
||||
|
||||
|
||||
def test_bagging_sample_weight_unsupported_but_passed():
|
||||
estimator = BaggingClassifier(DummyZeroEstimator())
|
||||
rng = check_random_state(0)
|
||||
|
||||
estimator.fit(iris.data, iris.target).predict(iris.data)
|
||||
assert_raises(ValueError, estimator.fit, iris.data, iris.target,
|
||||
sample_weight=rng.randint(10, size=(iris.data.shape[0])))
|
||||
|
||||
|
||||
def test_warm_start(random_state=42):
|
||||
# Test if fitting incrementally with warm start gives a forest of the
|
||||
# right size and the same results as a normal fit.
|
||||
X, y = make_hastie_10_2(n_samples=20, random_state=1)
|
||||
|
||||
clf_ws = None
|
||||
for n_estimators in [5, 10]:
|
||||
if clf_ws is None:
|
||||
clf_ws = BaggingClassifier(n_estimators=n_estimators,
|
||||
random_state=random_state,
|
||||
warm_start=True)
|
||||
else:
|
||||
clf_ws.set_params(n_estimators=n_estimators)
|
||||
clf_ws.fit(X, y)
|
||||
assert len(clf_ws) == n_estimators
|
||||
|
||||
clf_no_ws = BaggingClassifier(n_estimators=10, random_state=random_state,
|
||||
warm_start=False)
|
||||
clf_no_ws.fit(X, y)
|
||||
|
||||
assert (set([tree.random_state for tree in clf_ws]) ==
|
||||
set([tree.random_state for tree in clf_no_ws]))
|
||||
|
||||
|
||||
def test_warm_start_smaller_n_estimators():
|
||||
# Test if warm start'ed second fit with smaller n_estimators raises error.
|
||||
X, y = make_hastie_10_2(n_samples=20, random_state=1)
|
||||
clf = BaggingClassifier(n_estimators=5, warm_start=True)
|
||||
clf.fit(X, y)
|
||||
clf.set_params(n_estimators=4)
|
||||
assert_raises(ValueError, clf.fit, X, y)
|
||||
|
||||
|
||||
def test_warm_start_equal_n_estimators():
|
||||
# Test that nothing happens when fitting without increasing n_estimators
|
||||
X, y = make_hastie_10_2(n_samples=20, random_state=1)
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
|
||||
|
||||
clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=83)
|
||||
clf.fit(X_train, y_train)
|
||||
|
||||
y_pred = clf.predict(X_test)
|
||||
# modify X to nonsense values, this should not change anything
|
||||
X_train += 1.
|
||||
|
||||
assert_warns_message(UserWarning,
|
||||
"Warm-start fitting without increasing n_estimators does not",
|
||||
clf.fit, X_train, y_train)
|
||||
assert_array_equal(y_pred, clf.predict(X_test))
|
||||
|
||||
|
||||
def test_warm_start_equivalence():
|
||||
# warm started classifier with 5+5 estimators should be equivalent to
|
||||
# one classifier with 10 estimators
|
||||
X, y = make_hastie_10_2(n_samples=20, random_state=1)
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
|
||||
|
||||
clf_ws = BaggingClassifier(n_estimators=5, warm_start=True,
|
||||
random_state=3141)
|
||||
clf_ws.fit(X_train, y_train)
|
||||
clf_ws.set_params(n_estimators=10)
|
||||
clf_ws.fit(X_train, y_train)
|
||||
y1 = clf_ws.predict(X_test)
|
||||
|
||||
clf = BaggingClassifier(n_estimators=10, warm_start=False,
|
||||
random_state=3141)
|
||||
clf.fit(X_train, y_train)
|
||||
y2 = clf.predict(X_test)
|
||||
|
||||
assert_array_almost_equal(y1, y2)
|
||||
|
||||
|
||||
def test_warm_start_with_oob_score_fails():
|
||||
# Check using oob_score and warm_start simultaneously fails
|
||||
X, y = make_hastie_10_2(n_samples=20, random_state=1)
|
||||
clf = BaggingClassifier(n_estimators=5, warm_start=True, oob_score=True)
|
||||
assert_raises(ValueError, clf.fit, X, y)
|
||||
|
||||
|
||||
def test_oob_score_removed_on_warm_start():
|
||||
X, y = make_hastie_10_2(n_samples=2000, random_state=1)
|
||||
|
||||
clf = BaggingClassifier(n_estimators=50, oob_score=True)
|
||||
clf.fit(X, y)
|
||||
|
||||
clf.set_params(warm_start=True, oob_score=False, n_estimators=100)
|
||||
clf.fit(X, y)
|
||||
|
||||
assert_raises(AttributeError, getattr, clf, "oob_score_")
|
||||
|
||||
|
||||
def test_oob_score_consistency():
|
||||
# Make sure OOB scores are identical when random_state, estimator, and
|
||||
# training data are fixed and fitting is done twice
|
||||
X, y = make_hastie_10_2(n_samples=200, random_state=1)
|
||||
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
|
||||
max_features=0.5, oob_score=True,
|
||||
random_state=1)
|
||||
assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_
|
||||
|
||||
|
||||
def test_estimators_samples():
|
||||
# Check that format of estimators_samples_ is correct and that results
|
||||
# generated at fit time can be identically reproduced at a later time
|
||||
# using data saved in object attributes.
|
||||
X, y = make_hastie_10_2(n_samples=200, random_state=1)
|
||||
bagging = BaggingClassifier(LogisticRegression(), max_samples=0.5,
|
||||
max_features=0.5, random_state=1,
|
||||
bootstrap=False)
|
||||
bagging.fit(X, y)
|
||||
|
||||
# Get relevant attributes
|
||||
estimators_samples = bagging.estimators_samples_
|
||||
estimators_features = bagging.estimators_features_
|
||||
estimators = bagging.estimators_
|
||||
|
||||
# Test for correct formatting
|
||||
assert len(estimators_samples) == len(estimators)
|
||||
assert len(estimators_samples[0]) == len(X) // 2
|
||||
assert estimators_samples[0].dtype.kind == 'i'
|
||||
|
||||
# Re-fit single estimator to test for consistent sampling
|
||||
estimator_index = 0
|
||||
estimator_samples = estimators_samples[estimator_index]
|
||||
estimator_features = estimators_features[estimator_index]
|
||||
estimator = estimators[estimator_index]
|
||||
|
||||
X_train = (X[estimator_samples])[:, estimator_features]
|
||||
y_train = y[estimator_samples]
|
||||
|
||||
orig_coefs = estimator.coef_
|
||||
estimator.fit(X_train, y_train)
|
||||
new_coefs = estimator.coef_
|
||||
|
||||
assert_array_almost_equal(orig_coefs, new_coefs)
|
||||
|
||||
|
||||
def test_estimators_samples_deterministic():
|
||||
# This test is a regression test to check that with a random step
|
||||
# (e.g. SparseRandomProjection) and a given random state, the results
|
||||
# generated at fit time can be identically reproduced at a later time using
|
||||
# data saved in object attributes. Check issue #9524 for full discussion.
|
||||
|
||||
iris = load_iris()
|
||||
X, y = iris.data, iris.target
|
||||
|
||||
base_pipeline = make_pipeline(SparseRandomProjection(n_components=2),
|
||||
LogisticRegression())
|
||||
clf = BaggingClassifier(base_estimator=base_pipeline,
|
||||
max_samples=0.5,
|
||||
random_state=0)
|
||||
clf.fit(X, y)
|
||||
pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy()
|
||||
|
||||
estimator = clf.estimators_[0]
|
||||
estimator_sample = clf.estimators_samples_[0]
|
||||
estimator_feature = clf.estimators_features_[0]
|
||||
|
||||
X_train = (X[estimator_sample])[:, estimator_feature]
|
||||
y_train = y[estimator_sample]
|
||||
|
||||
estimator.fit(X_train, y_train)
|
||||
assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef)
|
||||
|
||||
|
||||
def test_max_samples_consistency():
|
||||
# Make sure validated max_samples and original max_samples are identical
|
||||
# when valid integer max_samples supplied by user
|
||||
max_samples = 100
|
||||
X, y = make_hastie_10_2(n_samples=2*max_samples, random_state=1)
|
||||
bagging = BaggingClassifier(KNeighborsClassifier(),
|
||||
max_samples=max_samples,
|
||||
max_features=0.5, random_state=1)
|
||||
bagging.fit(X, y)
|
||||
assert bagging._max_samples == max_samples
|
||||
|
||||
|
||||
def test_set_oob_score_label_encoding():
|
||||
# Make sure the oob_score doesn't change when the labels change
|
||||
# See: https://github.com/scikit-learn/scikit-learn/issues/8933
|
||||
random_state = 5
|
||||
X = [[-1], [0], [1]] * 5
|
||||
Y1 = ['A', 'B', 'C'] * 5
|
||||
Y2 = [-1, 0, 1] * 5
|
||||
Y3 = [0, 1, 2] * 5
|
||||
x1 = BaggingClassifier(oob_score=True,
|
||||
random_state=random_state).fit(X, Y1).oob_score_
|
||||
x2 = BaggingClassifier(oob_score=True,
|
||||
random_state=random_state).fit(X, Y2).oob_score_
|
||||
x3 = BaggingClassifier(oob_score=True,
|
||||
random_state=random_state).fit(X, Y3).oob_score_
|
||||
assert [x1, x2] == [x3, x3]
|
||||
|
||||
|
||||
def replace(X):
|
||||
X = X.astype('float', copy=True)
|
||||
X[~np.isfinite(X)] = 0
|
||||
return X
|
||||
|
||||
|
||||
def test_bagging_regressor_with_missing_inputs():
|
||||
# Check that BaggingRegressor can accept X with missing/infinite data
|
||||
X = np.array([
|
||||
[1, 3, 5],
|
||||
[2, None, 6],
|
||||
[2, np.nan, 6],
|
||||
[2, np.inf, 6],
|
||||
[2, np.NINF, 6],
|
||||
])
|
||||
y_values = [
|
||||
np.array([2, 3, 3, 3, 3]),
|
||||
np.array([
|
||||
[2, 1, 9],
|
||||
[3, 6, 8],
|
||||
[3, 6, 8],
|
||||
[3, 6, 8],
|
||||
[3, 6, 8],
|
||||
])
|
||||
]
|
||||
for y in y_values:
|
||||
regressor = DecisionTreeRegressor()
|
||||
pipeline = make_pipeline(
|
||||
FunctionTransformer(replace), regressor
|
||||
)
|
||||
pipeline.fit(X, y).predict(X)
|
||||
bagging_regressor = BaggingRegressor(pipeline)
|
||||
y_hat = bagging_regressor.fit(X, y).predict(X)
|
||||
assert y.shape == y_hat.shape
|
||||
|
||||
# Verify that exceptions can be raised by wrapper regressor
|
||||
regressor = DecisionTreeRegressor()
|
||||
pipeline = make_pipeline(regressor)
|
||||
assert_raises(ValueError, pipeline.fit, X, y)
|
||||
bagging_regressor = BaggingRegressor(pipeline)
|
||||
assert_raises(ValueError, bagging_regressor.fit, X, y)
|
||||
|
||||
|
||||
def test_bagging_classifier_with_missing_inputs():
|
||||
# Check that BaggingClassifier can accept X with missing/infinite data
|
||||
X = np.array([
|
||||
[1, 3, 5],
|
||||
[2, None, 6],
|
||||
[2, np.nan, 6],
|
||||
[2, np.inf, 6],
|
||||
[2, np.NINF, 6],
|
||||
])
|
||||
y = np.array([3, 6, 6, 6, 6])
|
||||
classifier = DecisionTreeClassifier()
|
||||
pipeline = make_pipeline(
|
||||
FunctionTransformer(replace), classifier
|
||||
)
|
||||
pipeline.fit(X, y).predict(X)
|
||||
bagging_classifier = BaggingClassifier(pipeline)
|
||||
bagging_classifier.fit(X, y)
|
||||
y_hat = bagging_classifier.predict(X)
|
||||
assert y.shape == y_hat.shape
|
||||
bagging_classifier.predict_log_proba(X)
|
||||
bagging_classifier.predict_proba(X)
|
||||
|
||||
# Verify that exceptions can be raised by wrapper classifier
|
||||
classifier = DecisionTreeClassifier()
|
||||
pipeline = make_pipeline(classifier)
|
||||
assert_raises(ValueError, pipeline.fit, X, y)
|
||||
bagging_classifier = BaggingClassifier(pipeline)
|
||||
assert_raises(ValueError, bagging_classifier.fit, X, y)
|
||||
|
||||
|
||||
def test_bagging_small_max_features():
|
||||
# Check that Bagging estimator can accept low fractional max_features
|
||||
|
||||
X = np.array([[1, 2], [3, 4]])
|
||||
y = np.array([1, 0])
|
||||
|
||||
bagging = BaggingClassifier(LogisticRegression(),
|
||||
max_features=0.3, random_state=1)
|
||||
bagging.fit(X, y)
|
||||
|
||||
|
||||
def test_bagging_get_estimators_indices():
|
||||
# Check that Bagging estimator can generate sample indices properly
|
||||
# Non-regression test for:
|
||||
# https://github.com/scikit-learn/scikit-learn/issues/16436
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.randn(13, 4)
|
||||
y = np.arange(13)
|
||||
|
||||
class MyEstimator(DecisionTreeRegressor):
|
||||
"""An estimator which stores y indices information at fit."""
|
||||
def fit(self, X, y):
|
||||
self._sample_indices = y
|
||||
|
||||
clf = BaggingRegressor(base_estimator=MyEstimator(),
|
||||
n_estimators=1, random_state=0)
|
||||
clf.fit(X, y)
|
||||
|
||||
assert_array_equal(clf.estimators_[0]._sample_indices,
|
||||
clf.estimators_samples_[0])
|
127
venv/Lib/site-packages/sklearn/ensemble/tests/test_base.py
Normal file
127
venv/Lib/site-packages/sklearn/ensemble/tests/test_base.py
Normal file
|
@ -0,0 +1,127 @@
|
|||
"""
|
||||
Testing for the base module (sklearn.ensemble.base).
|
||||
"""
|
||||
|
||||
# Authors: Gilles Louppe
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.utils._testing import assert_raise_message
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.ensemble import BaggingClassifier
|
||||
from sklearn.ensemble._base import _set_random_states
|
||||
from sklearn.linear_model import Perceptron
|
||||
from collections import OrderedDict
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.feature_selection import SelectFromModel
|
||||
|
||||
|
||||
def test_base():
|
||||
# Check BaseEnsemble methods.
|
||||
ensemble = BaggingClassifier(
|
||||
base_estimator=Perceptron(random_state=None), n_estimators=3)
|
||||
|
||||
iris = load_iris()
|
||||
ensemble.fit(iris.data, iris.target)
|
||||
ensemble.estimators_ = [] # empty the list and create estimators manually
|
||||
|
||||
ensemble._make_estimator()
|
||||
random_state = np.random.RandomState(3)
|
||||
ensemble._make_estimator(random_state=random_state)
|
||||
ensemble._make_estimator(random_state=random_state)
|
||||
ensemble._make_estimator(append=False)
|
||||
|
||||
assert 3 == len(ensemble)
|
||||
assert 3 == len(ensemble.estimators_)
|
||||
|
||||
assert isinstance(ensemble[0], Perceptron)
|
||||
assert ensemble[0].random_state is None
|
||||
assert isinstance(ensemble[1].random_state, int)
|
||||
assert isinstance(ensemble[2].random_state, int)
|
||||
assert ensemble[1].random_state != ensemble[2].random_state
|
||||
|
||||
np_int_ensemble = BaggingClassifier(base_estimator=Perceptron(),
|
||||
n_estimators=np.int32(3))
|
||||
np_int_ensemble.fit(iris.data, iris.target)
|
||||
|
||||
|
||||
def test_base_zero_n_estimators():
|
||||
# Check that instantiating a BaseEnsemble with n_estimators<=0 raises
|
||||
# a ValueError.
|
||||
ensemble = BaggingClassifier(base_estimator=Perceptron(),
|
||||
n_estimators=0)
|
||||
iris = load_iris()
|
||||
assert_raise_message(ValueError,
|
||||
"n_estimators must be greater than zero, got 0.",
|
||||
ensemble.fit, iris.data, iris.target)
|
||||
|
||||
|
||||
def test_base_not_int_n_estimators():
|
||||
# Check that instantiating a BaseEnsemble with a string as n_estimators
|
||||
# raises a ValueError demanding n_estimators to be supplied as an integer.
|
||||
string_ensemble = BaggingClassifier(base_estimator=Perceptron(),
|
||||
n_estimators='3')
|
||||
iris = load_iris()
|
||||
assert_raise_message(ValueError,
|
||||
"n_estimators must be an integer",
|
||||
string_ensemble.fit, iris.data, iris.target)
|
||||
float_ensemble = BaggingClassifier(base_estimator=Perceptron(),
|
||||
n_estimators=3.0)
|
||||
assert_raise_message(ValueError,
|
||||
"n_estimators must be an integer",
|
||||
float_ensemble.fit, iris.data, iris.target)
|
||||
|
||||
|
||||
def test_set_random_states():
|
||||
# Linear Discriminant Analysis doesn't have random state: smoke test
|
||||
_set_random_states(LinearDiscriminantAnalysis(), random_state=17)
|
||||
|
||||
clf1 = Perceptron(random_state=None)
|
||||
assert clf1.random_state is None
|
||||
# check random_state is None still sets
|
||||
_set_random_states(clf1, None)
|
||||
assert isinstance(clf1.random_state, int)
|
||||
|
||||
# check random_state fixes results in consistent initialisation
|
||||
_set_random_states(clf1, 3)
|
||||
assert isinstance(clf1.random_state, int)
|
||||
clf2 = Perceptron(random_state=None)
|
||||
_set_random_states(clf2, 3)
|
||||
assert clf1.random_state == clf2.random_state
|
||||
|
||||
# nested random_state
|
||||
|
||||
def make_steps():
|
||||
return [('sel', SelectFromModel(Perceptron(random_state=None))),
|
||||
('clf', Perceptron(random_state=None))]
|
||||
|
||||
est1 = Pipeline(make_steps())
|
||||
_set_random_states(est1, 3)
|
||||
assert isinstance(est1.steps[0][1].estimator.random_state, int)
|
||||
assert isinstance(est1.steps[1][1].random_state, int)
|
||||
assert (est1.get_params()['sel__estimator__random_state'] !=
|
||||
est1.get_params()['clf__random_state'])
|
||||
|
||||
# ensure multiple random_state parameters are invariant to get_params()
|
||||
# iteration order
|
||||
|
||||
class AlphaParamPipeline(Pipeline):
|
||||
def get_params(self, *args, **kwargs):
|
||||
params = Pipeline.get_params(self, *args, **kwargs).items()
|
||||
return OrderedDict(sorted(params))
|
||||
|
||||
class RevParamPipeline(Pipeline):
|
||||
def get_params(self, *args, **kwargs):
|
||||
params = Pipeline.get_params(self, *args, **kwargs).items()
|
||||
return OrderedDict(sorted(params, reverse=True))
|
||||
|
||||
for cls in [AlphaParamPipeline, RevParamPipeline]:
|
||||
est2 = cls(make_steps())
|
||||
_set_random_states(est2, 3)
|
||||
assert (est1.get_params()['sel__estimator__random_state'] ==
|
||||
est2.get_params()['sel__estimator__random_state'])
|
||||
assert (est1.get_params()['clf__random_state'] ==
|
||||
est2.get_params()['clf__random_state'])
|
172
venv/Lib/site-packages/sklearn/ensemble/tests/test_common.py
Normal file
172
venv/Lib/site-packages/sklearn/ensemble/tests/test_common.py
Normal file
|
@ -0,0 +1,172 @@
|
|||
import pytest
|
||||
|
||||
from sklearn.base import clone
|
||||
from sklearn.base import ClassifierMixin
|
||||
from sklearn.base import is_classifier
|
||||
|
||||
from sklearn.datasets import make_classification
|
||||
from sklearn.datasets import make_regression
|
||||
|
||||
from sklearn.linear_model import LogisticRegression, LinearRegression
|
||||
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
|
||||
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
|
||||
|
||||
from sklearn.ensemble import StackingClassifier, StackingRegressor
|
||||
from sklearn.ensemble import VotingClassifier, VotingRegressor
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"X, y, estimator",
|
||||
[(*make_classification(n_samples=10),
|
||||
StackingClassifier(estimators=[('lr', LogisticRegression()),
|
||||
('svm', LinearSVC()),
|
||||
('rf', RandomForestClassifier())])),
|
||||
(*make_classification(n_samples=10),
|
||||
VotingClassifier(estimators=[('lr', LogisticRegression()),
|
||||
('svm', LinearSVC()),
|
||||
('rf', RandomForestClassifier())])),
|
||||
(*make_regression(n_samples=10),
|
||||
StackingRegressor(estimators=[('lr', LinearRegression()),
|
||||
('svm', LinearSVR()),
|
||||
('rf', RandomForestRegressor())])),
|
||||
(*make_regression(n_samples=10),
|
||||
VotingRegressor(estimators=[('lr', LinearRegression()),
|
||||
('svm', LinearSVR()),
|
||||
('rf', RandomForestRegressor())]))],
|
||||
ids=['stacking-classifier', 'voting-classifier',
|
||||
'stacking-regressor', 'voting-regressor']
|
||||
)
|
||||
def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator):
|
||||
# check that the behavior of `estimators`, `estimators_`,
|
||||
# `named_estimators`, `named_estimators_` is consistent across all
|
||||
# ensemble classes and when using `set_params()`.
|
||||
|
||||
# before fit
|
||||
assert 'svm' in estimator.named_estimators
|
||||
assert estimator.named_estimators.svm is estimator.estimators[1][1]
|
||||
assert estimator.named_estimators.svm is estimator.named_estimators['svm']
|
||||
|
||||
# check fitted attributes
|
||||
estimator.fit(X, y)
|
||||
assert len(estimator.named_estimators) == 3
|
||||
assert len(estimator.named_estimators_) == 3
|
||||
assert (sorted(list(estimator.named_estimators_.keys())) ==
|
||||
sorted(['lr', 'svm', 'rf']))
|
||||
|
||||
# check that set_params() does not add a new attribute
|
||||
estimator_new_params = clone(estimator)
|
||||
svm_estimator = SVC() if is_classifier(estimator) else SVR()
|
||||
estimator_new_params.set_params(svm=svm_estimator).fit(X, y)
|
||||
assert not hasattr(estimator_new_params, 'svm')
|
||||
assert (estimator_new_params.named_estimators.lr.get_params() ==
|
||||
estimator.named_estimators.lr.get_params())
|
||||
assert (estimator_new_params.named_estimators.rf.get_params() ==
|
||||
estimator.named_estimators.rf.get_params())
|
||||
|
||||
# check the behavior when setting an dropping an estimator
|
||||
estimator_dropped = clone(estimator)
|
||||
estimator_dropped.set_params(svm='drop')
|
||||
estimator_dropped.fit(X, y)
|
||||
assert len(estimator_dropped.named_estimators) == 3
|
||||
assert estimator_dropped.named_estimators.svm == 'drop'
|
||||
assert len(estimator_dropped.named_estimators_) == 3
|
||||
assert (sorted(list(estimator_dropped.named_estimators_.keys())) ==
|
||||
sorted(['lr', 'svm', 'rf']))
|
||||
for sub_est in estimator_dropped.named_estimators_:
|
||||
# check that the correspondence is correct
|
||||
assert not isinstance(sub_est, type(estimator.named_estimators.svm))
|
||||
|
||||
# check that we can set the parameters of the underlying classifier
|
||||
estimator.set_params(svm__C=10.0)
|
||||
estimator.set_params(rf__max_depth=5)
|
||||
assert (estimator.get_params()['svm__C'] ==
|
||||
estimator.get_params()['svm'].get_params()['C'])
|
||||
assert (estimator.get_params()['rf__max_depth'] ==
|
||||
estimator.get_params()['rf'].get_params()['max_depth'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"Ensemble",
|
||||
[StackingClassifier, VotingClassifier, StackingRegressor, VotingRegressor]
|
||||
)
|
||||
def test_ensemble_heterogeneous_estimators_type(Ensemble):
|
||||
# check that ensemble will fail during validation if the underlying
|
||||
# estimators are not of the same type (i.e. classifier or regressor)
|
||||
if issubclass(Ensemble, ClassifierMixin):
|
||||
X, y = make_classification(n_samples=10)
|
||||
estimators = [('lr', LinearRegression())]
|
||||
ensemble_type = 'classifier'
|
||||
else:
|
||||
X, y = make_regression(n_samples=10)
|
||||
estimators = [('lr', LogisticRegression())]
|
||||
ensemble_type = 'regressor'
|
||||
ensemble = Ensemble(estimators=estimators)
|
||||
|
||||
err_msg = "should be a {}".format(ensemble_type)
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
ensemble.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"X, y, Ensemble",
|
||||
[(*make_classification(n_samples=10), StackingClassifier),
|
||||
(*make_classification(n_samples=10), VotingClassifier),
|
||||
(*make_regression(n_samples=10), StackingRegressor),
|
||||
(*make_regression(n_samples=10), VotingRegressor)]
|
||||
)
|
||||
def test_ensemble_heterogeneous_estimators_name_validation(X, y, Ensemble):
|
||||
# raise an error when the name contains dunder
|
||||
if issubclass(Ensemble, ClassifierMixin):
|
||||
estimators = [('lr__', LogisticRegression())]
|
||||
else:
|
||||
estimators = [('lr__', LinearRegression())]
|
||||
ensemble = Ensemble(estimators=estimators)
|
||||
|
||||
err_msg = r"Estimator names must not contain __: got \['lr__'\]"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
ensemble.fit(X, y)
|
||||
|
||||
# raise an error when the name is not unique
|
||||
if issubclass(Ensemble, ClassifierMixin):
|
||||
estimators = [('lr', LogisticRegression()),
|
||||
('lr', LogisticRegression())]
|
||||
else:
|
||||
estimators = [('lr', LinearRegression()),
|
||||
('lr', LinearRegression())]
|
||||
ensemble = Ensemble(estimators=estimators)
|
||||
|
||||
err_msg = r"Names provided are not unique: \['lr', 'lr'\]"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
ensemble.fit(X, y)
|
||||
|
||||
# raise an error when the name conflicts with the parameters
|
||||
if issubclass(Ensemble, ClassifierMixin):
|
||||
estimators = [('estimators', LogisticRegression())]
|
||||
else:
|
||||
estimators = [('estimators', LinearRegression())]
|
||||
ensemble = Ensemble(estimators=estimators)
|
||||
|
||||
err_msg = "Estimator names conflict with constructor arguments"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
ensemble.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"X, y, estimator",
|
||||
[(*make_classification(n_samples=10),
|
||||
StackingClassifier(estimators=[('lr', LogisticRegression())])),
|
||||
(*make_classification(n_samples=10),
|
||||
VotingClassifier(estimators=[('lr', LogisticRegression())])),
|
||||
(*make_regression(n_samples=10),
|
||||
StackingRegressor(estimators=[('lr', LinearRegression())])),
|
||||
(*make_regression(n_samples=10),
|
||||
VotingRegressor(estimators=[('lr', LinearRegression())]))],
|
||||
ids=['stacking-classifier', 'voting-classifier',
|
||||
'stacking-regressor', 'voting-regressor']
|
||||
)
|
||||
def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator):
|
||||
# check that we raise a consistent error when all estimators are
|
||||
# dropped
|
||||
estimator.set_params(lr='drop')
|
||||
with pytest.raises(ValueError, match="All estimators are dropped."):
|
||||
estimator.fit(X, y)
|
1380
venv/Lib/site-packages/sklearn/ensemble/tests/test_forest.py
Normal file
1380
venv/Lib/site-packages/sklearn/ensemble/tests/test_forest.py
Normal file
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,343 @@
|
|||
"""
|
||||
Testing for the gradient boosting loss functions and initial estimators.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from numpy.testing import assert_almost_equal
|
||||
from numpy.testing import assert_allclose
|
||||
import pytest
|
||||
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils.stats import _weighted_percentile
|
||||
from sklearn.ensemble._gb_losses import RegressionLossFunction
|
||||
from sklearn.ensemble._gb_losses import LeastSquaresError
|
||||
from sklearn.ensemble._gb_losses import LeastAbsoluteError
|
||||
from sklearn.ensemble._gb_losses import HuberLossFunction
|
||||
from sklearn.ensemble._gb_losses import QuantileLossFunction
|
||||
from sklearn.ensemble._gb_losses import BinomialDeviance
|
||||
from sklearn.ensemble._gb_losses import MultinomialDeviance
|
||||
from sklearn.ensemble._gb_losses import ExponentialLoss
|
||||
from sklearn.ensemble._gb_losses import LOSS_FUNCTIONS
|
||||
|
||||
|
||||
def test_binomial_deviance():
|
||||
# Check binomial deviance loss.
|
||||
# Check against alternative definitions in ESLII.
|
||||
bd = BinomialDeviance(2)
|
||||
|
||||
# pred has the same BD for y in {0, 1}
|
||||
assert (bd(np.array([0.0]), np.array([0.0])) ==
|
||||
bd(np.array([1.0]), np.array([0.0])))
|
||||
|
||||
assert_almost_equal(bd(np.array([1.0, 1.0, 1.0]),
|
||||
np.array([100.0, 100.0, 100.0])),
|
||||
0.0)
|
||||
assert_almost_equal(bd(np.array([1.0, 0.0, 0.0]),
|
||||
np.array([100.0, -100.0, -100.0])), 0)
|
||||
|
||||
# check if same results as alternative definition of deviance (from ESLII)
|
||||
def alt_dev(y, pred):
|
||||
return np.mean(np.logaddexp(0.0, -2.0 * (2.0 * y - 1) * pred))
|
||||
|
||||
test_data = [(np.array([1.0, 1.0, 1.0]), np.array([100.0, 100.0, 100.0])),
|
||||
(np.array([0.0, 0.0, 0.0]), np.array([100.0, 100.0, 100.0])),
|
||||
(np.array([0.0, 0.0, 0.0]),
|
||||
np.array([-100.0, -100.0, -100.0])),
|
||||
(np.array([1.0, 1.0, 1.0]),
|
||||
np.array([-100.0, -100.0, -100.0]))]
|
||||
|
||||
for datum in test_data:
|
||||
assert_almost_equal(bd(*datum), alt_dev(*datum))
|
||||
|
||||
# check the gradient against the
|
||||
def alt_ng(y, pred):
|
||||
return (2 * y - 1) / (1 + np.exp(2 * (2 * y - 1) * pred))
|
||||
|
||||
for datum in test_data:
|
||||
assert_almost_equal(bd.negative_gradient(*datum), alt_ng(*datum))
|
||||
|
||||
|
||||
def test_sample_weight_smoke():
|
||||
rng = check_random_state(13)
|
||||
y = rng.rand(100)
|
||||
pred = rng.rand(100)
|
||||
|
||||
# least squares
|
||||
loss = LeastSquaresError(1)
|
||||
loss_wo_sw = loss(y, pred)
|
||||
loss_w_sw = loss(y, pred, np.ones(pred.shape[0], dtype=np.float32))
|
||||
assert_almost_equal(loss_wo_sw, loss_w_sw)
|
||||
|
||||
|
||||
def test_sample_weight_init_estimators():
|
||||
# Smoke test for init estimators with sample weights.
|
||||
rng = check_random_state(13)
|
||||
X = rng.rand(100, 2)
|
||||
sample_weight = np.ones(100)
|
||||
reg_y = rng.rand(100)
|
||||
|
||||
clf_y = rng.randint(0, 2, size=100)
|
||||
|
||||
for Loss in LOSS_FUNCTIONS.values():
|
||||
if Loss is None:
|
||||
continue
|
||||
if issubclass(Loss, RegressionLossFunction):
|
||||
k = 1
|
||||
y = reg_y
|
||||
else:
|
||||
k = 2
|
||||
y = clf_y
|
||||
if Loss.is_multi_class:
|
||||
# skip multiclass
|
||||
continue
|
||||
|
||||
loss = Loss(k)
|
||||
init_est = loss.init_estimator()
|
||||
init_est.fit(X, y)
|
||||
out = loss.get_init_raw_predictions(X, init_est)
|
||||
assert out.shape == (y.shape[0], 1)
|
||||
|
||||
sw_init_est = loss.init_estimator()
|
||||
sw_init_est.fit(X, y, sample_weight=sample_weight)
|
||||
sw_out = loss.get_init_raw_predictions(X, sw_init_est)
|
||||
assert sw_out.shape == (y.shape[0], 1)
|
||||
|
||||
# check if predictions match
|
||||
assert_allclose(out, sw_out, rtol=1e-2)
|
||||
|
||||
|
||||
def test_weighted_percentile():
|
||||
y = np.empty(102, dtype=np.float64)
|
||||
y[:50] = 0
|
||||
y[-51:] = 2
|
||||
y[-1] = 100000
|
||||
y[50] = 1
|
||||
sw = np.ones(102, dtype=np.float64)
|
||||
sw[-1] = 0.0
|
||||
score = _weighted_percentile(y, sw, 50)
|
||||
assert score == 1
|
||||
|
||||
|
||||
def test_weighted_percentile_equal():
|
||||
y = np.empty(102, dtype=np.float64)
|
||||
y.fill(0.0)
|
||||
sw = np.ones(102, dtype=np.float64)
|
||||
sw[-1] = 0.0
|
||||
score = _weighted_percentile(y, sw, 50)
|
||||
assert score == 0
|
||||
|
||||
|
||||
def test_weighted_percentile_zero_weight():
|
||||
y = np.empty(102, dtype=np.float64)
|
||||
y.fill(1.0)
|
||||
sw = np.ones(102, dtype=np.float64)
|
||||
sw.fill(0.0)
|
||||
score = _weighted_percentile(y, sw, 50)
|
||||
assert score == 1.0
|
||||
|
||||
|
||||
def test_quantile_loss_function():
|
||||
# Non regression test for the QuantileLossFunction object
|
||||
# There was a sign problem when evaluating the function
|
||||
# for negative values of 'ytrue - ypred'
|
||||
x = np.asarray([-1.0, 0.0, 1.0])
|
||||
y_found = QuantileLossFunction(1, 0.9)(x, np.zeros_like(x))
|
||||
y_expected = np.asarray([0.1, 0.0, 0.9]).mean()
|
||||
np.testing.assert_allclose(y_found, y_expected)
|
||||
|
||||
|
||||
def test_sample_weight_deviance():
|
||||
# Test if deviance supports sample weights.
|
||||
rng = check_random_state(13)
|
||||
sample_weight = np.ones(100)
|
||||
reg_y = rng.rand(100)
|
||||
clf_y = rng.randint(0, 2, size=100)
|
||||
mclf_y = rng.randint(0, 3, size=100)
|
||||
|
||||
for Loss in LOSS_FUNCTIONS.values():
|
||||
if Loss is None:
|
||||
continue
|
||||
if issubclass(Loss, RegressionLossFunction):
|
||||
k = 1
|
||||
y = reg_y
|
||||
p = reg_y
|
||||
else:
|
||||
k = 2
|
||||
y = clf_y
|
||||
p = clf_y
|
||||
if Loss.is_multi_class:
|
||||
k = 3
|
||||
y = mclf_y
|
||||
# one-hot encoding
|
||||
p = np.zeros((y.shape[0], k), dtype=np.float64)
|
||||
for i in range(k):
|
||||
p[:, i] = y == i
|
||||
|
||||
loss = Loss(k)
|
||||
deviance_w_w = loss(y, p, sample_weight)
|
||||
deviance_wo_w = loss(y, p)
|
||||
assert deviance_wo_w == deviance_w_w
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'n_classes, n_samples', [(3, 100), (5, 57), (7, 13)]
|
||||
)
|
||||
def test_multinomial_deviance(n_classes, n_samples):
|
||||
# Check multinomial deviance with and without sample weights.
|
||||
rng = np.random.RandomState(13)
|
||||
sample_weight = np.ones(n_samples)
|
||||
y_true = rng.randint(0, n_classes, size=n_samples)
|
||||
y_pred = np.zeros((n_samples, n_classes), dtype=np.float64)
|
||||
for klass in range(y_pred.shape[1]):
|
||||
y_pred[:, klass] = y_true == klass
|
||||
|
||||
loss = MultinomialDeviance(n_classes)
|
||||
loss_wo_sw = loss(y_true, y_pred)
|
||||
assert loss_wo_sw > 0
|
||||
loss_w_sw = loss(y_true, y_pred, sample_weight=sample_weight)
|
||||
assert loss_wo_sw == pytest.approx(loss_w_sw)
|
||||
|
||||
# Multinomial deviance uses weighted average loss rather than
|
||||
# weighted sum loss, so we make sure that the value remains the same
|
||||
# when we device the weight by 2.
|
||||
loss_w_sw = loss(y_true, y_pred, sample_weight=0.5 * sample_weight)
|
||||
assert loss_wo_sw == pytest.approx(loss_w_sw)
|
||||
|
||||
|
||||
def test_mdl_computation_weighted():
|
||||
raw_predictions = np.array([[1., -1., -.1], [-2., 1., 2.]])
|
||||
y_true = np.array([0, 1])
|
||||
weights = np.array([1, 3])
|
||||
expected_loss = 1.0909323
|
||||
# MultinomialDeviance loss computation with weights.
|
||||
loss = MultinomialDeviance(3)
|
||||
assert (loss(y_true, raw_predictions, weights)
|
||||
== pytest.approx(expected_loss))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('n', [0, 1, 2])
|
||||
def test_mdl_exception(n):
|
||||
# Check that MultinomialDeviance throws an exception when n_classes <= 2
|
||||
err_msg = 'MultinomialDeviance requires more than 2 classes.'
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
MultinomialDeviance(n)
|
||||
|
||||
|
||||
def test_init_raw_predictions_shapes():
|
||||
# Make sure get_init_raw_predictions returns float64 arrays with shape
|
||||
# (n_samples, K) where K is 1 for binary classification and regression, and
|
||||
# K = n_classes for multiclass classification
|
||||
rng = np.random.RandomState(0)
|
||||
|
||||
n_samples = 100
|
||||
X = rng.normal(size=(n_samples, 5))
|
||||
y = rng.normal(size=n_samples)
|
||||
for loss in (LeastSquaresError(n_classes=1),
|
||||
LeastAbsoluteError(n_classes=1),
|
||||
QuantileLossFunction(n_classes=1),
|
||||
HuberLossFunction(n_classes=1)):
|
||||
init_estimator = loss.init_estimator().fit(X, y)
|
||||
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
|
||||
assert raw_predictions.shape == (n_samples, 1)
|
||||
assert raw_predictions.dtype == np.float64
|
||||
|
||||
y = rng.randint(0, 2, size=n_samples)
|
||||
for loss in (BinomialDeviance(n_classes=2),
|
||||
ExponentialLoss(n_classes=2)):
|
||||
init_estimator = loss.init_estimator().fit(X, y)
|
||||
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
|
||||
assert raw_predictions.shape == (n_samples, 1)
|
||||
assert raw_predictions.dtype == np.float64
|
||||
|
||||
for n_classes in range(3, 5):
|
||||
y = rng.randint(0, n_classes, size=n_samples)
|
||||
loss = MultinomialDeviance(n_classes=n_classes)
|
||||
init_estimator = loss.init_estimator().fit(X, y)
|
||||
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
|
||||
assert raw_predictions.shape == (n_samples, n_classes)
|
||||
assert raw_predictions.dtype == np.float64
|
||||
|
||||
|
||||
def test_init_raw_predictions_values():
    # Make sure get_init_raw_predictions() returns the expected values for
    # each loss.
    rng = np.random.RandomState(0)

    n_samples = 100
    X = rng.normal(size=(n_samples, 5))
    y = rng.normal(size=n_samples)

    # Least squares loss
    loss = LeastSquaresError(n_classes=1)
    init_estimator = loss.init_estimator().fit(X, y)
    raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
    # Make sure baseline prediction is the mean of all targets
    assert_almost_equal(raw_predictions, y.mean())

    # Least absolute and huber loss
    for Loss in (LeastAbsoluteError, HuberLossFunction):
        loss = Loss(n_classes=1)
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        # Make sure baseline prediction is the median of all targets
        assert_almost_equal(raw_predictions, np.median(y))

    # Quantile loss
    for alpha in (.1, .5, .9):
        loss = QuantileLossFunction(n_classes=1, alpha=alpha)
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        # Make sure baseline prediction is the alpha-quantile of all targets
        assert_almost_equal(raw_predictions, np.percentile(y, alpha * 100))

    y = rng.randint(0, 2, size=n_samples)

    # Binomial deviance
    loss = BinomialDeviance(n_classes=2)
    init_estimator = loss.init_estimator().fit(X, y)
    # Make sure baseline prediction is equal to link_function(p), where p
    # is the proba of the positive class. We want predict_proba() to return
    # p, and by definition
    # p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction)
    # So we want raw_prediction = link_function(p) = log(p / (1 - p))
    raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
    p = y.mean()
    assert_almost_equal(raw_predictions, np.log(p / (1 - p)))

    # Exponential loss
    loss = ExponentialLoss(n_classes=2)
    init_estimator = loss.init_estimator().fit(X, y)
    raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
    p = y.mean()
    assert_almost_equal(raw_predictions, .5 * np.log(p / (1 - p)))

    # Multinomial deviance loss
    for n_classes in range(3, 5):
        y = rng.randint(0, n_classes, size=n_samples)
        loss = MultinomialDeviance(n_classes=n_classes)
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        for k in range(n_classes):
            p = (y == k).mean()
            assert_almost_equal(raw_predictions[:, k], np.log(p))


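# Illustrative sketch of the link / inverse-link relationship asserted above
# (hypothetical helper, for exposition only): for binomial deviance the
# initial raw prediction is the log-odds of the positive class, and applying
# the sigmoid inverse link recovers the class prior p; for the exponential
# loss the raw prediction is half the log-odds, so the inverse link is
# sigmoid applied to twice the raw prediction.
def _check_log_odds_roundtrip(p=0.3):
    raw = np.log(p / (1 - p))           # link: log-odds
    p_back = 1 / (1 + np.exp(-raw))     # inverse link: sigmoid
    assert abs(p_back - p) < 1e-12
    raw_exp = .5 * np.log(p / (1 - p))  # exponential loss uses half log-odds
    assert abs(1 / (1 + np.exp(-2 * raw_exp)) - p) < 1e-12
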
@pytest.mark.parametrize('seed', range(5))
def test_lad_equals_quantile_50(seed):
    # Make sure quantile loss with alpha = .5 is equivalent to LAD
    lad = LeastAbsoluteError(n_classes=1)
    ql = QuantileLossFunction(n_classes=1, alpha=0.5)

    n_samples = 50
    rng = np.random.RandomState(seed)
    raw_predictions = rng.normal(size=(n_samples))
    y_true = rng.normal(size=(n_samples))

    lad_loss = lad(y_true, raw_predictions)
    ql_loss = ql(y_true, raw_predictions)
    assert_almost_equal(lad_loss, 2 * ql_loss)

    weights = np.linspace(0, 1, n_samples) ** 2
    lad_weighted_loss = lad(y_true, raw_predictions, sample_weight=weights)
    ql_weighted_loss = ql(y_true, raw_predictions, sample_weight=weights)
    assert_almost_equal(lad_weighted_loss, 2 * ql_weighted_loss)

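# Why the factor of 2 above holds (illustrative sketch, hypothetical helper):
# the alpha-quantile (pinball) loss is alpha * (y - pred) for
# under-predictions and (1 - alpha) * (pred - y) otherwise, so at alpha = 0.5
# it is exactly half the absolute error.
def _pinball_vs_absolute(y=1.7, pred=0.4, alpha=0.5):
    diff = y - pred
    pinball = alpha * diff if diff > 0 else (alpha - 1) * diff
    assert abs(2 * pinball - abs(diff)) < 1e-12
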
358
venv/Lib/site-packages/sklearn/ensemble/tests/test_iforest.py
Normal file
@ -0,0 +1,358 @@
"""
|
||||
Testing for Isolation Forest algorithm (sklearn.ensemble.iforest).
|
||||
"""
|
||||
|
||||
# Authors: Nicolas Goix <nicolas.goix@telecom-paristech.fr>
|
||||
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import pytest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import assert_raises
|
||||
from sklearn.utils._testing import assert_warns_message
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
|
||||
from sklearn.model_selection import ParameterGrid
|
||||
from sklearn.ensemble import IsolationForest
|
||||
from sklearn.ensemble._iforest import _average_path_length
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.datasets import load_diabetes, load_iris
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.metrics import roc_auc_score
|
||||
|
||||
from scipy.sparse import csc_matrix, csr_matrix
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
rng = check_random_state(0)
|
||||
|
||||
# load the iris dataset
|
||||
# and randomly permute it
|
||||
iris = load_iris()
|
||||
perm = rng.permutation(iris.target.size)
|
||||
iris.data = iris.data[perm]
|
||||
iris.target = iris.target[perm]
|
||||
|
||||
# also load the diabetes dataset
|
||||
# and randomly permute it
|
||||
diabetes = load_diabetes()
|
||||
perm = rng.permutation(diabetes.target.size)
|
||||
diabetes.data = diabetes.data[perm]
|
||||
diabetes.target = diabetes.target[perm]
|
||||
|
||||
|
||||
def test_iforest():
|
||||
"""Check Isolation Forest for various parameter settings."""
|
||||
X_train = np.array([[0, 1], [1, 2]])
|
||||
X_test = np.array([[2, 1], [1, 1]])
|
||||
|
||||
grid = ParameterGrid({"n_estimators": [3],
|
||||
"max_samples": [0.5, 1.0, 3],
|
||||
"bootstrap": [True, False]})
|
||||
|
||||
with ignore_warnings():
|
||||
for params in grid:
|
||||
IsolationForest(random_state=rng,
|
||||
**params).fit(X_train).predict(X_test)
|
||||
|
||||
|
||||
def test_iforest_sparse():
|
||||
"""Check IForest for various parameter settings on sparse input."""
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50],
|
||||
diabetes.target[:50],
|
||||
random_state=rng)
|
||||
grid = ParameterGrid({"max_samples": [0.5, 1.0],
|
||||
"bootstrap": [True, False]})
|
||||
|
||||
for sparse_format in [csc_matrix, csr_matrix]:
|
||||
X_train_sparse = sparse_format(X_train)
|
||||
X_test_sparse = sparse_format(X_test)
|
||||
|
||||
for params in grid:
|
||||
# Trained on sparse format
|
||||
sparse_classifier = IsolationForest(
|
||||
n_estimators=10, random_state=1, **params).fit(X_train_sparse)
|
||||
sparse_results = sparse_classifier.predict(X_test_sparse)
|
||||
|
||||
# Trained on dense format
|
||||
dense_classifier = IsolationForest(
|
||||
n_estimators=10, random_state=1, **params).fit(X_train)
|
||||
dense_results = dense_classifier.predict(X_test)
|
||||
|
||||
assert_array_equal(sparse_results, dense_results)
|
||||
|
||||
|
||||
def test_iforest_error():
|
||||
"""Test that it gives proper exception on deficient input."""
|
||||
X = iris.data
|
||||
|
||||
# Test max_samples
|
||||
assert_raises(ValueError,
|
||||
IsolationForest(max_samples=-1).fit, X)
|
||||
assert_raises(ValueError,
|
||||
IsolationForest(max_samples=0.0).fit, X)
|
||||
assert_raises(ValueError,
|
||||
IsolationForest(max_samples=2.0).fit, X)
|
||||
# The dataset has less than 256 samples, explicitly setting
|
||||
# max_samples > n_samples should result in a warning. If not set
|
||||
# explicitly there should be no warning
|
||||
assert_warns_message(UserWarning,
|
||||
"max_samples will be set to n_samples for estimation",
|
||||
IsolationForest(max_samples=1000).fit, X)
|
||||
# note that assert_no_warnings does not apply since it enables a
|
||||
# PendingDeprecationWarning triggered by scipy.sparse's use of
|
||||
# np.matrix. See issue #11251.
|
||||
with pytest.warns(None) as record:
|
||||
IsolationForest(max_samples='auto').fit(X)
|
||||
user_warnings = [each for each in record
|
||||
if issubclass(each.category, UserWarning)]
|
||||
assert len(user_warnings) == 0
|
||||
with pytest.warns(None) as record:
|
||||
IsolationForest(max_samples=np.int64(2)).fit(X)
|
||||
user_warnings = [each for each in record
|
||||
if issubclass(each.category, UserWarning)]
|
||||
assert len(user_warnings) == 0
|
||||
|
||||
assert_raises(ValueError, IsolationForest(max_samples='foobar').fit, X)
|
||||
assert_raises(ValueError, IsolationForest(max_samples=1.5).fit, X)
|
||||
|
||||
# test that X_test n_features matches that of X_train:
|
||||
assert_raises(ValueError, IsolationForest().fit(X).predict, X[:, 1:])
|
||||
|
||||
# test that behaviour='old' will raise an error
|
||||
msg = "The old behaviour of IsolationForest is not implemented anymore."
|
||||
with pytest.raises(NotImplementedError, match=msg):
|
||||
IsolationForest(behaviour='old').fit(X)
|
||||
|
||||
|
||||
def test_recalculate_max_depth():
|
||||
"""Check max_depth recalculation when max_samples is reset to n_samples"""
|
||||
X = iris.data
|
||||
clf = IsolationForest().fit(X)
|
||||
for est in clf.estimators_:
|
||||
assert est.max_depth == int(np.ceil(np.log2(X.shape[0])))
|
||||
|
||||
|
||||
def test_max_samples_attribute():
|
||||
X = iris.data
|
||||
clf = IsolationForest().fit(X)
|
||||
assert clf.max_samples_ == X.shape[0]
|
||||
|
||||
clf = IsolationForest(max_samples=500)
|
||||
assert_warns_message(UserWarning,
|
||||
"max_samples will be set to n_samples for estimation",
|
||||
clf.fit, X)
|
||||
assert clf.max_samples_ == X.shape[0]
|
||||
|
||||
clf = IsolationForest(max_samples=0.4).fit(X)
|
||||
assert clf.max_samples_ == 0.4*X.shape[0]
|
||||
|
||||
|
||||
def test_iforest_parallel_regression():
|
||||
"""Check parallel regression."""
|
||||
rng = check_random_state(0)
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
|
||||
diabetes.target,
|
||||
random_state=rng)
|
||||
|
||||
ensemble = IsolationForest(n_jobs=3,
|
||||
random_state=0).fit(X_train)
|
||||
|
||||
ensemble.set_params(n_jobs=1)
|
||||
y1 = ensemble.predict(X_test)
|
||||
ensemble.set_params(n_jobs=2)
|
||||
y2 = ensemble.predict(X_test)
|
||||
assert_array_almost_equal(y1, y2)
|
||||
|
||||
ensemble = IsolationForest(n_jobs=1,
|
||||
random_state=0).fit(X_train)
|
||||
|
||||
y3 = ensemble.predict(X_test)
|
||||
assert_array_almost_equal(y1, y3)
|
||||
|
||||
|
||||
def test_iforest_performance():
|
||||
"""Test Isolation Forest performs well"""
|
||||
|
||||
# Generate train/test data
|
||||
rng = check_random_state(2)
|
||||
X = 0.3 * rng.randn(120, 2)
|
||||
X_train = np.r_[X + 2, X - 2]
|
||||
X_train = X_train[:100]
|
||||
|
||||
# Generate some abnormal novel observations
|
||||
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
|
||||
X_test = np.r_[X[100:], X_outliers]
|
||||
y_test = np.array([0] * 20 + [1] * 20)
|
||||
|
||||
# fit the model
|
||||
clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)
|
||||
|
||||
# predict scores (the lower, the more normal)
|
||||
y_pred = - clf.decision_function(X_test)
|
||||
|
||||
# check that there are at most 6 errors (false positive or false negative)
|
||||
assert roc_auc_score(y_test, y_pred) > 0.98
|
||||
|
||||
|
||||
@pytest.mark.parametrize("contamination", [0.25, "auto"])
|
||||
def test_iforest_works(contamination):
|
||||
# toy sample (the last two samples are outliers)
|
||||
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]
|
||||
|
||||
# Test IsolationForest
|
||||
clf = IsolationForest(random_state=rng, contamination=contamination)
|
||||
clf.fit(X)
|
||||
decision_func = -clf.decision_function(X)
|
||||
pred = clf.predict(X)
|
||||
# assert detect outliers:
|
||||
assert np.min(decision_func[-2:]) > np.max(decision_func[:-2])
|
||||
assert_array_equal(pred, 6 * [1] + 2 * [-1])
|
||||
|
||||
|
||||
def test_max_samples_consistency():
|
||||
# Make sure validated max_samples in iforest and BaseBagging are identical
|
||||
X = iris.data
|
||||
clf = IsolationForest().fit(X)
|
||||
assert clf.max_samples_ == clf._max_samples
|
||||
|
||||
|
||||
def test_iforest_subsampled_features():
|
||||
# It tests non-regression for #5732 which failed at predict.
|
||||
rng = check_random_state(0)
|
||||
X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50],
|
||||
diabetes.target[:50],
|
||||
random_state=rng)
|
||||
clf = IsolationForest(max_features=0.8)
|
||||
clf.fit(X_train, y_train)
|
||||
clf.predict(X_test)
|
||||
|
||||
|
||||
def test_iforest_average_path_length():
    # It tests non-regression for #8549 which used the wrong formula
    # for average path length, strictly for the integer case
    # Updated to check average path length when input is <= 2 (issue #11839)
    result_one = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0
    result_two = 2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0
    assert_allclose(_average_path_length([0]), [0.0])
    assert_allclose(_average_path_length([1]), [0.0])
    assert_allclose(_average_path_length([2]), [1.0])
    assert_allclose(_average_path_length([5]), [result_one])
    assert_allclose(_average_path_length([999]), [result_two])
    assert_allclose(
        _average_path_length(np.array([1, 2, 5, 999])),
        [0.0, 1.0, result_one, result_two],
    )
    # _average_path_length is increasing
    avg_path_length = _average_path_length(np.arange(5))
    assert_array_equal(avg_path_length, np.sort(avg_path_length))

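# Numeric reading of the approximation used in test_iforest_average_path_length
# above (illustrative only): for a subsample of n = 5 points the expected
# average path length is 2 * (ln(4) + euler_gamma) - 2 * 4 / 5, roughly
# 2 * (1.3863 + 0.5772) - 1.6, i.e. about 2.327.
_expected_c5 = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0  # ~2.327
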
def test_score_samples():
|
||||
X_train = [[1, 1], [1, 2], [2, 1]]
|
||||
clf1 = IsolationForest(contamination=0.1).fit(X_train)
|
||||
clf2 = IsolationForest().fit(X_train)
|
||||
assert_array_equal(clf1.score_samples([[2., 2.]]),
|
||||
clf1.decision_function([[2., 2.]]) + clf1.offset_)
|
||||
assert_array_equal(clf2.score_samples([[2., 2.]]),
|
||||
clf2.decision_function([[2., 2.]]) + clf2.offset_)
|
||||
assert_array_equal(clf1.score_samples([[2., 2.]]),
|
||||
clf2.score_samples([[2., 2.]]))
|
||||
|
||||
|
||||
def test_iforest_warm_start():
|
||||
"""Test iterative addition of iTrees to an iForest """
|
||||
|
||||
rng = check_random_state(0)
|
||||
X = rng.randn(20, 2)
|
||||
|
||||
# fit first 10 trees
|
||||
clf = IsolationForest(n_estimators=10, max_samples=20,
|
||||
random_state=rng, warm_start=True)
|
||||
clf.fit(X)
|
||||
# remember the 1st tree
|
||||
tree_1 = clf.estimators_[0]
|
||||
# fit another 10 trees
|
||||
clf.set_params(n_estimators=20)
|
||||
clf.fit(X)
|
||||
# expecting 20 fitted trees and no overwritten trees
|
||||
assert len(clf.estimators_) == 20
|
||||
assert clf.estimators_[0] is tree_1
|
||||
|
||||
|
||||
# mock get_chunk_n_rows to actually test more than one chunk (here one
|
||||
# chunk = 3 rows:
|
||||
@patch(
|
||||
"sklearn.ensemble._iforest.get_chunk_n_rows",
|
||||
side_effect=Mock(**{"return_value": 3}),
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]
|
||||
)
|
||||
def test_iforest_chunks_works1(
|
||||
mocked_get_chunk, contamination, n_predict_calls
|
||||
):
|
||||
test_iforest_works(contamination)
|
||||
assert mocked_get_chunk.call_count == n_predict_calls
|
||||
|
||||
|
||||
# idem with chunk_size = 10 rows
|
||||
@patch(
|
||||
"sklearn.ensemble._iforest.get_chunk_n_rows",
|
||||
side_effect=Mock(**{"return_value": 10}),
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]
|
||||
)
|
||||
def test_iforest_chunks_works2(
|
||||
mocked_get_chunk, contamination, n_predict_calls
|
||||
):
|
||||
test_iforest_works(contamination)
|
||||
assert mocked_get_chunk.call_count == n_predict_calls
|
||||
|
||||
|
||||
def test_iforest_deprecation():
|
||||
iforest = IsolationForest(behaviour='new')
|
||||
warn_msg = "'behaviour' is deprecated in 0.22 and will be removed in 0.24"
|
||||
with pytest.warns(FutureWarning, match=warn_msg):
|
||||
iforest.fit(iris.data)
|
||||
|
||||
|
||||
def test_iforest_with_uniform_data():
|
||||
"""Test whether iforest predicts inliers when using uniform data"""
|
||||
|
||||
# 2-d array of all 1s
|
||||
X = np.ones((100, 10))
|
||||
iforest = IsolationForest()
|
||||
iforest.fit(X)
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
|
||||
assert all(iforest.predict(X) == 1)
|
||||
assert all(iforest.predict(rng.randn(100, 10)) == 1)
|
||||
assert all(iforest.predict(X + 1) == 1)
|
||||
assert all(iforest.predict(X - 1) == 1)
|
||||
|
||||
# 2-d array where columns contain the same value across rows
|
||||
X = np.repeat(rng.randn(1, 10), 100, 0)
|
||||
iforest = IsolationForest()
|
||||
iforest.fit(X)
|
||||
|
||||
assert all(iforest.predict(X) == 1)
|
||||
assert all(iforest.predict(rng.randn(100, 10)) == 1)
|
||||
assert all(iforest.predict(np.ones((100, 10))) == 1)
|
||||
|
||||
# Single row
|
||||
X = rng.randn(1, 10)
|
||||
iforest = IsolationForest()
|
||||
iforest.fit(X)
|
||||
|
||||
assert all(iforest.predict(X) == 1)
|
||||
assert all(iforest.predict(rng.randn(100, 10)) == 1)
|
||||
assert all(iforest.predict(np.ones((100, 10))) == 1)
|
524
venv/Lib/site-packages/sklearn/ensemble/tests/test_stacking.py
Normal file
@ -0,0 +1,524 @@
"""Test the stacking classifier and regressor."""
|
||||
|
||||
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
import scipy.sparse as sparse
|
||||
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.base import ClassifierMixin
|
||||
from sklearn.base import RegressorMixin
|
||||
from sklearn.base import clone
|
||||
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.datasets import load_diabetes
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
from sklearn.datasets import make_regression
|
||||
from sklearn.datasets import make_classification
|
||||
|
||||
from sklearn.dummy import DummyClassifier
|
||||
from sklearn.dummy import DummyRegressor
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.svm import LinearSVC
|
||||
from sklearn.svm import LinearSVR
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.tree import DecisionTreeRegressor
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.preprocessing import scale
|
||||
|
||||
from sklearn.ensemble import StackingClassifier
|
||||
from sklearn.ensemble import StackingRegressor
|
||||
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
from sklearn.model_selection import KFold
|
||||
|
||||
from sklearn.utils._mocking import CheckingClassifier
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
from sklearn.utils._testing import assert_allclose_dense_sparse
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
from sklearn.utils.estimator_checks import check_estimator
|
||||
from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
|
||||
|
||||
X_diabetes, y_diabetes = load_diabetes(return_X_y=True)
|
||||
X_iris, y_iris = load_iris(return_X_y=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"cv", [3, StratifiedKFold(n_splits=3, shuffle=True, random_state=42)]
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"final_estimator", [None, RandomForestClassifier(random_state=42)]
|
||||
)
|
||||
@pytest.mark.parametrize("passthrough", [False, True])
|
||||
def test_stacking_classifier_iris(cv, final_estimator, passthrough):
|
||||
# prescale the data to avoid convergence warning without using a pipeline
|
||||
# for later assert
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
scale(X_iris), y_iris, stratify=y_iris, random_state=42
|
||||
)
|
||||
estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
|
||||
clf = StackingClassifier(
|
||||
estimators=estimators, final_estimator=final_estimator, cv=cv,
|
||||
passthrough=passthrough
|
||||
)
|
||||
clf.fit(X_train, y_train)
|
||||
clf.predict(X_test)
|
||||
clf.predict_proba(X_test)
|
||||
assert clf.score(X_test, y_test) > 0.8
|
||||
|
||||
X_trans = clf.transform(X_test)
|
||||
expected_column_count = 10 if passthrough else 6
|
||||
assert X_trans.shape[1] == expected_column_count
|
||||
if passthrough:
|
||||
assert_allclose(X_test, X_trans[:, -4:])
|
||||
|
||||
clf.set_params(lr='drop')
|
||||
clf.fit(X_train, y_train)
|
||||
clf.predict(X_test)
|
||||
clf.predict_proba(X_test)
|
||||
if final_estimator is None:
|
||||
# LogisticRegression has decision_function method
|
||||
clf.decision_function(X_test)
|
||||
|
||||
X_trans = clf.transform(X_test)
|
||||
expected_column_count_drop = 7 if passthrough else 3
|
||||
assert X_trans.shape[1] == expected_column_count_drop
|
||||
if passthrough:
|
||||
assert_allclose(X_test, X_trans[:, -4:])
|
||||
|
||||
|
||||
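# Note on the expected column counts in test_stacking_classifier_iris above
# (illustrative): on 3-class iris each of the two base estimators contributes
# three columns to transform() (predict_proba for LogisticRegression,
# decision_function for LinearSVC), giving 6 meta-features, plus the 4
# original features when passthrough=True (10 in total). With 'lr' dropped a
# single estimator remains: 3 columns, or 3 + 4 = 7 with passthrough.
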
def test_stacking_classifier_drop_column_binary_classification():
|
||||
# check that a column is dropped in binary classification
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
X_train, X_test, y_train, _ = train_test_split(
|
||||
scale(X), y, stratify=y, random_state=42
|
||||
)
|
||||
|
||||
# both classifiers implement 'predict_proba' and will both drop one column
|
||||
estimators = [('lr', LogisticRegression()),
|
||||
('rf', RandomForestClassifier(random_state=42))]
|
||||
clf = StackingClassifier(estimators=estimators, cv=3)
|
||||
|
||||
clf.fit(X_train, y_train)
|
||||
X_trans = clf.transform(X_test)
|
||||
assert X_trans.shape[1] == 2
|
||||
|
||||
# LinearSVC does not implement 'predict_proba' and will not drop one column
|
||||
estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
|
||||
clf.set_params(estimators=estimators)
|
||||
|
||||
clf.fit(X_train, y_train)
|
||||
X_trans = clf.transform(X_test)
|
||||
assert X_trans.shape[1] == 2
|
||||
|
||||
|
||||
def test_stacking_classifier_drop_estimator():
|
||||
# prescale the data to avoid convergence warning without using a pipeline
|
||||
# for later assert
|
||||
X_train, X_test, y_train, _ = train_test_split(
|
||||
scale(X_iris), y_iris, stratify=y_iris, random_state=42
|
||||
)
|
||||
estimators = [('lr', 'drop'), ('svc', LinearSVC(random_state=0))]
|
||||
rf = RandomForestClassifier(n_estimators=10, random_state=42)
|
||||
clf = StackingClassifier(
|
||||
estimators=[('svc', LinearSVC(random_state=0))],
|
||||
final_estimator=rf, cv=5
|
||||
)
|
||||
clf_drop = StackingClassifier(
|
||||
estimators=estimators, final_estimator=rf, cv=5
|
||||
)
|
||||
|
||||
clf.fit(X_train, y_train)
|
||||
clf_drop.fit(X_train, y_train)
|
||||
assert_allclose(clf.predict(X_test), clf_drop.predict(X_test))
|
||||
assert_allclose(clf.predict_proba(X_test), clf_drop.predict_proba(X_test))
|
||||
assert_allclose(clf.transform(X_test), clf_drop.transform(X_test))
|
||||
|
||||
|
||||
def test_stacking_regressor_drop_estimator():
|
||||
# prescale the data to avoid convergence warning without using a pipeline
|
||||
# for later assert
|
||||
X_train, X_test, y_train, _ = train_test_split(
|
||||
scale(X_diabetes), y_diabetes, random_state=42
|
||||
)
|
||||
estimators = [('lr', 'drop'), ('svr', LinearSVR(random_state=0))]
|
||||
rf = RandomForestRegressor(n_estimators=10, random_state=42)
|
||||
reg = StackingRegressor(
|
||||
estimators=[('svr', LinearSVR(random_state=0))],
|
||||
final_estimator=rf, cv=5
|
||||
)
|
||||
reg_drop = StackingRegressor(
|
||||
estimators=estimators, final_estimator=rf, cv=5
|
||||
)
|
||||
|
||||
reg.fit(X_train, y_train)
|
||||
reg_drop.fit(X_train, y_train)
|
||||
assert_allclose(reg.predict(X_test), reg_drop.predict(X_test))
|
||||
assert_allclose(reg.transform(X_test), reg_drop.transform(X_test))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"cv", [3, KFold(n_splits=3, shuffle=True, random_state=42)]
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"final_estimator, predict_params",
|
||||
[(None, {}),
|
||||
(RandomForestRegressor(random_state=42), {}),
|
||||
(DummyRegressor(), {'return_std': True})]
|
||||
)
|
||||
@pytest.mark.parametrize("passthrough", [False, True])
|
||||
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params,
|
||||
passthrough):
|
||||
# prescale the data to avoid convergence warning without using a pipeline
|
||||
# for later assert
|
||||
X_train, X_test, y_train, _ = train_test_split(
|
||||
scale(X_diabetes), y_diabetes, random_state=42
|
||||
)
|
||||
estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
|
||||
reg = StackingRegressor(
|
||||
estimators=estimators, final_estimator=final_estimator, cv=cv,
|
||||
passthrough=passthrough
|
||||
)
|
||||
reg.fit(X_train, y_train)
|
||||
result = reg.predict(X_test, **predict_params)
|
||||
expected_result_length = 2 if predict_params else 1
|
||||
if predict_params:
|
||||
assert len(result) == expected_result_length
|
||||
|
||||
X_trans = reg.transform(X_test)
|
||||
expected_column_count = 12 if passthrough else 2
|
||||
assert X_trans.shape[1] == expected_column_count
|
||||
if passthrough:
|
||||
assert_allclose(X_test, X_trans[:, -10:])
|
||||
|
||||
reg.set_params(lr='drop')
|
||||
reg.fit(X_train, y_train)
|
||||
reg.predict(X_test)
|
||||
|
||||
X_trans = reg.transform(X_test)
|
||||
expected_column_count_drop = 11 if passthrough else 1
|
||||
assert X_trans.shape[1] == expected_column_count_drop
|
||||
if passthrough:
|
||||
assert_allclose(X_test, X_trans[:, -10:])
|
||||
|
||||
|
||||
@pytest.mark.parametrize('fmt', ['csc', 'csr', 'coo'])
|
||||
def test_stacking_regressor_sparse_passthrough(fmt):
|
||||
# Check passthrough behavior on a sparse X matrix
|
||||
X_train, X_test, y_train, _ = train_test_split(
|
||||
sparse.coo_matrix(scale(X_diabetes)).asformat(fmt),
|
||||
y_diabetes, random_state=42
|
||||
)
|
||||
estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
|
||||
rf = RandomForestRegressor(n_estimators=10, random_state=42)
|
||||
clf = StackingRegressor(
|
||||
estimators=estimators, final_estimator=rf, cv=5, passthrough=True
|
||||
)
|
||||
clf.fit(X_train, y_train)
|
||||
X_trans = clf.transform(X_test)
|
||||
assert_allclose_dense_sparse(X_test, X_trans[:, -10:])
|
||||
assert sparse.issparse(X_trans)
|
||||
assert X_test.format == X_trans.format
|
||||
|
||||
|
||||
@pytest.mark.parametrize('fmt', ['csc', 'csr', 'coo'])
|
||||
def test_stacking_classifier_sparse_passthrough(fmt):
|
||||
# Check passthrough behavior on a sparse X matrix
|
||||
X_train, X_test, y_train, _ = train_test_split(
|
||||
sparse.coo_matrix(scale(X_iris)).asformat(fmt),
|
||||
y_iris, random_state=42
|
||||
)
|
||||
estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
|
||||
rf = RandomForestClassifier(n_estimators=10, random_state=42)
|
||||
clf = StackingClassifier(
|
||||
estimators=estimators, final_estimator=rf, cv=5, passthrough=True
|
||||
)
|
||||
clf.fit(X_train, y_train)
|
||||
X_trans = clf.transform(X_test)
|
||||
assert_allclose_dense_sparse(X_test, X_trans[:, -4:])
|
||||
assert sparse.issparse(X_trans)
|
||||
assert X_test.format == X_trans.format
|
||||
|
||||
|
||||
def test_stacking_classifier_drop_binary_prob():
|
||||
# check that classifier will drop one of the probability column for
|
||||
# binary classification problem
|
||||
|
||||
# Select only the 2 first classes
|
||||
X_, y_ = scale(X_iris[:100]), y_iris[:100]
|
||||
|
||||
estimators = [
|
||||
('lr', LogisticRegression()), ('rf', RandomForestClassifier())
|
||||
]
|
||||
clf = StackingClassifier(estimators=estimators)
|
||||
clf.fit(X_, y_)
|
||||
X_meta = clf.transform(X_)
|
||||
assert X_meta.shape[1] == 2
|
||||
|
||||
|
||||
class NoWeightRegressor(BaseEstimator, RegressorMixin):
|
||||
def fit(self, X, y):
|
||||
self.reg = DummyRegressor()
|
||||
return self.reg.fit(X, y)
|
||||
|
||||
def predict(self, X):
|
||||
return np.ones(X.shape[0])
|
||||
|
||||
|
||||
class NoWeightClassifier(BaseEstimator, ClassifierMixin):
|
||||
def fit(self, X, y):
|
||||
self.clf = DummyClassifier(strategy='stratified')
|
||||
return self.clf.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"y, params, type_err, msg_err",
|
||||
[(y_iris,
|
||||
{'estimators': None},
|
||||
ValueError, "Invalid 'estimators' attribute,"),
|
||||
(y_iris,
|
||||
{'estimators': []},
|
||||
ValueError, "Invalid 'estimators' attribute,"),
|
||||
(y_iris,
|
||||
{'estimators': [('lr', LogisticRegression()),
|
||||
('svm', SVC(max_iter=5e4))],
|
||||
'stack_method': 'predict_proba'},
|
||||
ValueError, 'does not implement the method predict_proba'),
|
||||
(y_iris,
|
||||
{'estimators': [('lr', LogisticRegression()),
|
||||
('cor', NoWeightClassifier())]},
|
||||
TypeError, 'does not support sample weight'),
|
||||
(y_iris,
|
||||
{'estimators': [('lr', LogisticRegression()),
|
||||
('cor', LinearSVC(max_iter=5e4))],
|
||||
'final_estimator': NoWeightClassifier()},
|
||||
TypeError, 'does not support sample weight')]
|
||||
)
|
||||
def test_stacking_classifier_error(y, params, type_err, msg_err):
|
||||
with pytest.raises(type_err, match=msg_err):
|
||||
clf = StackingClassifier(**params, cv=3)
|
||||
clf.fit(
|
||||
scale(X_iris), y, sample_weight=np.ones(X_iris.shape[0])
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"y, params, type_err, msg_err",
|
||||
[(y_diabetes,
|
||||
{'estimators': None},
|
||||
ValueError, "Invalid 'estimators' attribute,"),
|
||||
(y_diabetes,
|
||||
{'estimators': []},
|
||||
ValueError, "Invalid 'estimators' attribute,"),
|
||||
(y_diabetes,
|
||||
{'estimators': [('lr', LinearRegression()),
|
||||
('cor', NoWeightRegressor())]},
|
||||
TypeError, 'does not support sample weight'),
|
||||
(y_diabetes,
|
||||
{'estimators': [('lr', LinearRegression()),
|
||||
('cor', LinearSVR())],
|
||||
'final_estimator': NoWeightRegressor()},
|
||||
TypeError, 'does not support sample weight')]
|
||||
)
|
||||
def test_stacking_regressor_error(y, params, type_err, msg_err):
|
||||
with pytest.raises(type_err, match=msg_err):
|
||||
reg = StackingRegressor(**params, cv=3)
|
||||
reg.fit(
|
||||
scale(X_diabetes), y, sample_weight=np.ones(X_diabetes.shape[0])
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"estimator, X, y",
|
||||
[(StackingClassifier(
|
||||
estimators=[('lr', LogisticRegression(random_state=0)),
|
||||
('svm', LinearSVC(random_state=0))]),
|
||||
X_iris[:100], y_iris[:100]), # keep only classes 0 and 1
|
||||
(StackingRegressor(
|
||||
estimators=[('lr', LinearRegression()),
|
||||
('svm', LinearSVR(random_state=0))]),
|
||||
X_diabetes, y_diabetes)],
|
||||
ids=['StackingClassifier', 'StackingRegressor']
|
||||
)
|
||||
def test_stacking_randomness(estimator, X, y):
|
||||
# checking that fixing the random state of the CV will lead to the same
|
||||
# results
|
||||
estimator_full = clone(estimator)
|
||||
estimator_full.set_params(
|
||||
cv=KFold(shuffle=True, random_state=np.random.RandomState(0))
|
||||
)
|
||||
|
||||
estimator_drop = clone(estimator)
|
||||
estimator_drop.set_params(lr='drop')
|
||||
estimator_drop.set_params(
|
||||
cv=KFold(shuffle=True, random_state=np.random.RandomState(0))
|
||||
)
|
||||
|
||||
assert_allclose(
|
||||
estimator_full.fit(X, y).transform(X)[:, 1:],
|
||||
estimator_drop.fit(X, y).transform(X)
|
||||
)
|
||||
|
||||
|
||||
# These warnings are raised due to _BaseComposition
|
||||
@pytest.mark.filterwarnings("ignore:TypeError occurred during set_params")
|
||||
@pytest.mark.filterwarnings("ignore:Estimator's parameters changed after")
|
||||
@pytest.mark.parametrize(
|
||||
"estimator",
|
||||
[StackingClassifier(
|
||||
estimators=[('lr', LogisticRegression(random_state=0)),
|
||||
('tree', DecisionTreeClassifier(random_state=0))]),
|
||||
StackingRegressor(
|
||||
estimators=[('lr', LinearRegression()),
|
||||
('tree', DecisionTreeRegressor(random_state=0))])],
|
||||
ids=['StackingClassifier', 'StackingRegressor']
|
||||
)
|
||||
def test_check_estimators_stacking_estimator(estimator):
|
||||
check_estimator(estimator)
|
||||
check_no_attributes_set_in_init(estimator.__class__.__name__, estimator)
|
||||
|
||||
|
||||
def test_stacking_classifier_stratify_default():
|
||||
# check that we stratify the classes for the default CV
|
||||
clf = StackingClassifier(
|
||||
estimators=[('lr', LogisticRegression(max_iter=1e4)),
|
||||
('svm', LinearSVC(max_iter=1e4))]
|
||||
)
|
||||
# since iris is not shuffled, a simple k-fold would not contain the
|
||||
# 3 classes during training
|
||||
clf.fit(X_iris, y_iris)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"stacker, X, y",
|
||||
[(StackingClassifier(
|
||||
estimators=[('lr', LogisticRegression()),
|
||||
('svm', LinearSVC(random_state=42))],
|
||||
final_estimator=LogisticRegression(),
|
||||
cv=KFold(shuffle=True, random_state=42)),
|
||||
*load_breast_cancer(return_X_y=True)),
|
||||
(StackingRegressor(
|
||||
estimators=[('lr', LinearRegression()),
|
||||
('svm', LinearSVR(random_state=42))],
|
||||
final_estimator=LinearRegression(),
|
||||
cv=KFold(shuffle=True, random_state=42)),
|
||||
X_diabetes, y_diabetes)],
|
||||
ids=['StackingClassifier', 'StackingRegressor']
|
||||
)
|
||||
def test_stacking_with_sample_weight(stacker, X, y):
|
||||
# check that sample weights have an influence on the fitting
# note: ConvergenceWarnings are caught since we are not worried about
# convergence here
|
||||
n_half_samples = len(y) // 2
|
||||
total_sample_weight = np.array(
|
||||
[0.1] * n_half_samples + [0.9] * (len(y) - n_half_samples)
|
||||
)
|
||||
X_train, X_test, y_train, _, sample_weight_train, _ = train_test_split(
|
||||
X, y, total_sample_weight, random_state=42
|
||||
)
|
||||
|
||||
with ignore_warnings(category=ConvergenceWarning):
|
||||
stacker.fit(X_train, y_train)
|
||||
y_pred_no_weight = stacker.predict(X_test)
|
||||
|
||||
with ignore_warnings(category=ConvergenceWarning):
|
||||
stacker.fit(X_train, y_train, sample_weight=np.ones(y_train.shape))
|
||||
y_pred_unit_weight = stacker.predict(X_test)
|
||||
|
||||
assert_allclose(y_pred_no_weight, y_pred_unit_weight)
|
||||
|
||||
with ignore_warnings(category=ConvergenceWarning):
|
||||
stacker.fit(X_train, y_train, sample_weight=sample_weight_train)
|
||||
y_pred_biased = stacker.predict(X_test)
|
||||
|
||||
assert np.abs(y_pred_no_weight - y_pred_biased).sum() > 0
|
||||
|
||||
|
||||
def test_stacking_classifier_sample_weight_fit_param():
|
||||
# check sample_weight is passed to all invocations of fit
|
||||
stacker = StackingClassifier(
|
||||
estimators=[
|
||||
('lr', CheckingClassifier(expected_fit_params=['sample_weight']))
|
||||
],
|
||||
final_estimator=CheckingClassifier(
|
||||
expected_fit_params=['sample_weight']
|
||||
)
|
||||
)
|
||||
stacker.fit(X_iris, y_iris, sample_weight=np.ones(X_iris.shape[0]))
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
|
||||
@pytest.mark.parametrize(
|
||||
"stacker, X, y",
|
||||
[(StackingClassifier(
|
||||
estimators=[('lr', LogisticRegression()),
|
||||
('svm', LinearSVC(random_state=42))],
|
||||
final_estimator=LogisticRegression()),
|
||||
*load_breast_cancer(return_X_y=True)),
|
||||
(StackingRegressor(
|
||||
estimators=[('lr', LinearRegression()),
|
||||
('svm', LinearSVR(random_state=42))],
|
||||
final_estimator=LinearRegression()),
|
||||
X_diabetes, y_diabetes)],
|
||||
ids=['StackingClassifier', 'StackingRegressor']
|
||||
)
|
||||
def test_stacking_cv_influence(stacker, X, y):
|
||||
# check that the stacking affects the fit of the final estimator but not
|
||||
# the fit of the base estimators
|
||||
# note: ConvergenceWarnings are caught since we are not worried about
# convergence here
|
||||
stacker_cv_3 = clone(stacker)
|
||||
stacker_cv_5 = clone(stacker)
|
||||
|
||||
stacker_cv_3.set_params(cv=3)
|
||||
stacker_cv_5.set_params(cv=5)
|
||||
|
||||
stacker_cv_3.fit(X, y)
|
||||
stacker_cv_5.fit(X, y)
|
||||
|
||||
# the base estimators should be identical
|
||||
for est_cv_3, est_cv_5 in zip(stacker_cv_3.estimators_,
|
||||
stacker_cv_5.estimators_):
|
||||
assert_allclose(est_cv_3.coef_, est_cv_5.coef_)
|
||||
|
||||
# the final estimator should be different
|
||||
with pytest.raises(AssertionError, match='Not equal'):
|
||||
assert_allclose(stacker_cv_3.final_estimator_.coef_,
|
||||
stacker_cv_5.final_estimator_.coef_)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("make_dataset, Stacking, Estimator", [
|
||||
(make_classification, StackingClassifier, LogisticRegression),
|
||||
(make_regression, StackingRegressor, LinearRegression)
|
||||
])
|
||||
def test_stacking_without_n_features_in(make_dataset, Stacking, Estimator):
|
||||
# Stacking supports estimators without `n_features_in_`. Regression test
|
||||
# for #17353
|
||||
|
||||
class MyEstimator(Estimator):
|
||||
"""Estimator without n_features_in_"""
|
||||
def fit(self, X, y):
|
||||
super().fit(X, y)
|
||||
del self.n_features_in_
|
||||
|
||||
X, y = make_dataset(random_state=0, n_samples=100)
|
||||
stacker = Stacking(estimators=[('lr', MyEstimator())])
|
||||
|
||||
msg = f"{Stacking.__name__} object has no attribute n_features_in_"
|
||||
with pytest.raises(AttributeError, match=msg):
|
||||
stacker.n_features_in_
|
||||
|
||||
# Does not raise
|
||||
stacker.fit(X, y)
|
||||
|
||||
msg = "'MyEstimator' object has no attribute 'n_features_in_'"
|
||||
with pytest.raises(AttributeError, match=msg):
|
||||
stacker.n_features_in_
|
574
venv/Lib/site-packages/sklearn/ensemble/tests/test_voting.py
Normal file
@ -0,0 +1,574 @@
"""Testing for the VotingClassifier and VotingRegressor"""
|
||||
|
||||
import pytest
|
||||
import re
|
||||
import numpy as np
|
||||
|
||||
from sklearn.utils._testing import assert_almost_equal, assert_array_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import assert_raise_message
|
||||
from sklearn.utils.estimator_checks import check_estimator
|
||||
from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
|
||||
from sklearn.exceptions import NotFittedError
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.ensemble import VotingClassifier, VotingRegressor
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.tree import DecisionTreeRegressor
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn import datasets
|
||||
from sklearn.model_selection import cross_val_score, train_test_split
|
||||
from sklearn.datasets import make_multilabel_classification
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin, clone
|
||||
from sklearn.dummy import DummyRegressor
|
||||
|
||||
|
||||
# Load datasets
|
||||
iris = datasets.load_iris()
|
||||
X, y = iris.data[:, 1:3], iris.target
|
||||
|
||||
X_r, y_r = datasets.load_diabetes(return_X_y=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"params, err_msg",
|
||||
[({'estimators': []},
|
||||
"Invalid 'estimators' attribute, 'estimators' should be a list of"),
|
||||
({'estimators': [('lr', LogisticRegression())], 'voting': 'error'},
|
||||
r"Voting must be 'soft' or 'hard'; got \(voting='error'\)"),
|
||||
({'estimators': [('lr', LogisticRegression())], 'weights': [1, 2]},
|
||||
"Number of `estimators` and weights must be equal")]
|
||||
)
|
||||
def test_voting_classifier_estimator_init(params, err_msg):
|
||||
ensemble = VotingClassifier(**params)
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
ensemble.fit(X, y)
|
||||
|
||||
|
||||
def test_predictproba_hardvoting():
|
||||
eclf = VotingClassifier(estimators=[('lr1', LogisticRegression()),
|
||||
('lr2', LogisticRegression())],
|
||||
voting='hard')
|
||||
msg = "predict_proba is not available when voting='hard'"
|
||||
with pytest.raises(AttributeError, match=msg):
|
||||
eclf.predict_proba
|
||||
|
||||
assert not hasattr(eclf, "predict_proba")
|
||||
eclf.fit(X, y)
|
||||
assert not hasattr(eclf, "predict_proba")
|
||||
|
||||
|
||||
def test_notfitted():
|
||||
eclf = VotingClassifier(estimators=[('lr1', LogisticRegression()),
|
||||
('lr2', LogisticRegression())],
|
||||
voting='soft')
|
||||
ereg = VotingRegressor([('dr', DummyRegressor())])
|
||||
msg = ("This %s instance is not fitted yet. Call \'fit\'"
|
||||
" with appropriate arguments before using this estimator.")
|
||||
assert_raise_message(NotFittedError, msg % 'VotingClassifier',
|
||||
eclf.predict, X)
|
||||
assert_raise_message(NotFittedError, msg % 'VotingClassifier',
|
||||
eclf.predict_proba, X)
|
||||
assert_raise_message(NotFittedError, msg % 'VotingClassifier',
|
||||
eclf.transform, X)
|
||||
assert_raise_message(NotFittedError, msg % 'VotingRegressor',
|
||||
ereg.predict, X_r)
|
||||
assert_raise_message(NotFittedError, msg % 'VotingRegressor',
|
||||
ereg.transform, X_r)
|
||||
|
||||
|
||||
def test_majority_label_iris():
|
||||
"""Check classification by majority label on dataset iris."""
|
||||
clf1 = LogisticRegression(solver='liblinear', random_state=123)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
|
||||
clf3 = GaussianNB()
|
||||
eclf = VotingClassifier(estimators=[
|
||||
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
|
||||
voting='hard')
|
||||
scores = cross_val_score(eclf, X, y, scoring='accuracy')
|
||||
assert_almost_equal(scores.mean(), 0.95, decimal=2)
|
||||
|
||||
|
||||
def test_tie_situation():
|
||||
"""Check voting classifier selects smaller class label in tie situation."""
|
||||
clf1 = LogisticRegression(random_state=123, solver='liblinear')
|
||||
clf2 = RandomForestClassifier(random_state=123)
|
||||
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)],
|
||||
voting='hard')
|
||||
assert clf1.fit(X, y).predict(X)[73] == 2
|
||||
assert clf2.fit(X, y).predict(X)[73] == 1
|
||||
assert eclf.fit(X, y).predict(X)[73] == 1
|
||||
|
||||
|
||||
def test_weights_iris():
|
||||
"""Check classification by average probabilities on dataset iris."""
|
||||
clf1 = LogisticRegression(random_state=123)
|
||||
clf2 = RandomForestClassifier(random_state=123)
|
||||
clf3 = GaussianNB()
|
||||
eclf = VotingClassifier(estimators=[
|
||||
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
|
||||
voting='soft',
|
||||
weights=[1, 2, 10])
|
||||
scores = cross_val_score(eclf, X, y, scoring='accuracy')
|
||||
assert_almost_equal(scores.mean(), 0.93, decimal=2)
|
||||
|
||||
|
||||
def test_weights_regressor():
|
||||
"""Check weighted average regression prediction on diabetes dataset."""
|
||||
reg1 = DummyRegressor(strategy='mean')
|
||||
reg2 = DummyRegressor(strategy='median')
|
||||
reg3 = DummyRegressor(strategy='quantile', quantile=.2)
|
||||
ereg = VotingRegressor([('mean', reg1), ('median', reg2),
|
||||
('quantile', reg3)], weights=[1, 2, 10])
|
||||
|
||||
X_r_train, X_r_test, y_r_train, y_r_test = \
|
||||
train_test_split(X_r, y_r, test_size=.25)
|
||||
|
||||
reg1_pred = reg1.fit(X_r_train, y_r_train).predict(X_r_test)
|
||||
reg2_pred = reg2.fit(X_r_train, y_r_train).predict(X_r_test)
|
||||
reg3_pred = reg3.fit(X_r_train, y_r_train).predict(X_r_test)
|
||||
ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test)
|
||||
|
||||
avg = np.average(np.asarray([reg1_pred, reg2_pred, reg3_pred]), axis=0,
|
||||
weights=[1, 2, 10])
|
||||
assert_almost_equal(ereg_pred, avg, decimal=2)
|
||||
|
||||
ereg_weights_none = VotingRegressor([('mean', reg1), ('median', reg2),
|
||||
('quantile', reg3)], weights=None)
|
||||
ereg_weights_equal = VotingRegressor([('mean', reg1), ('median', reg2),
|
||||
('quantile', reg3)],
|
||||
weights=[1, 1, 1])
|
||||
ereg_weights_none.fit(X_r_train, y_r_train)
|
||||
ereg_weights_equal.fit(X_r_train, y_r_train)
|
||||
ereg_none_pred = ereg_weights_none.predict(X_r_test)
|
||||
ereg_equal_pred = ereg_weights_equal.predict(X_r_test)
|
||||
assert_almost_equal(ereg_none_pred, ereg_equal_pred, decimal=2)
|
||||
|
||||
|
||||
def test_predict_on_toy_problem():
|
||||
"""Manually check predicted class labels for toy dataset."""
|
||||
clf1 = LogisticRegression(random_state=123)
|
||||
clf2 = RandomForestClassifier(random_state=123)
|
||||
clf3 = GaussianNB()
|
||||
|
||||
X = np.array([[-1.1, -1.5],
|
||||
[-1.2, -1.4],
|
||||
[-3.4, -2.2],
|
||||
[1.1, 1.2],
|
||||
[2.1, 1.4],
|
||||
[3.1, 2.3]])
|
||||
|
||||
y = np.array([1, 1, 1, 2, 2, 2])
|
||||
|
||||
assert_array_equal(clf1.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
|
||||
assert_array_equal(clf2.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
|
||||
assert_array_equal(clf3.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
|
||||
|
||||
eclf = VotingClassifier(estimators=[
|
||||
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
|
||||
voting='hard',
|
||||
weights=[1, 1, 1])
|
||||
assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
|
||||
|
||||
eclf = VotingClassifier(estimators=[
|
||||
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
|
||||
voting='soft',
|
||||
weights=[1, 1, 1])
|
||||
assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
|
||||
|
||||
|
||||
def test_predict_proba_on_toy_problem():
|
||||
"""Calculate predicted probabilities on toy dataset."""
|
||||
clf1 = LogisticRegression(random_state=123)
|
||||
clf2 = RandomForestClassifier(random_state=123)
|
||||
clf3 = GaussianNB()
|
||||
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
|
||||
y = np.array([1, 1, 2, 2])
|
||||
|
||||
clf1_res = np.array([[0.59790391, 0.40209609],
|
||||
[0.57622162, 0.42377838],
|
||||
[0.50728456, 0.49271544],
|
||||
[0.40241774, 0.59758226]])
|
||||
|
||||
clf2_res = np.array([[0.8, 0.2],
|
||||
[0.8, 0.2],
|
||||
[0.2, 0.8],
|
||||
[0.3, 0.7]])
|
||||
|
||||
clf3_res = np.array([[0.9985082, 0.0014918],
|
||||
[0.99845843, 0.00154157],
|
||||
[0., 1.],
|
||||
[0., 1.]])
|
||||
|
||||
t00 = (2*clf1_res[0][0] + clf2_res[0][0] + clf3_res[0][0]) / 4
|
||||
t11 = (2*clf1_res[1][1] + clf2_res[1][1] + clf3_res[1][1]) / 4
|
||||
t21 = (2*clf1_res[2][1] + clf2_res[2][1] + clf3_res[2][1]) / 4
|
||||
t31 = (2*clf1_res[3][1] + clf2_res[3][1] + clf3_res[3][1]) / 4
|
||||
|
||||
eclf = VotingClassifier(estimators=[
|
||||
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
|
||||
voting='soft',
|
||||
weights=[2, 1, 1])
|
||||
eclf_res = eclf.fit(X, y).predict_proba(X)
|
||||
|
||||
assert_almost_equal(t00, eclf_res[0][0], decimal=1)
|
||||
assert_almost_equal(t11, eclf_res[1][1], decimal=1)
|
||||
assert_almost_equal(t21, eclf_res[2][1], decimal=1)
|
||||
assert_almost_equal(t31, eclf_res[3][1], decimal=1)
|
||||
|
||||
with pytest.raises(
|
||||
AttributeError,
|
||||
match="predict_proba is not available when voting='hard'"):
|
||||
eclf = VotingClassifier(estimators=[
|
||||
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
|
||||
voting='hard')
|
||||
eclf.fit(X, y).predict_proba(X)
|
||||
|
||||
|
||||
def test_multilabel():
|
||||
"""Check if error is raised for multilabel classification."""
|
||||
X, y = make_multilabel_classification(n_classes=2, n_labels=1,
|
||||
allow_unlabeled=False,
|
||||
random_state=123)
|
||||
clf = OneVsRestClassifier(SVC(kernel='linear'))
|
||||
|
||||
eclf = VotingClassifier(estimators=[('ovr', clf)], voting='hard')
|
||||
|
||||
try:
|
||||
eclf.fit(X, y)
|
||||
except NotImplementedError:
|
||||
return
|
||||
|
||||
|
||||
def test_gridsearch():
|
||||
"""Check GridSearch support."""
|
||||
clf1 = LogisticRegression(random_state=1)
|
||||
clf2 = RandomForestClassifier(random_state=1)
|
||||
clf3 = GaussianNB()
|
||||
eclf = VotingClassifier(estimators=[
|
||||
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
|
||||
voting='soft')
|
||||
|
||||
params = {'lr__C': [1.0, 100.0],
|
||||
'voting': ['soft', 'hard'],
|
||||
'weights': [[0.5, 0.5, 0.5], [1.0, 0.5, 0.5]]}
|
||||
|
||||
grid = GridSearchCV(estimator=eclf, param_grid=params)
|
||||
grid.fit(iris.data, iris.target)
|
||||
|
||||
|
||||
def test_parallel_fit():
|
||||
"""Check parallel backend of VotingClassifier on toy dataset."""
|
||||
clf1 = LogisticRegression(random_state=123)
|
||||
clf2 = RandomForestClassifier(random_state=123)
|
||||
clf3 = GaussianNB()
|
||||
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
|
||||
y = np.array([1, 1, 2, 2])
|
||||
|
||||
eclf1 = VotingClassifier(estimators=[
|
||||
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
|
||||
voting='soft',
|
||||
n_jobs=1).fit(X, y)
|
||||
eclf2 = VotingClassifier(estimators=[
|
||||
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
|
||||
voting='soft',
|
||||
n_jobs=2).fit(X, y)
|
||||
|
||||
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
|
||||
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
|
||||
|
||||
|
||||
def test_sample_weight():
|
||||
"""Tests sample_weight parameter of VotingClassifier"""
|
||||
clf1 = LogisticRegression(random_state=123)
|
||||
clf2 = RandomForestClassifier(random_state=123)
|
||||
clf3 = SVC(probability=True, random_state=123)
|
||||
eclf1 = VotingClassifier(estimators=[
|
||||
('lr', clf1), ('rf', clf2), ('svc', clf3)],
|
||||
voting='soft').fit(X, y, sample_weight=np.ones((len(y),)))
|
||||
eclf2 = VotingClassifier(estimators=[
|
||||
('lr', clf1), ('rf', clf2), ('svc', clf3)],
|
||||
voting='soft').fit(X, y)
|
||||
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
|
||||
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
|
||||
|
||||
sample_weight = np.random.RandomState(123).uniform(size=(len(y),))
|
||||
eclf3 = VotingClassifier(estimators=[('lr', clf1)], voting='soft')
|
||||
eclf3.fit(X, y, sample_weight)
|
||||
clf1.fit(X, y, sample_weight)
|
||||
assert_array_equal(eclf3.predict(X), clf1.predict(X))
|
||||
assert_array_almost_equal(eclf3.predict_proba(X), clf1.predict_proba(X))
|
||||
|
||||
# check that an informative error is raised if sample_weight is not
# supported.
|
||||
clf4 = KNeighborsClassifier()
|
||||
eclf3 = VotingClassifier(estimators=[
|
||||
('lr', clf1), ('svc', clf3), ('knn', clf4)],
|
||||
voting='soft')
|
||||
msg = ('Underlying estimator KNeighborsClassifier does not support '
|
||||
'sample weights.')
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
eclf3.fit(X, y, sample_weight)
|
||||
|
||||
# check that _fit_single_estimator will raise the right error
|
||||
# it should raise the original error if this is not linked to sample_weight
|
||||
class ClassifierErrorFit(ClassifierMixin, BaseEstimator):
|
||||
def fit(self, X, y, sample_weight):
|
||||
raise TypeError('Error unrelated to sample_weight.')
|
||||
clf = ClassifierErrorFit()
|
||||
with pytest.raises(TypeError, match='Error unrelated to sample_weight'):
|
||||
clf.fit(X, y, sample_weight=sample_weight)
|
||||
|
||||
|
||||
def test_sample_weight_kwargs():
|
||||
"""Check that VotingClassifier passes sample_weight as kwargs"""
|
||||
class MockClassifier(ClassifierMixin, BaseEstimator):
|
||||
"""Mock Classifier to check that sample_weight is received as kwargs"""
|
||||
def fit(self, X, y, *args, **sample_weight):
|
||||
assert 'sample_weight' in sample_weight
|
||||
|
||||
clf = MockClassifier()
|
||||
eclf = VotingClassifier(estimators=[('mock', clf)], voting='soft')
|
||||
|
||||
# Should not raise an error.
|
||||
eclf.fit(X, y, sample_weight=np.ones((len(y),)))
|
||||
|
||||
|
||||
def test_voting_classifier_set_params():
|
||||
# check equivalence in the output when setting underlying estimators
|
||||
clf1 = LogisticRegression(random_state=123, C=1.0)
|
||||
clf2 = RandomForestClassifier(random_state=123, max_depth=None)
|
||||
clf3 = GaussianNB()
|
||||
|
||||
eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)], voting='soft',
|
||||
weights=[1, 2]).fit(X, y)
|
||||
eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)], voting='soft',
|
||||
weights=[1, 2])
|
||||
eclf2.set_params(nb=clf2).fit(X, y)
|
||||
|
||||
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
|
||||
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
|
||||
assert eclf2.estimators[0][1].get_params() == clf1.get_params()
|
||||
assert eclf2.estimators[1][1].get_params() == clf2.get_params()
|
||||
|
||||
|
||||
# TODO: Remove parametrization in 0.24 when None is removed in Voting*
|
||||
@pytest.mark.parametrize("drop", [None, 'drop'])
|
||||
def test_set_estimator_none(drop):
|
||||
"""VotingClassifier set_params should be able to set estimators as None or
|
||||
drop"""
|
||||
# Test predict
|
||||
clf1 = LogisticRegression(random_state=123)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
|
||||
clf3 = GaussianNB()
|
||||
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
|
||||
('nb', clf3)],
|
||||
voting='hard', weights=[1, 0, 0.5]).fit(X, y)
|
||||
|
||||
eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
|
||||
('nb', clf3)],
|
||||
voting='hard', weights=[1, 1, 0.5])
|
||||
with pytest.warns(None) as record:
|
||||
eclf2.set_params(rf=drop).fit(X, y)
|
||||
assert record if drop is None else not record
|
||||
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
|
||||
|
||||
assert dict(eclf2.estimators)["rf"] is drop
|
||||
assert len(eclf2.estimators_) == 2
|
||||
assert all(isinstance(est, (LogisticRegression, GaussianNB))
|
||||
for est in eclf2.estimators_)
|
||||
assert eclf2.get_params()["rf"] is drop
|
||||
|
||||
eclf1.set_params(voting='soft').fit(X, y)
|
||||
with pytest.warns(None) as record:
|
||||
eclf2.set_params(voting='soft').fit(X, y)
|
||||
assert record if drop is None else not record
|
||||
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
|
||||
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
|
||||
msg = 'All estimators are dropped. At least one is required'
|
||||
with pytest.warns(None) as record:
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
eclf2.set_params(lr=drop, rf=drop, nb=drop).fit(X, y)
|
||||
assert record if drop is None else not record
|
||||
|
||||
# Test soft voting transform
|
||||
X1 = np.array([[1], [2]])
|
||||
y1 = np.array([1, 2])
|
||||
eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
|
||||
voting='soft', weights=[0, 0.5],
|
||||
flatten_transform=False).fit(X1, y1)
|
||||
|
||||
eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
|
||||
voting='soft', weights=[1, 0.5],
|
||||
flatten_transform=False)
|
||||
with pytest.warns(None) as record:
|
||||
eclf2.set_params(rf=drop).fit(X1, y1)
|
||||
assert record if drop is None else not record
|
||||
assert_array_almost_equal(eclf1.transform(X1),
|
||||
np.array([[[0.7, 0.3], [0.3, 0.7]],
|
||||
[[1., 0.], [0., 1.]]]))
|
||||
assert_array_almost_equal(eclf2.transform(X1),
|
||||
np.array([[[1., 0.],
|
||||
[0., 1.]]]))
|
||||
eclf1.set_params(voting='hard')
|
||||
eclf2.set_params(voting='hard')
|
||||
assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
|
||||
assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
|
||||
|
||||
|
||||
def test_estimator_weights_format():
|
||||
# Test estimator weights inputs as list and array
|
||||
clf1 = LogisticRegression(random_state=123)
|
||||
clf2 = RandomForestClassifier(random_state=123)
|
||||
eclf1 = VotingClassifier(estimators=[
|
||||
('lr', clf1), ('rf', clf2)],
|
||||
weights=[1, 2],
|
||||
voting='soft')
|
||||
eclf2 = VotingClassifier(estimators=[
|
||||
('lr', clf1), ('rf', clf2)],
|
||||
weights=np.array((1, 2)),
|
||||
voting='soft')
|
||||
eclf1.fit(X, y)
|
||||
eclf2.fit(X, y)
|
||||
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
|
||||
|
||||
|
||||
def test_transform():
|
||||
"""Check transform method of VotingClassifier on toy dataset."""
|
||||
clf1 = LogisticRegression(random_state=123)
|
||||
clf2 = RandomForestClassifier(random_state=123)
|
||||
clf3 = GaussianNB()
|
||||
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
|
||||
y = np.array([1, 1, 2, 2])
|
||||
|
||||
eclf1 = VotingClassifier(estimators=[
|
||||
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
|
||||
voting='soft').fit(X, y)
|
||||
eclf2 = VotingClassifier(estimators=[
|
||||
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
|
||||
voting='soft',
|
||||
flatten_transform=True).fit(X, y)
|
||||
eclf3 = VotingClassifier(estimators=[
|
||||
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
|
||||
voting='soft',
|
||||
flatten_transform=False).fit(X, y)
|
||||
|
||||
assert_array_equal(eclf1.transform(X).shape, (4, 6))
|
||||
assert_array_equal(eclf2.transform(X).shape, (4, 6))
|
||||
assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
|
||||
assert_array_almost_equal(eclf1.transform(X),
|
||||
eclf2.transform(X))
|
||||
assert_array_almost_equal(
|
||||
eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)),
|
||||
eclf2.transform(X)
|
||||
)
|
||||
|
||||
|
||||
# TODO: Remove drop=None in 0.24 when None is removed in Voting*
@pytest.mark.parametrize(
    "X, y, voter",
    [(X, y, VotingClassifier(
        [('lr', LogisticRegression()),
         ('rf', RandomForestClassifier(n_estimators=5))])),
     (X_r, y_r, VotingRegressor(
         [('lr', LinearRegression()),
          ('rf', RandomForestRegressor(n_estimators=5))]))]
)
@pytest.mark.parametrize("drop", [None, 'drop'])
def test_none_estimator_with_weights(X, y, voter, drop):
    # TODO: remove the parametrization on 'drop' when support for None is
    # removed.
    # check that an estimator can be set to 'drop' while passing some weight
    # regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/13777
    voter = clone(voter)
    voter.fit(X, y, sample_weight=np.ones(y.shape))
    voter.set_params(lr=drop)
    with pytest.warns(None) as record:
        voter.fit(X, y, sample_weight=np.ones(y.shape))
    assert record if drop is None else not record
    y_pred = voter.predict(X)
    assert y_pred.shape == y.shape


@pytest.mark.parametrize(
    "estimator",
    [VotingRegressor(
        estimators=[('lr', LinearRegression()),
                    ('tree', DecisionTreeRegressor(random_state=0))]),
     VotingClassifier(
        estimators=[('lr', LogisticRegression(random_state=0)),
                    ('tree', DecisionTreeClassifier(random_state=0))])],
    ids=['VotingRegressor', 'VotingClassifier']
)
def test_check_estimators_voting_estimator(estimator):
    # FIXME: to be removed when meta-estimators can specify their own
    # testing parameters (for required parameters).
    check_estimator(estimator)
    check_no_attributes_set_in_init(estimator.__class__.__name__, estimator)


@pytest.mark.parametrize(
    "est",
    [VotingRegressor(
        estimators=[('lr', LinearRegression()),
                    ('tree', DecisionTreeRegressor(random_state=0))]),
     VotingClassifier(
        estimators=[('lr', LogisticRegression(random_state=0)),
                    ('tree', DecisionTreeClassifier(random_state=0))])],
    ids=['VotingRegressor', 'VotingClassifier']
)
def test_n_features_in(est):

    X = [[1, 2], [3, 4], [5, 6]]
    y = [0, 1, 2]

    assert not hasattr(est, 'n_features_in_')
    est.fit(X, y)
    assert est.n_features_in_ == 2


@pytest.mark.parametrize(
    "estimator",
    [VotingRegressor(
        estimators=[('lr', LinearRegression()),
                    ('rf', RandomForestRegressor(random_state=123))],
        verbose=True),
     VotingClassifier(
        estimators=[('lr', LogisticRegression(random_state=123)),
                    ('rf', RandomForestClassifier(random_state=123))],
        verbose=True)]
)
def test_voting_verbose(estimator, capsys):

    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    pattern = (r'\[Voting\].*\(1 of 2\) Processing lr, total=.*\n'
               r'\[Voting\].*\(2 of 2\) Processing rf, total=.*\n$')

    estimator.fit(X, y)
    assert re.match(pattern, capsys.readouterr()[0])


# TODO: Remove in 0.24 when None is removed in Voting*
@pytest.mark.parametrize(
    "Voter, BaseEstimator",
    [(VotingClassifier, DecisionTreeClassifier),
     (VotingRegressor, DecisionTreeRegressor)]
)
def test_deprecate_none_transformer(Voter, BaseEstimator):
    est = Voter(estimators=[('lr', None),
                            ('tree', BaseEstimator(random_state=0))])

    msg = ("Using 'None' to drop an estimator from the ensemble is "
           "deprecated in 0.22 and support will be dropped in 0.24. "
           "Use the string 'drop' instead.")
    with pytest.warns(FutureWarning, match=msg):
        est.fit(X, y)
582
venv/Lib/site-packages/sklearn/ensemble/tests/test_weight_boosting.py
Normal file
@@ -0,0 +1,582 @@
"""Testing for the boost module (sklearn.ensemble.boost)."""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from scipy.sparse import csc_matrix
|
||||
from scipy.sparse import csr_matrix
|
||||
from scipy.sparse import coo_matrix
|
||||
from scipy.sparse import dok_matrix
|
||||
from scipy.sparse import lil_matrix
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal, assert_array_less
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import assert_raises, assert_raises_regexp
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.base import clone
|
||||
from sklearn.dummy import DummyClassifier, DummyRegressor
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.ensemble import AdaBoostClassifier
|
||||
from sklearn.ensemble import AdaBoostRegressor
|
||||
from sklearn.ensemble._weight_boosting import _samme_proba
|
||||
from sklearn.svm import SVC, SVR
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
from sklearn.utils import shuffle
|
||||
from sklearn.utils._mocking import NoSampleWeightWrapper
|
||||
from sklearn import datasets
|
||||
|
||||
|
||||
# Common random state
|
||||
rng = np.random.RandomState(0)
|
||||
|
||||
# Toy sample
|
||||
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
|
||||
y_class = ["foo", "foo", "foo", 1, 1, 1] # test string class labels
|
||||
y_regr = [-1, -1, -1, 1, 1, 1]
|
||||
T = [[-1, -1], [2, 2], [3, 2]]
|
||||
y_t_class = ["foo", 1, 1]
|
||||
y_t_regr = [-1, 1, 1]
|
||||
|
||||
# Load the iris dataset and randomly permute it
|
||||
iris = datasets.load_iris()
|
||||
perm = rng.permutation(iris.target.size)
|
||||
iris.data, iris.target = shuffle(iris.data, iris.target, random_state=rng)
|
||||
|
||||
# Load the boston dataset and randomly permute it
|
||||
boston = datasets.load_boston()
|
||||
boston.data, boston.target = shuffle(boston.data, boston.target,
|
||||
random_state=rng)
|
||||
|
||||
|
||||
def test_samme_proba():
    # Test the `_samme_proba` helper function.

    # Define some example (bad) `predict_proba` output.
    probs = np.array([[1, 1e-6, 0],
                      [0.19, 0.6, 0.2],
                      [-999, 0.51, 0.5],
                      [1e-6, 1, 1e-9]])
    probs /= np.abs(probs.sum(axis=1))[:, np.newaxis]

    # _samme_proba calls estimator.predict_proba.
    # Make a mock object so I can control what gets returned.
    class MockEstimator:
        def predict_proba(self, X):
            assert_array_equal(X.shape, probs.shape)
            return probs
    mock = MockEstimator()

    samme_proba = _samme_proba(mock, 3, np.ones_like(probs))

    assert_array_equal(samme_proba.shape, probs.shape)
    assert np.isfinite(samme_proba).all()

    # Make sure that the correct elements come out as smallest --
    # `_samme_proba` should preserve the ordering in each example.
    assert_array_equal(np.argmin(samme_proba, axis=1), [2, 0, 0, 2])
    assert_array_equal(np.argmax(samme_proba, axis=1), [0, 1, 1, 1])


def test_oneclass_adaboost_proba():
    # Test predict_proba robustness for one class label input.
    # In response to issue #7501
    # https://github.com/scikit-learn/scikit-learn/issues/7501
    y_t = np.ones(len(X))
    clf = AdaBoostClassifier().fit(X, y_t)
    assert_array_almost_equal(clf.predict_proba(X), np.ones((len(X), 1)))


@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_classification_toy(algorithm):
    # Check classification on a toy dataset.
    clf = AdaBoostClassifier(algorithm=algorithm, random_state=0)
    clf.fit(X, y_class)
    assert_array_equal(clf.predict(T), y_t_class)
    assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_)
    assert clf.predict_proba(T).shape == (len(T), 2)
    assert clf.decision_function(T).shape == (len(T),)


def test_regression_toy():
    # Check regression on a toy dataset.
    clf = AdaBoostRegressor(random_state=0)
    clf.fit(X, y_regr)
    assert_array_equal(clf.predict(T), y_t_regr)


def test_iris():
    # Check consistency on dataset iris.
    classes = np.unique(iris.target)
    clf_samme = prob_samme = None

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)

        assert_array_equal(classes, clf.classes_)
        proba = clf.predict_proba(iris.data)
        if alg == "SAMME":
            clf_samme = clf
            prob_samme = proba
        assert proba.shape[1] == len(classes)
        assert clf.decision_function(iris.data).shape[1] == len(classes)

        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % \
            (alg, score)

        # Check we used multiple estimators
        assert len(clf.estimators_) > 1
        # Check for distinct random states (see issue #7408)
        assert (len(set(est.random_state for est in clf.estimators_)) ==
                len(clf.estimators_))

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    clf_samme.algorithm = "SAMME.R"
    assert_array_less(0,
                      np.abs(clf_samme.predict_proba(iris.data) - prob_samme))


@pytest.mark.parametrize('loss', ['linear', 'square', 'exponential'])
def test_boston(loss):
    # Check consistency on dataset boston house prices.
    reg = AdaBoostRegressor(loss=loss, random_state=0)
    reg.fit(boston.data, boston.target)
    score = reg.score(boston.data, boston.target)
    assert score > 0.85

    # Check we used multiple estimators
    assert len(reg.estimators_) > 1
    # Check for distinct random states (see issue #7408)
    assert (len(set(est.random_state for est in reg.estimators_)) ==
            len(reg.estimators_))


@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_staged_predict(algorithm):
    # Check staged predictions.
    rng = np.random.RandomState(0)
    iris_weights = rng.randint(10, size=iris.target.shape)
    boston_weights = rng.randint(10, size=boston.target.shape)

    clf = AdaBoostClassifier(algorithm=algorithm, n_estimators=10)
    clf.fit(iris.data, iris.target, sample_weight=iris_weights)

    predictions = clf.predict(iris.data)
    staged_predictions = [p for p in clf.staged_predict(iris.data)]
    proba = clf.predict_proba(iris.data)
    staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
    score = clf.score(iris.data, iris.target, sample_weight=iris_weights)
    staged_scores = [
        s for s in clf.staged_score(
            iris.data, iris.target, sample_weight=iris_weights)]

    assert len(staged_predictions) == 10
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert len(staged_probas) == 10
    assert_array_almost_equal(proba, staged_probas[-1])
    assert len(staged_scores) == 10
    assert_array_almost_equal(score, staged_scores[-1])

    # AdaBoost regression
    clf = AdaBoostRegressor(n_estimators=10, random_state=0)
    clf.fit(boston.data, boston.target, sample_weight=boston_weights)

    predictions = clf.predict(boston.data)
    staged_predictions = [p for p in clf.staged_predict(boston.data)]
    score = clf.score(boston.data, boston.target, sample_weight=boston_weights)
    staged_scores = [
        s for s in clf.staged_score(
            boston.data, boston.target, sample_weight=boston_weights)]

    assert len(staged_predictions) == 10
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert len(staged_scores) == 10
    assert_array_almost_equal(score, staged_scores[-1])


def test_gridsearch():
    # Check that base trees can be grid-searched.
    # AdaBoost classification
    boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2),
                  'algorithm': ('SAMME', 'SAMME.R')}
    clf = GridSearchCV(boost, parameters)
    clf.fit(iris.data, iris.target)

    # AdaBoost regression
    boost = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
                              random_state=0)
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2)}
    clf = GridSearchCV(boost, parameters)
    clf.fit(boston.data, boston.target)


def test_pickle():
    # Check picklability.
    import pickle

    # Adaboost classifier
    for alg in ['SAMME', 'SAMME.R']:
        obj = AdaBoostClassifier(algorithm=alg)
        obj.fit(iris.data, iris.target)
        score = obj.score(iris.data, iris.target)
        s = pickle.dumps(obj)

        obj2 = pickle.loads(s)
        assert type(obj2) == obj.__class__
        score2 = obj2.score(iris.data, iris.target)
        assert score == score2

    # Adaboost regressor
    obj = AdaBoostRegressor(random_state=0)
    obj.fit(boston.data, boston.target)
    score = obj.score(boston.data, boston.target)
    s = pickle.dumps(obj)

    obj2 = pickle.loads(s)
    assert type(obj2) == obj.__class__
    score2 = obj2.score(boston.data, boston.target)
    assert score == score2


def test_importances():
    # Check variable importances.
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=1)

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)

        clf.fit(X, y)
        importances = clf.feature_importances_

        assert importances.shape[0] == 10
        assert (importances[:3, np.newaxis] >= importances[3:]).all()


def test_error():
    # Test that it gives proper exception on deficient input.
    assert_raises(ValueError,
                  AdaBoostClassifier(learning_rate=-1).fit,
                  X, y_class)

    assert_raises(ValueError,
                  AdaBoostClassifier(algorithm="foo").fit,
                  X, y_class)

    assert_raises(ValueError,
                  AdaBoostClassifier().fit,
                  X, y_class, sample_weight=np.asarray([-1]))


def test_base_estimator():
    # Test different base estimators.
    from sklearn.ensemble import RandomForestClassifier

    # XXX doesn't work with y_class because RF doesn't support classes_
    # Shouldn't AdaBoost run a LabelBinarizer?
    clf = AdaBoostClassifier(RandomForestClassifier())
    clf.fit(X, y_regr)

    clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
    clf.fit(X, y_class)

    from sklearn.ensemble import RandomForestRegressor

    clf = AdaBoostRegressor(RandomForestRegressor(), random_state=0)
    clf.fit(X, y_regr)

    clf = AdaBoostRegressor(SVR(), random_state=0)
    clf.fit(X, y_regr)

    # Check that an empty discrete ensemble fails in fit, not predict.
    X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]]
    y_fail = ["foo", "bar", 1, 2]
    clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
    assert_raises_regexp(ValueError, "worse than random",
                         clf.fit, X_fail, y_fail)


def test_sparse_classification():
    # Check classification with sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification of fit that records the data type for later
            verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(n_classes=1, n_samples=15,
                                                   n_features=5,
                                                   random_state=42)
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # decision_function
        sparse_results = sparse_classifier.decision_function(X_test_sparse)
        dense_results = dense_classifier.decision_function(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # predict_log_proba
        sparse_results = sparse_classifier.predict_log_proba(X_test_sparse)
        dense_results = dense_classifier.predict_log_proba(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # predict_proba
        sparse_results = sparse_classifier.predict_proba(X_test_sparse)
        dense_results = dense_classifier.predict_proba(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # score
        sparse_results = sparse_classifier.score(X_test_sparse, y_test)
        dense_results = dense_classifier.score(X_test, y_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # staged_decision_function
        sparse_results = sparse_classifier.staged_decision_function(
            X_test_sparse)
        dense_results = dense_classifier.staged_decision_function(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_almost_equal(sparse_res, dense_res)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_predict_proba
        sparse_results = sparse_classifier.staged_predict_proba(X_test_sparse)
        dense_results = dense_classifier.staged_predict_proba(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_almost_equal(sparse_res, dense_res)

        # staged_score
        sparse_results = sparse_classifier.staged_score(X_test_sparse,
                                                        y_test)
        dense_results = dense_classifier.staged_score(X_test, y_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # Verify sparsity of data is maintained during training
        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix)
                    for t in types])


def test_sparse_regression():
    # Check regression with sparse input.

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification of fit that records the data type for later
            verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(n_samples=15, n_features=50, n_targets=1,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_almost_equal(sparse_res, dense_res)

        # Verify sparsity of data is maintained during training
        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix)
                    for t in types])


def test_sample_weight_adaboost_regressor():
    """
    AdaBoostRegressor should work without sample_weights in the base
    estimator. The random weighted sampling is done internally in the
    _boost method in AdaBoostRegressor.
    """
    class DummyEstimator(BaseEstimator):

        def fit(self, X, y):
            pass

        def predict(self, X):
            return np.zeros(X.shape[0])

    boost = AdaBoostRegressor(DummyEstimator(), n_estimators=3)
    boost.fit(X, y_regr)
    assert len(boost.estimator_weights_) == len(boost.estimator_errors_)


def test_multidimensional_X():
    """
    Check that the AdaBoost estimators can work with n-dimensional
    data matrices.
    """
    rng = np.random.RandomState(0)

    X = rng.randn(50, 3, 3)
    yc = rng.choice([0, 1], 50)
    yr = rng.randn(50)

    boost = AdaBoostClassifier(DummyClassifier(strategy='most_frequent'))
    boost.fit(X, yc)
    boost.predict(X)
    boost.predict_proba(X)

    boost = AdaBoostRegressor(DummyRegressor())
    boost.fit(X, yr)
    boost.predict(X)


# TODO: Remove in 0.24 when DummyClassifier's `strategy` default changes
@ignore_warnings
@pytest.mark.parametrize("algorithm", ['SAMME', 'SAMME.R'])
def test_adaboostclassifier_without_sample_weight(algorithm):
    X, y = iris.data, iris.target
    base_estimator = NoSampleWeightWrapper(DummyClassifier())
    clf = AdaBoostClassifier(
        base_estimator=base_estimator, algorithm=algorithm
    )
    err_msg = ("{} doesn't support sample_weight"
               .format(base_estimator.__class__.__name__))
    with pytest.raises(ValueError, match=err_msg):
        clf.fit(X, y)


def test_adaboostregressor_sample_weight():
    # check that giving a sample weight has an influence on the error
    # computed for a weak learner
    rng = np.random.RandomState(42)
    X = np.linspace(0, 100, num=1000)
    y = (.8 * X + 0.2) + (rng.rand(X.shape[0]) * 0.0001)
    X = X.reshape(-1, 1)

    # add an arbitrary outlier
    X[-1] *= 10
    y[-1] = 10000

    # random_state=0 ensures that the underlying bootstrap will use the
    # outlier
    regr_no_outlier = AdaBoostRegressor(
        base_estimator=LinearRegression(), n_estimators=1, random_state=0
    )
    regr_with_weight = clone(regr_no_outlier)
    regr_with_outlier = clone(regr_no_outlier)

    # fit 3 models:
    # - a model containing the outlier
    # - a model without the outlier
    # - a model containing the outlier but with a null sample-weight
    regr_with_outlier.fit(X, y)
    regr_no_outlier.fit(X[:-1], y[:-1])
    sample_weight = np.ones_like(y)
    sample_weight[-1] = 0
    regr_with_weight.fit(X, y, sample_weight=sample_weight)

    score_with_outlier = regr_with_outlier.score(X[:-1], y[:-1])
    score_no_outlier = regr_no_outlier.score(X[:-1], y[:-1])
    score_with_weight = regr_with_weight.score(X[:-1], y[:-1])

    assert score_with_outlier < score_no_outlier
    assert score_with_outlier < score_with_weight
    assert score_no_outlier == pytest.approx(score_with_weight)


@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_adaboost_consistent_predict(algorithm):
    # check that predict_proba and predict give consistent results
    # regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/14084
    X_train, X_test, y_train, y_test = train_test_split(
        *datasets.load_digits(return_X_y=True), random_state=42
    )
    model = AdaBoostClassifier(algorithm=algorithm, random_state=42)
    model.fit(X_train, y_train)

    assert_array_equal(
        np.argmax(model.predict_proba(X_test), axis=1),
        model.predict(X_test)
    )


@pytest.mark.parametrize(
    'model, X, y',
    [(AdaBoostClassifier(), iris.data, iris.target),
     (AdaBoostRegressor(), boston.data, boston.target)]
)
def test_adaboost_negative_weight_error(model, X, y):
    sample_weight = np.ones_like(y)
    sample_weight[-1] = -10

    err_msg = "sample_weight cannot contain negative weight"
    with pytest.raises(ValueError, match=err_msg):
        model.fit(X, y, sample_weight=sample_weight)