Uploaded Test files

Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions


@@ -0,0 +1,40 @@
"""
The :mod:`sklearn.ensemble` module includes ensemble-based methods for
classification, regression and anomaly detection.
"""
import typing
from ._base import BaseEnsemble
from ._forest import RandomForestClassifier
from ._forest import RandomForestRegressor
from ._forest import RandomTreesEmbedding
from ._forest import ExtraTreesClassifier
from ._forest import ExtraTreesRegressor
from ._bagging import BaggingClassifier
from ._bagging import BaggingRegressor
from ._iforest import IsolationForest
from ._weight_boosting import AdaBoostClassifier
from ._weight_boosting import AdaBoostRegressor
from ._gb import GradientBoostingClassifier
from ._gb import GradientBoostingRegressor
from ._voting import VotingClassifier
from ._voting import VotingRegressor
from ._stacking import StackingClassifier
from ._stacking import StackingRegressor
if typing.TYPE_CHECKING:
# Avoid errors in type checkers (e.g. mypy) for experimental estimators.
# TODO: remove this check once the estimator is no longer experimental.
from ._hist_gradient_boosting.gradient_boosting import ( # noqa
HistGradientBoostingRegressor, HistGradientBoostingClassifier
)
__all__ = ["BaseEnsemble",
"RandomForestClassifier", "RandomForestRegressor",
"RandomTreesEmbedding", "ExtraTreesClassifier",
"ExtraTreesRegressor", "BaggingClassifier",
"BaggingRegressor", "IsolationForest", "GradientBoostingClassifier",
"GradientBoostingRegressor", "AdaBoostClassifier",
"AdaBoostRegressor", "VotingClassifier", "VotingRegressor",
"StackingClassifier", "StackingRegressor",
]
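
Because the histogram-based estimators are still experimental here, they are only imported under ``typing.TYPE_CHECKING``; at run time user code has to enable them explicitly first. A hedged usage sketch (the enabling import becomes a warning-only no-op once the estimators are stable):

# Hedged sketch: enabling and using the experimental estimators at run
# time; the other ensembles listed in __all__ import directly as usual.
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import (
    HistGradientBoostingClassifier,
    RandomForestClassifier,
)

rng = np.random.RandomState(0)
X = rng.rand(100, 4)
y = (X[:, 0] + X[:, 1] > 1).astype(int)

hgb = HistGradientBoostingClassifier(max_iter=20, random_state=0).fit(X, y)
rf = RandomForestClassifier(n_estimators=20, random_state=0).fit(X, y)
print(hgb.score(X, y), rf.score(X, y))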

File diff suppressed because it is too large.


@@ -0,0 +1,287 @@
"""Base class for ensemble-based estimators."""
# Authors: Gilles Louppe
# License: BSD 3 clause
from abc import ABCMeta, abstractmethod
import numbers
import warnings
from typing import List
import numpy as np
from joblib import effective_n_jobs
from ..base import clone
from ..base import is_classifier, is_regressor
from ..base import BaseEstimator
from ..base import MetaEstimatorMixin
from ..utils import Bunch, _print_elapsed_time
from ..utils import check_random_state
from ..utils.metaestimators import _BaseComposition
def _fit_single_estimator(estimator, X, y, sample_weight=None,
message_clsname=None, message=None):
"""Private function used to fit an estimator within a job."""
if sample_weight is not None:
try:
with _print_elapsed_time(message_clsname, message):
estimator.fit(X, y, sample_weight=sample_weight)
except TypeError as exc:
if "unexpected keyword argument 'sample_weight'" in str(exc):
raise TypeError(
"Underlying estimator {} does not support sample weights."
.format(estimator.__class__.__name__)
) from exc
raise
else:
with _print_elapsed_time(message_clsname, message):
estimator.fit(X, y)
return estimator
def _set_random_states(estimator, random_state=None):
"""Set fixed random_state parameters for an estimator.
Finds all parameters ending ``random_state`` and sets them to integers
derived from ``random_state``.
Parameters
----------
estimator : estimator supporting get/set_params
Estimator with potential randomness managed by random_state
parameters.
random_state : int or RandomState, default=None
Pseudo-random number generator to control the generation of the random
integers. Pass an int for reproducible output across multiple function
calls.
See :term:`Glossary <random_state>`.
Notes
-----
This does not necessarily set *all* ``random_state`` attributes that
control an estimator's randomness, only those accessible through
``estimator.get_params()``. ``random_state``s not controlled include
those belonging to:
* cross-validation splitters
* ``scipy.stats`` rvs
"""
random_state = check_random_state(random_state)
to_set = {}
for key in sorted(estimator.get_params(deep=True)):
if key == 'random_state' or key.endswith('__random_state'):
to_set[key] = random_state.randint(np.iinfo(np.int32).max)
if to_set:
estimator.set_params(**to_set)
class BaseEnsemble(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta):
"""Base class for all ensemble classes.
Warning: This class should not be used directly. Use derived classes
instead.
Parameters
----------
base_estimator : object
The base estimator from which the ensemble is built.
n_estimators : int, default=10
The number of estimators in the ensemble.
estimator_params : list of str, default=tuple()
The list of attributes to use as parameters when instantiating a
new base estimator. If none are given, default parameters are used.
Attributes
----------
base_estimator_ : estimator
The base estimator from which the ensemble is grown.
estimators_ : list of estimators
The collection of fitted base estimators.
"""
# overwrite _required_parameters from MetaEstimatorMixin
_required_parameters: List[str] = []
@abstractmethod
def __init__(self, base_estimator, *, n_estimators=10,
estimator_params=tuple()):
# Set parameters
self.base_estimator = base_estimator
self.n_estimators = n_estimators
self.estimator_params = estimator_params
# Don't instantiate estimators now! Parameters of base_estimator might
# still change. Eg., when grid-searching with the nested object syntax.
# self.estimators_ needs to be filled by the derived classes in fit.
def _validate_estimator(self, default=None):
"""Check the estimator and the n_estimator attribute.
Sets the base_estimator_` attributes.
"""
if not isinstance(self.n_estimators, numbers.Integral):
raise ValueError("n_estimators must be an integer, "
"got {0}.".format(type(self.n_estimators)))
if self.n_estimators <= 0:
raise ValueError("n_estimators must be greater than zero, "
"got {0}.".format(self.n_estimators))
if self.base_estimator is not None:
self.base_estimator_ = self.base_estimator
else:
self.base_estimator_ = default
if self.base_estimator_ is None:
raise ValueError("base_estimator cannot be None")
def _make_estimator(self, append=True, random_state=None):
"""Make and configure a copy of the `base_estimator_` attribute.
Warning: This method should be used to properly instantiate new
sub-estimators.
"""
estimator = clone(self.base_estimator_)
estimator.set_params(**{p: getattr(self, p)
for p in self.estimator_params})
if random_state is not None:
_set_random_states(estimator, random_state)
if append:
self.estimators_.append(estimator)
return estimator
def __len__(self):
"""Return the number of estimators in the ensemble."""
return len(self.estimators_)
def __getitem__(self, index):
"""Return the index'th estimator in the ensemble."""
return self.estimators_[index]
def __iter__(self):
"""Return iterator over estimators in the ensemble."""
return iter(self.estimators_)
def _partition_estimators(n_estimators, n_jobs):
"""Private function used to partition estimators between jobs."""
# Compute the number of jobs
n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
# Partition estimators between jobs
n_estimators_per_job = np.full(n_jobs, n_estimators // n_jobs,
dtype=int)
n_estimators_per_job[:n_estimators % n_jobs] += 1
starts = np.cumsum(n_estimators_per_job)
return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist()
class _BaseHeterogeneousEnsemble(MetaEstimatorMixin, _BaseComposition,
metaclass=ABCMeta):
"""Base class for heterogeneous ensemble of learners.
Parameters
----------
estimators : list of (str, estimator) tuples
The ensemble of estimators to use in the ensemble. Each element of the
list is defined as a tuple of string (i.e. name of the estimator) and
an estimator instance. An estimator can be set to `'drop'` using
`set_params`.
Attributes
----------
estimators_ : list of estimators
The elements of the estimators parameter, having been fitted on the
training data. If an estimator has been set to `'drop'`, it will not
appear in `estimators_`.
"""
_required_parameters = ['estimators']
@property
def named_estimators(self):
return Bunch(**dict(self.estimators))
@abstractmethod
def __init__(self, estimators):
self.estimators = estimators
def _validate_estimators(self):
if self.estimators is None or len(self.estimators) == 0:
raise ValueError(
"Invalid 'estimators' attribute, 'estimators' should be a list"
" of (string, estimator) tuples."
)
names, estimators = zip(*self.estimators)
# defined by MetaEstimatorMixin
self._validate_names(names)
# FIXME: deprecate the usage of None to drop an estimator from the
# ensemble. Remove in 0.24
if any(est is None for est in estimators):
warnings.warn(
"Using 'None' to drop an estimator from the ensemble is "
"deprecated in 0.22 and support will be dropped in 0.24. "
"Use the string 'drop' instead.", FutureWarning
)
has_estimator = any(est not in (None, 'drop') for est in estimators)
if not has_estimator:
raise ValueError(
"All estimators are dropped. At least one is required "
"to be an estimator."
)
is_estimator_type = (is_classifier if is_classifier(self)
else is_regressor)
for est in estimators:
if est not in (None, 'drop') and not is_estimator_type(est):
raise ValueError(
"The estimator {} should be a {}.".format(
est.__class__.__name__, is_estimator_type.__name__[3:]
)
)
return names, estimators
def set_params(self, **params):
"""
Set the parameters of an estimator from the ensemble.
Valid parameter keys can be listed with `get_params()`.
Parameters
----------
**params : keyword arguments
Specific parameters using e.g.
`set_params(parameter_name=new_value)`. In addition to setting the
parameters of the ensemble estimator, the individual estimators of
the ensemble can also be set, or can be removed by
setting them to 'drop'.
"""
super()._set_params('estimators', **params)
return self
def get_params(self, deep=True):
"""
Get the parameters of an estimator from the ensemble.
Parameters
----------
deep : bool, default=True
Setting it to True gets the various classifiers and the parameters
of the classifiers as well.
"""
return super()._get_params('estimators', deep=deep)
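
A hedged sketch of the two private helpers defined above, assuming the import path sklearn.ensemble._base where this file lives (private API, signatures as in this file):

# Hedged sketch of _partition_estimators and _set_random_states.
import numpy as np
from sklearn.ensemble._base import _partition_estimators, _set_random_states
from sklearn.tree import DecisionTreeClassifier

# 10 estimators over 3 jobs: the jobs receive 4, 3 and 3 estimators and
# `starts` holds the cumulative boundaries [0, 4, 7, 10].
n_jobs, n_estimators_per_job, starts = _partition_estimators(10, 3)
print(n_jobs, n_estimators_per_job, starts)

# Every parameter named random_state (nested ones included) is set to a
# fixed integer drawn from the provided RNG.
tree = DecisionTreeClassifier()
_set_random_states(tree, random_state=np.random.RandomState(0))
print(tree.random_state)  # an int, reproducible for the same seed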

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -0,0 +1,881 @@
"""Losses and corresponding default initial estimators for gradient boosting
decision trees.
"""
from abc import ABCMeta
from abc import abstractmethod
import numpy as np
from scipy.special import expit, logsumexp
from ..tree._tree import TREE_LEAF
from ..utils.stats import _weighted_percentile
from ..dummy import DummyClassifier
from ..dummy import DummyRegressor
class LossFunction(metaclass=ABCMeta):
"""Abstract base class for various loss functions.
Parameters
----------
n_classes : int
Number of classes.
Attributes
----------
K : int
The number of regression trees to be induced;
1 for regression and binary classification;
``n_classes`` for multi-class classification.
"""
is_multi_class = False
def __init__(self, n_classes):
self.K = n_classes
def init_estimator(self):
"""Default ``init`` estimator for loss function. """
raise NotImplementedError()
@abstractmethod
def __call__(self, y, raw_predictions, sample_weight=None):
"""Compute the loss.
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves).
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
@abstractmethod
def negative_gradient(self, y, raw_predictions, **kargs):
"""Compute the negative gradient.
Parameters
----------
y : ndarray of shape (n_samples,)
The target labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
"""
def update_terminal_regions(self, tree, X, y, residual, raw_predictions,
sample_weight, sample_mask,
learning_rate=0.1, k=0):
"""Update the terminal regions (=leaves) of the given tree and
updates the current predictions of the model. Traverses tree
and invokes template method `_update_terminal_region`.
Parameters
----------
tree : tree.Tree
The tree object.
X : ndarray of shape (n_samples, n_features)
The data array.
y : ndarray of shape (n_samples,)
The target labels.
residual : ndarray of shape (n_samples,)
The residuals (usually the negative gradient).
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
sample_weight : ndarray of shape (n_samples,)
The weight of each sample.
sample_mask : ndarray of shape (n_samples,)
The sample mask to be used.
learning_rate : float, default=0.1
Learning rate shrinks the contribution of each tree by
``learning_rate``.
k : int, default=0
The index of the estimator being updated.
"""
# compute leaf for each sample in ``X``.
terminal_regions = tree.apply(X)
# mask all which are not in sample mask.
masked_terminal_regions = terminal_regions.copy()
masked_terminal_regions[~sample_mask] = -1
# update each leaf (= perform line search)
for leaf in np.where(tree.children_left == TREE_LEAF)[0]:
self._update_terminal_region(tree, masked_terminal_regions,
leaf, X, y, residual,
raw_predictions[:, k], sample_weight)
# update predictions (both in-bag and out-of-bag)
raw_predictions[:, k] += \
learning_rate * tree.value[:, 0, 0].take(terminal_regions, axis=0)
@abstractmethod
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
residual, raw_predictions, sample_weight):
"""Template method for updating terminal regions (i.e., leaves)."""
@abstractmethod
def get_init_raw_predictions(self, X, estimator):
"""Return the initial raw predictions.
Parameters
----------
X : ndarray of shape (n_samples, n_features)
The data array.
estimator : object
The estimator to use to compute the predictions.
Returns
-------
raw_predictions : ndarray of shape (n_samples, K)
The initial raw predictions. K is equal to 1 for binary
classification and regression, and equal to the number of classes
for multiclass classification. ``raw_predictions`` is cast
into float64.
"""
pass
class RegressionLossFunction(LossFunction, metaclass=ABCMeta):
"""Base class for regression loss functions.
Parameters
----------
n_classes : int
Number of classes.
"""
def __init__(self, n_classes):
if n_classes != 1:
raise ValueError("``n_classes`` must be 1 for regression but "
"was %r" % n_classes)
super().__init__(n_classes)
def check_init_estimator(self, estimator):
"""Make sure estimator has the required fit and predict methods.
Parameters
----------
estimator : object
The init estimator to check.
"""
if not (hasattr(estimator, 'fit') and hasattr(estimator, 'predict')):
raise ValueError(
"The init parameter must be a valid estimator and "
"support both fit and predict."
)
def get_init_raw_predictions(self, X, estimator):
predictions = estimator.predict(X)
return predictions.reshape(-1, 1).astype(np.float64)
class LeastSquaresError(RegressionLossFunction):
"""Loss function for least squares (LS) estimation.
Terminal regions do not need to be updated for least squares.
Parameters
----------
n_classes : int
Number of classes.
"""
def init_estimator(self):
return DummyRegressor(strategy='mean')
def __call__(self, y, raw_predictions, sample_weight=None):
"""Compute the least squares loss.
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves).
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
if sample_weight is None:
return np.mean((y - raw_predictions.ravel()) ** 2)
else:
return (1 / sample_weight.sum() * np.sum(
sample_weight * ((y - raw_predictions.ravel()) ** 2)))
def negative_gradient(self, y, raw_predictions, **kargs):
"""Compute the negative gradient.
Parameters
----------
y : ndarray of shape (n_samples,)
The target labels.
raw_predictions : ndarray of shape (n_samples,)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
"""
return y - raw_predictions.ravel()
def update_terminal_regions(self, tree, X, y, residual, raw_predictions,
sample_weight, sample_mask,
learning_rate=0.1, k=0):
"""Least squares does not need to update terminal regions.
But it has to update the predictions.
Parameters
----------
tree : tree.Tree
The tree object.
X : ndarray of shape (n_samples, n_features)
The data array.
y : ndarray of shape (n_samples,)
The target labels.
residual : ndarray of shape (n_samples,)
The residuals (usually the negative gradient).
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
sample_weight : ndarray of shape (n,)
The weight of each sample.
sample_mask : ndarray of shape (n,)
The sample mask to be used.
learning_rate : float, default=0.1
Learning rate shrinks the contribution of each tree by
``learning_rate``.
k : int, default=0
The index of the estimator being updated.
"""
# update predictions
raw_predictions[:, k] += learning_rate * tree.predict(X).ravel()
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
residual, raw_predictions, sample_weight):
pass
class LeastAbsoluteError(RegressionLossFunction):
"""Loss function for least absolute deviation (LAD) regression.
Parameters
----------
n_classes : int
Number of classes
"""
def init_estimator(self):
return DummyRegressor(strategy='quantile', quantile=.5)
def __call__(self, y, raw_predictions, sample_weight=None):
"""Compute the least absolute error.
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves).
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
if sample_weight is None:
return np.abs(y - raw_predictions.ravel()).mean()
else:
return (1 / sample_weight.sum() * np.sum(
sample_weight * np.abs(y - raw_predictions.ravel())))
def negative_gradient(self, y, raw_predictions, **kargs):
"""Compute the negative gradient.
1.0 if y - raw_predictions > 0.0 else -1.0
Parameters
----------
y : ndarray of shape (n_samples,)
The target labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
"""
raw_predictions = raw_predictions.ravel()
return 2 * (y - raw_predictions > 0) - 1
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
residual, raw_predictions, sample_weight):
"""LAD updates terminal regions to median estimates."""
terminal_region = np.where(terminal_regions == leaf)[0]
sample_weight = sample_weight.take(terminal_region, axis=0)
diff = (y.take(terminal_region, axis=0) -
raw_predictions.take(terminal_region, axis=0))
tree.value[leaf, 0, 0] = _weighted_percentile(diff, sample_weight,
percentile=50)
class HuberLossFunction(RegressionLossFunction):
"""Huber loss function for robust regression.
M-Regression proposed in Friedman 2001.
Parameters
----------
n_classes : int
Number of classes.
alpha : float, default=0.9
Percentile at which to extract score.
References
----------
J. Friedman, Greedy Function Approximation: A Gradient Boosting
Machine, The Annals of Statistics, Vol. 29, No. 5, 2001.
"""
def __init__(self, n_classes, alpha=0.9):
super().__init__(n_classes)
self.alpha = alpha
self.gamma = None
def init_estimator(self):
return DummyRegressor(strategy='quantile', quantile=.5)
def __call__(self, y, raw_predictions, sample_weight=None):
"""Compute the Huber loss.
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble.
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
raw_predictions = raw_predictions.ravel()
diff = y - raw_predictions
gamma = self.gamma
if gamma is None:
if sample_weight is None:
gamma = np.percentile(np.abs(diff), self.alpha * 100)
else:
gamma = _weighted_percentile(np.abs(diff), sample_weight,
self.alpha * 100)
gamma_mask = np.abs(diff) <= gamma
if sample_weight is None:
sq_loss = np.sum(0.5 * diff[gamma_mask] ** 2)
lin_loss = np.sum(gamma * (np.abs(diff[~gamma_mask]) -
gamma / 2))
loss = (sq_loss + lin_loss) / y.shape[0]
else:
sq_loss = np.sum(0.5 * sample_weight[gamma_mask] *
diff[gamma_mask] ** 2)
lin_loss = np.sum(gamma * sample_weight[~gamma_mask] *
(np.abs(diff[~gamma_mask]) - gamma / 2))
loss = (sq_loss + lin_loss) / sample_weight.sum()
return loss
def negative_gradient(self, y, raw_predictions, sample_weight=None,
**kargs):
"""Compute the negative gradient.
Parameters
----------
y : ndarray of shape (n_samples,)
The target labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
raw_predictions = raw_predictions.ravel()
diff = y - raw_predictions
if sample_weight is None:
gamma = np.percentile(np.abs(diff), self.alpha * 100)
else:
gamma = _weighted_percentile(np.abs(diff), sample_weight,
self.alpha * 100)
gamma_mask = np.abs(diff) <= gamma
residual = np.zeros((y.shape[0],), dtype=np.float64)
residual[gamma_mask] = diff[gamma_mask]
residual[~gamma_mask] = gamma * np.sign(diff[~gamma_mask])
self.gamma = gamma
return residual
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
residual, raw_predictions, sample_weight):
terminal_region = np.where(terminal_regions == leaf)[0]
sample_weight = sample_weight.take(terminal_region, axis=0)
gamma = self.gamma
diff = (y.take(terminal_region, axis=0)
- raw_predictions.take(terminal_region, axis=0))
median = _weighted_percentile(diff, sample_weight, percentile=50)
diff_minus_median = diff - median
tree.value[leaf, 0] = median + np.mean(
np.sign(diff_minus_median) *
np.minimum(np.abs(diff_minus_median), gamma))
class QuantileLossFunction(RegressionLossFunction):
"""Loss function for quantile regression.
Quantile regression allows estimating the percentiles
of the conditional distribution of the target.
Parameters
----------
n_classes : int
Number of classes.
alpha : float, default=0.9
The percentile.
"""
def __init__(self, n_classes, alpha=0.9):
super().__init__(n_classes)
self.alpha = alpha
self.percentile = alpha * 100
def init_estimator(self):
return DummyRegressor(strategy='quantile', quantile=self.alpha)
def __call__(self, y, raw_predictions, sample_weight=None):
"""Compute the Quantile loss.
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble.
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
raw_predictions = raw_predictions.ravel()
diff = y - raw_predictions
alpha = self.alpha
mask = y > raw_predictions
if sample_weight is None:
loss = (alpha * diff[mask].sum() -
(1 - alpha) * diff[~mask].sum()) / y.shape[0]
else:
loss = ((alpha * np.sum(sample_weight[mask] * diff[mask]) -
(1 - alpha) * np.sum(sample_weight[~mask] *
diff[~mask])) / sample_weight.sum())
return loss
def negative_gradient(self, y, raw_predictions, **kargs):
"""Compute the negative gradient.
Parameters
----------
y : ndarray of shape (n_samples,)
The target labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
"""
alpha = self.alpha
raw_predictions = raw_predictions.ravel()
mask = y > raw_predictions
return (alpha * mask) - ((1 - alpha) * ~mask)
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
residual, raw_predictions, sample_weight):
terminal_region = np.where(terminal_regions == leaf)[0]
diff = (y.take(terminal_region, axis=0)
- raw_predictions.take(terminal_region, axis=0))
sample_weight = sample_weight.take(terminal_region, axis=0)
val = _weighted_percentile(diff, sample_weight, self.percentile)
tree.value[leaf, 0] = val
class ClassificationLossFunction(LossFunction, metaclass=ABCMeta):
"""Base class for classification loss functions. """
def _raw_prediction_to_proba(self, raw_predictions):
"""Template method to convert raw predictions into probabilities.
Parameters
----------
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble.
Returns
-------
probas : ndarray of shape (n_samples, K)
The predicted probabilities.
"""
@abstractmethod
def _raw_prediction_to_decision(self, raw_predictions):
"""Template method to convert raw predictions to decisions.
Parameters
----------
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble.
Returns
-------
encoded_predictions : ndarray of shape (n_samples, K)
The predicted encoded labels.
"""
def check_init_estimator(self, estimator):
"""Make sure estimator has fit and predict_proba methods.
Parameters
----------
estimator : object
The init estimator to check.
"""
if not (hasattr(estimator, 'fit') and
hasattr(estimator, 'predict_proba')):
raise ValueError(
"The init parameter must be a valid estimator "
"and support both fit and predict_proba."
)
class BinomialDeviance(ClassificationLossFunction):
"""Binomial deviance loss function for binary classification.
Binary classification is a special case; here, we only need to
fit one tree instead of ``n_classes`` trees.
Parameters
----------
n_classes : int
Number of classes.
"""
def __init__(self, n_classes):
if n_classes != 2:
raise ValueError("{0:s} requires 2 classes; got {1:d} class(es)"
.format(self.__class__.__name__, n_classes))
# we only need to fit one tree for binary clf.
super().__init__(n_classes=1)
def init_estimator(self):
# return the most common class, taking into account the samples
# weights
return DummyClassifier(strategy='prior')
def __call__(self, y, raw_predictions, sample_weight=None):
"""Compute the deviance (= 2 * negative log-likelihood).
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble.
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
# logaddexp(0, v) == log(1.0 + exp(v))
raw_predictions = raw_predictions.ravel()
if sample_weight is None:
return -2 * np.mean((y * raw_predictions) -
np.logaddexp(0, raw_predictions))
else:
return (-2 / sample_weight.sum() * np.sum(
sample_weight * ((y * raw_predictions) -
np.logaddexp(0, raw_predictions))))
def negative_gradient(self, y, raw_predictions, **kargs):
"""Compute the residual (= negative gradient).
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
"""
return y - expit(raw_predictions.ravel())
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
residual, raw_predictions, sample_weight):
"""Make a single Newton-Raphson step.
our node estimate is given by:
sum(w * (y - prob)) / sum(w * prob * (1 - prob))
we take advantage that: y - prob = residual
"""
terminal_region = np.where(terminal_regions == leaf)[0]
residual = residual.take(terminal_region, axis=0)
y = y.take(terminal_region, axis=0)
sample_weight = sample_weight.take(terminal_region, axis=0)
numerator = np.sum(sample_weight * residual)
denominator = np.sum(sample_weight *
(y - residual) * (1 - y + residual))
# prevents overflow and division by zero
if abs(denominator) < 1e-150:
tree.value[leaf, 0, 0] = 0.0
else:
tree.value[leaf, 0, 0] = numerator / denominator
def _raw_prediction_to_proba(self, raw_predictions):
proba = np.ones((raw_predictions.shape[0], 2), dtype=np.float64)
proba[:, 1] = expit(raw_predictions.ravel())
proba[:, 0] -= proba[:, 1]
return proba
def _raw_prediction_to_decision(self, raw_predictions):
proba = self._raw_prediction_to_proba(raw_predictions)
return np.argmax(proba, axis=1)
def get_init_raw_predictions(self, X, estimator):
probas = estimator.predict_proba(X)
proba_pos_class = probas[:, 1]
eps = np.finfo(np.float32).eps
proba_pos_class = np.clip(proba_pos_class, eps, 1 - eps)
# log(x / (1 - x)) is the inverse of the sigmoid (expit) function
raw_predictions = np.log(proba_pos_class / (1 - proba_pos_class))
return raw_predictions.reshape(-1, 1).astype(np.float64)
class MultinomialDeviance(ClassificationLossFunction):
"""Multinomial deviance loss function for multi-class classification.
For multi-class classification we need to fit ``n_classes`` trees at
each stage.
Parameters
----------
n_classes : int
Number of classes.
"""
is_multi_class = True
def __init__(self, n_classes):
if n_classes < 3:
raise ValueError("{0:s} requires more than 2 classes.".format(
self.__class__.__name__))
super().__init__(n_classes)
def init_estimator(self):
return DummyClassifier(strategy='prior')
def __call__(self, y, raw_predictions, sample_weight=None):
"""Compute the Multinomial deviance.
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble.
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
# create one-hot label encoding
Y = np.zeros((y.shape[0], self.K), dtype=np.float64)
for k in range(self.K):
Y[:, k] = y == k
return np.average(
-1 * (Y * raw_predictions).sum(axis=1) +
logsumexp(raw_predictions, axis=1),
weights=sample_weight
)
def negative_gradient(self, y, raw_predictions, k=0, **kwargs):
"""Compute negative gradient for the ``k``-th class.
Parameters
----------
y : ndarray of shape (n_samples,)
The target labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
k : int, default=0
The index of the class.
"""
return y - np.nan_to_num(np.exp(raw_predictions[:, k] -
logsumexp(raw_predictions, axis=1)))
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
residual, raw_predictions, sample_weight):
"""Make a single Newton-Raphson step. """
terminal_region = np.where(terminal_regions == leaf)[0]
residual = residual.take(terminal_region, axis=0)
y = y.take(terminal_region, axis=0)
sample_weight = sample_weight.take(terminal_region, axis=0)
numerator = np.sum(sample_weight * residual)
numerator *= (self.K - 1) / self.K
denominator = np.sum(sample_weight * (y - residual) *
(1 - y + residual))
# prevents overflow and division by zero
if abs(denominator) < 1e-150:
tree.value[leaf, 0, 0] = 0.0
else:
tree.value[leaf, 0, 0] = numerator / denominator
def _raw_prediction_to_proba(self, raw_predictions):
return np.nan_to_num(
np.exp(raw_predictions -
(logsumexp(raw_predictions, axis=1)[:, np.newaxis])))
def _raw_prediction_to_decision(self, raw_predictions):
proba = self._raw_prediction_to_proba(raw_predictions)
return np.argmax(proba, axis=1)
def get_init_raw_predictions(self, X, estimator):
probas = estimator.predict_proba(X)
eps = np.finfo(np.float32).eps
probas = np.clip(probas, eps, 1 - eps)
raw_predictions = np.log(probas).astype(np.float64)
return raw_predictions
class ExponentialLoss(ClassificationLossFunction):
"""Exponential loss function for binary classification.
Same loss as AdaBoost.
Parameters
----------
n_classes : int
Number of classes.
References
----------
Greg Ridgeway, Generalized Boosted Models: A guide to the gbm package, 2007
"""
def __init__(self, n_classes):
if n_classes != 2:
raise ValueError("{0:s} requires 2 classes; got {1:d} class(es)"
.format(self.__class__.__name__, n_classes))
# we only need to fit one tree for binary clf.
super().__init__(n_classes=1)
def init_estimator(self):
return DummyClassifier(strategy='prior')
def __call__(self, y, raw_predictions, sample_weight=None):
"""Compute the exponential loss
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble.
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
raw_predictions = raw_predictions.ravel()
if sample_weight is None:
return np.mean(np.exp(-(2. * y - 1.) * raw_predictions))
else:
return (1.0 / sample_weight.sum() * np.sum(
sample_weight * np.exp(-(2 * y - 1) * raw_predictions)))
def negative_gradient(self, y, raw_predictions, **kargs):
"""Compute the residual (= negative gradient).
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
"""
y_ = -(2. * y - 1.)
return y_ * np.exp(y_ * raw_predictions.ravel())
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
residual, raw_predictions, sample_weight):
terminal_region = np.where(terminal_regions == leaf)[0]
raw_predictions = raw_predictions.take(terminal_region, axis=0)
y = y.take(terminal_region, axis=0)
sample_weight = sample_weight.take(terminal_region, axis=0)
y_ = 2. * y - 1.
numerator = np.sum(y_ * sample_weight * np.exp(-y_ * raw_predictions))
denominator = np.sum(sample_weight * np.exp(-y_ * raw_predictions))
# prevents overflow and division by zero
if abs(denominator) < 1e-150:
tree.value[leaf, 0, 0] = 0.0
else:
tree.value[leaf, 0, 0] = numerator / denominator
def _raw_prediction_to_proba(self, raw_predictions):
proba = np.ones((raw_predictions.shape[0], 2), dtype=np.float64)
proba[:, 1] = expit(2.0 * raw_predictions.ravel())
proba[:, 0] -= proba[:, 1]
return proba
def _raw_prediction_to_decision(self, raw_predictions):
return (raw_predictions.ravel() >= 0).astype(int)
def get_init_raw_predictions(self, X, estimator):
probas = estimator.predict_proba(X)
proba_pos_class = probas[:, 1]
eps = np.finfo(np.float32).eps
proba_pos_class = np.clip(proba_pos_class, eps, 1 - eps)
# according to The Elements of Statistical Learning sec. 10.5, the
# minimizer of the exponential loss is .5 * log odds ratio. So this is
# the equivalent to .5 * binomial_deviance.get_init_raw_predictions()
raw_predictions = .5 * np.log(proba_pos_class / (1 - proba_pos_class))
return raw_predictions.reshape(-1, 1).astype(np.float64)
LOSS_FUNCTIONS = {
'ls': LeastSquaresError,
'lad': LeastAbsoluteError,
'huber': HuberLossFunction,
'quantile': QuantileLossFunction,
'deviance': None, # for both, multinomial and binomial
'exponential': ExponentialLoss,
}
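
A hedged sketch of the loss API above on toy data, written against the signatures in this file (all of these classes are private; the module path sklearn.ensemble._gb_losses is assumed):

# Hedged sketch of the loss objects above (private API, path assumed).
import numpy as np
from sklearn.ensemble._gb_losses import LOSS_FUNCTIONS, BinomialDeviance

y = np.array([3.0, 1.0, 2.0])
raw = np.zeros((3, 1))  # shape (n_samples, K); K = 1 for regression

ls = LOSS_FUNCTIONS['ls'](n_classes=1)   # LeastSquaresError
print(ls(y, raw))                        # mean squared error: ~4.667
print(ls.negative_gradient(y, raw))      # residuals y - raw: [3. 1. 2.]

y_bin = np.array([0.0, 1.0, 1.0])
bd = BinomialDeviance(n_classes=2)       # binary case fits a single tree
print(bd(y_bin, np.zeros((3, 1))))       # 2 * log(2) at raw predictions 0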


@@ -0,0 +1,5 @@
"""This module implements histogram-based gradient boosting estimators.
The implementation is a port from pygbm which is itself strongly inspired
from LightGBM.
"""


@@ -0,0 +1,204 @@
"""
This module contains the BinMapper class.
BinMapper is used for mapping a real-valued dataset into integer-valued bins.
Bin thresholds are computed with the quantiles so that each bin contains
approximately the same number of samples.
"""
# Author: Nicolas Hug
import numpy as np
from ...utils import check_random_state, check_array
from ...base import BaseEstimator, TransformerMixin
from ...utils.validation import check_is_fitted
from ._binning import _map_to_bins
from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF
def _find_binning_thresholds(data, max_bins, subsample, random_state):
"""Extract feature-wise quantiles from numerical data.
Missing values are ignored for finding the thresholds.
Parameters
----------
data : array-like, shape (n_samples, n_features)
The data to bin.
max_bins: int
The maximum number of bins to use for non-missing values. If for a
given feature the number of unique values is less than ``max_bins``,
then those unique values will be used to compute the bin thresholds,
instead of the quantiles.
subsample : int or None
If ``n_samples > subsample``, then ``subsample`` samples will be
randomly chosen to compute the quantiles. If ``None``, the whole data
is used.
random_state: int, RandomState instance or None
Pseudo-random number generator to control the random sub-sampling.
Pass an int for reproducible output across multiple
function calls.
See :term:`Glossary <random_state>`.
Returns
------
binning_thresholds: list of arrays
For each feature, stores the increasing numeric values that can
be used to separate the bins. Thus ``len(binning_thresholds) ==
n_features``.
"""
rng = check_random_state(random_state)
if subsample is not None and data.shape[0] > subsample:
subset = rng.choice(data.shape[0], subsample, replace=False)
data = data.take(subset, axis=0)
binning_thresholds = []
for f_idx in range(data.shape[1]):
col_data = data[:, f_idx]
# ignore missing values when computing bin thresholds
missing_mask = np.isnan(col_data)
if missing_mask.any():
col_data = col_data[~missing_mask]
col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)
distinct_values = np.unique(col_data)
if len(distinct_values) <= max_bins:
midpoints = distinct_values[:-1] + distinct_values[1:]
midpoints *= .5
else:
# We sort again the data in this case. We could compute
# approximate midpoint percentiles using the output of
# np.unique(col_data, return_counts) instead but this is more
# work and the performance benefit will be limited because we
# work on a fixed-size subsample of the full data.
percentiles = np.linspace(0, 100, num=max_bins + 1)
percentiles = percentiles[1:-1]
midpoints = np.percentile(col_data, percentiles,
interpolation='midpoint').astype(X_DTYPE)
assert midpoints.shape[0] == max_bins - 1
# We avoid having +inf thresholds: +inf thresholds are only allowed in
# a "split on nan" situation.
np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints)
binning_thresholds.append(midpoints)
return binning_thresholds
class _BinMapper(TransformerMixin, BaseEstimator):
"""Transformer that maps a dataset into integer-valued bins.
The bins are created in a feature-wise fashion, using quantiles so that
each bin contains approximately the same number of samples.
For large datasets, quantiles are computed on a subset of the data to
speed up the binning, but the quantiles should remain stable.
Features with a small number of values may be binned into fewer than
``n_bins`` bins. The last bin (at index ``n_bins - 1``) is always reserved
for missing values.
Parameters
----------
n_bins : int, optional (default=256)
The maximum number of bins to use (including the bin for missing
values). Non-missing values are binned on ``max_bins = n_bins - 1``
bins. The last bin is always reserved for missing values. If for a
given feature the number of unique values is less than ``max_bins``,
then those unique values will be used to compute the bin thresholds,
instead of the quantiles.
subsample : int or None, optional (default=2e5)
If ``n_samples > subsample``, then ``subsample`` samples will be
randomly chosen to compute the quantiles. If ``None``, the whole data
is used.
random_state: int, RandomState instance or None
Pseudo-random number generator to control the random sub-sampling.
Pass an int for reproducible output across multiple
function calls.
See :term:`Glossary <random_state>`.
Attributes
----------
bin_thresholds_ : list of arrays
For each feature, gives the real-valued bin thresholds. There are
``max_bins - 1`` thresholds, where ``max_bins = n_bins - 1`` is the
number of bins used for non-missing values.
n_bins_non_missing_ : array of uint32
For each feature, gives the number of bins actually used for
non-missing values. For features with a lot of unique values, this is
equal to ``n_bins - 1``.
missing_values_bin_idx_ : uint8
The index of the bin where missing values are mapped. This is a
constant across all features. This corresponds to the last bin, and
it is always equal to ``n_bins - 1``. Note that if ``n_bins_non_missing_``
is less than ``n_bins - 1`` for a given feature, then there are
empty (and unused) bins.
"""
def __init__(self, n_bins=256, subsample=int(2e5), random_state=None):
self.n_bins = n_bins
self.subsample = subsample
self.random_state = random_state
def fit(self, X, y=None):
"""Fit data X by computing the binning thresholds.
The last bin is reserved for missing values, whether missing values
are present in the data or not.
Parameters
----------
X : array-like, shape (n_samples, n_features)
The data to bin.
y: None
Ignored.
Returns
-------
self : object
"""
if not (3 <= self.n_bins <= 256):
# min is 3: at least 2 distinct bins and a missing values bin
raise ValueError('n_bins={} should be no smaller than 3 '
'and no larger than 256.'.format(self.n_bins))
X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
max_bins = self.n_bins - 1
self.bin_thresholds_ = _find_binning_thresholds(
X, max_bins, subsample=self.subsample,
random_state=self.random_state)
self.n_bins_non_missing_ = np.array(
[thresholds.shape[0] + 1 for thresholds in self.bin_thresholds_],
dtype=np.uint32)
self.missing_values_bin_idx_ = self.n_bins - 1
return self
def transform(self, X):
"""Bin data X.
Missing values will be mapped to the last bin.
Parameters
----------
X : array-like, shape (n_samples, n_features)
The data to bin.
Returns
-------
X_binned : array-like, shape (n_samples, n_features)
The binned data (fortran-aligned).
"""
X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
check_is_fitted(self)
if X.shape[1] != self.n_bins_non_missing_.shape[0]:
raise ValueError(
'This estimator was fitted with {} features but {} got passed '
'to transform()'.format(self.n_bins_non_missing_.shape[0],
X.shape[1])
)
binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F')
_map_to_bins(X, self.bin_thresholds_, self.missing_values_bin_idx_,
binned)
return binned
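
A hedged sketch of the mapper above on toy data, using the private import path this file lives at (sklearn.ensemble._hist_gradient_boosting.binning):

# Hedged sketch of _BinMapper (private class, path assumed).
import numpy as np
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper

rng = np.random.RandomState(0)
X = rng.randn(1000, 2)
X[::50, 0] = np.nan  # some missing values in the first feature

mapper = _BinMapper(n_bins=16).fit(X)
X_binned = mapper.transform(X)
print(X_binned.dtype)        # uint8
print(X_binned.max(axis=0))  # missing values go to the last bin (15)
print([t.shape[0] for t in mapper.bin_thresholds_])  # 14 thresholds each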


@@ -0,0 +1,40 @@
# cython: language_level=3
import numpy as np
cimport numpy as np
np.import_array()
ctypedef np.npy_float64 X_DTYPE_C
ctypedef np.npy_uint8 X_BINNED_DTYPE_C
ctypedef np.npy_float64 Y_DTYPE_C
ctypedef np.npy_float32 G_H_DTYPE_C
cdef packed struct hist_struct:
# Same as histogram dtype but we need a struct to declare views. It needs
# to be packed since by default numpy dtypes aren't aligned
Y_DTYPE_C sum_gradients
Y_DTYPE_C sum_hessians
unsigned int count
cdef packed struct node_struct:
# Equivalent struct to PREDICTOR_RECORD_DTYPE to use in memory views. It
# needs to be packed since by default numpy dtypes aren't aligned
Y_DTYPE_C value
unsigned int count
unsigned int feature_idx
X_DTYPE_C threshold
unsigned char missing_go_to_left
unsigned int left
unsigned int right
Y_DTYPE_C gain
unsigned int depth
unsigned char is_leaf
X_BINNED_DTYPE_C bin_threshold
cpdef enum MonotonicConstraint:
NO_CST = 0
POS = 1
NEG = -1
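
The structs above are declared ``packed`` because the numpy structured dtypes they mirror are unaligned by default; a small numpy sketch of the resulting size difference (field names follow ``hist_struct`` for illustration):

# Sketch: numpy structured dtypes are packed (unaligned) by default,
# so the matching Cython structs must be declared `packed` as well.
import numpy as np

HISTOGRAM_DTYPE = np.dtype([
    ('sum_gradients', np.float64),  # Y_DTYPE_C
    ('sum_hessians', np.float64),   # Y_DTYPE_C
    ('count', np.uint32),           # unsigned int
])
print(HISTOGRAM_DTYPE.itemsize)                              # 20, no padding
print(np.dtype(HISTOGRAM_DTYPE.descr, align=True).itemsize)  # 24, padded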


@@ -0,0 +1,571 @@
"""
This module contains the TreeGrower class.
TreeGrower builds a regression tree fitting a Newton-Raphson step, based on
the gradients and hessians of the training data.
"""
# Author: Nicolas Hug
from heapq import heappush, heappop
import numpy as np
from timeit import default_timer as time
import numbers
from .splitting import Splitter
from .histogram import HistogramBuilder
from .predictor import TreePredictor
from .utils import sum_parallel
from .common import PREDICTOR_RECORD_DTYPE
from .common import Y_DTYPE
from .common import MonotonicConstraint
EPS = np.finfo(Y_DTYPE).eps # to avoid zero division errors
class TreeNode:
"""Tree Node class used in TreeGrower.
This isn't used for prediction purposes, only for training (see
TreePredictor).
Parameters
----------
depth : int
The depth of the node, i.e. its distance from the root.
sample_indices : ndarray of unsigned int, shape (n_samples_at_node,)
The indices of the samples at the node.
sum_gradients : float
The sum of the gradients of the samples at the node.
sum_hessians : float
The sum of the hessians of the samples at the node.
parent : TreeNode or None, optional (default=None)
The parent of the node. None for root.
Attributes
----------
depth : int
The depth of the node, i.e. its distance from the root.
sample_indices : ndarray of unsigned int, shape (n_samples_at_node,)
The indices of the samples at the node.
sum_gradients : float
The sum of the gradients of the samples at the node.
sum_hessians : float
The sum of the hessians of the samples at the node.
parent : TreeNode or None
The parent of the node. None for root.
split_info : SplitInfo or None
The result of the split evaluation.
left_child : TreeNode or None
The left child of the node. None for leaves.
right_child : TreeNode or None
The right child of the node. None for leaves.
value : float or None
The value of the leaf, as computed in finalize_leaf(). None for
non-leaf nodes.
partition_start : int
start position of the node's sample_indices in splitter.partition.
partition_stop : int
stop position of the node's sample_indices in splitter.partition.
"""
split_info = None
left_child = None
right_child = None
histograms = None
sibling = None
parent = None
# start and stop indices of the node in the splitter.partition
# array. Concretely,
# self.sample_indices = view(self.splitter.partition[start:stop])
# Please see the comments about splitter.partition and
# splitter.split_indices for more info about this design.
# These 2 attributes are only used in _update_raw_prediction, because we
# need to iterate over the leaves and I don't know how to efficiently
# store the sample_indices views because they're all of different sizes.
partition_start = 0
partition_stop = 0
def __init__(self, depth, sample_indices, sum_gradients,
sum_hessians, parent=None, value=None):
self.depth = depth
self.sample_indices = sample_indices
self.n_samples = sample_indices.shape[0]
self.sum_gradients = sum_gradients
self.sum_hessians = sum_hessians
self.parent = parent
self.value = value
self.is_leaf = False
self.set_children_bounds(float('-inf'), float('+inf'))
def set_children_bounds(self, lower, upper):
"""Set children values bounds to respect monotonic constraints."""
# These are bounds for the node's *children* values, not the node's
# value. The bounds are used in the splitter when considering potential
# left and right child.
self.children_lower_bound = lower
self.children_upper_bound = upper
def __lt__(self, other_node):
"""Comparison for priority queue.
Nodes with high gain are higher priority than nodes with low gain.
heapq.heappush only needs the '<' operator.
heapq.heappop takes the smallest item first (smaller is higher
priority).
Parameters
----------
other_node : TreeNode
The node to compare with.
"""
return self.split_info.gain > other_node.split_info.gain
class TreeGrower:
"""Tree grower class used to build a tree.
The tree is fitted to predict the values of a Newton-Raphson step. The
splits are considered in a best-first fashion, and the quality of a
split is defined in splitting._split_gain.
Parameters
----------
X_binned : ndarray of int, shape (n_samples, n_features)
The binned input samples. Must be Fortran-aligned.
gradients : ndarray, shape (n_samples,)
The gradients of each training sample. Those are the gradients of the
loss w.r.t the predictions, evaluated at iteration ``i - 1``.
hessians : ndarray, shape (n_samples,)
The hessians of each training sample. Those are the hessians of the
loss w.r.t the predictions, evaluated at iteration ``i - 1``.
max_leaf_nodes : int or None, optional (default=None)
The maximum number of leaves for each tree. If None, there is no
maximum limit.
max_depth : int or None, optional (default=None)
The maximum depth of each tree. The depth of a tree is the number of
edges to go from the root to the deepest leaf.
Depth isn't constrained by default.
min_samples_leaf : int, optional (default=20)
The minimum number of samples per leaf.
min_gain_to_split : float, optional (default=0.)
The minimum gain needed to split a node. Splits with lower gain will
be ignored.
n_bins : int, optional (default=256)
The total number of bins, including the bin for missing values. Used
to define the shape of the histograms.
n_bins_non_missing : ndarray of uint32, int or None, optional (default=None)
For each feature, gives the number of bins actually used for
non-missing values. For features with a lot of unique values, this
is equal to ``n_bins - 1``. If it's an int, all features are
considered to have the same number of bins. If None, all features
are considered to have ``n_bins - 1`` bins.
has_missing_values : ndarray of bool or bool, optional (default=False)
Whether each feature contains missing values (in the training data).
If it's a bool, the same value is used for all features.
l2_regularization : float, optional (default=0)
The L2 regularization parameter.
min_hessian_to_split : float, optional (default=1e-3)
The minimum sum of hessians needed in each node. Splits that result in
at least one child having a sum of hessians less than
``min_hessian_to_split`` are discarded.
shrinkage : float, optional (default=1)
The shrinkage parameter to apply to the leaves values, also known as
learning rate.
"""
def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None,
max_depth=None, min_samples_leaf=20, min_gain_to_split=0.,
n_bins=256, n_bins_non_missing=None, has_missing_values=False,
monotonic_cst=None, l2_regularization=0.,
min_hessian_to_split=1e-3, shrinkage=1.):
self._validate_parameters(X_binned, max_leaf_nodes, max_depth,
min_samples_leaf, min_gain_to_split,
l2_regularization, min_hessian_to_split)
if n_bins_non_missing is None:
n_bins_non_missing = n_bins - 1
if isinstance(n_bins_non_missing, numbers.Integral):
n_bins_non_missing = np.array(
[n_bins_non_missing] * X_binned.shape[1],
dtype=np.uint32)
else:
n_bins_non_missing = np.asarray(n_bins_non_missing,
dtype=np.uint32)
if isinstance(has_missing_values, bool):
has_missing_values = [has_missing_values] * X_binned.shape[1]
has_missing_values = np.asarray(has_missing_values, dtype=np.uint8)
if monotonic_cst is None:
self.with_monotonic_cst = False
monotonic_cst = np.full(shape=X_binned.shape[1],
fill_value=MonotonicConstraint.NO_CST,
dtype=np.int8)
else:
self.with_monotonic_cst = True
monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8)
if monotonic_cst.shape[0] != X_binned.shape[1]:
raise ValueError(
"monotonic_cst has shape {} but the input data "
"X has {} features.".format(
monotonic_cst.shape[0], X_binned.shape[1]
)
)
if np.any(monotonic_cst < -1) or np.any(monotonic_cst > 1):
raise ValueError(
"monotonic_cst must be None or an array-like of "
"-1, 0 or 1."
)
hessians_are_constant = hessians.shape[0] == 1
self.histogram_builder = HistogramBuilder(
X_binned, n_bins, gradients, hessians, hessians_are_constant)
missing_values_bin_idx = n_bins - 1
self.splitter = Splitter(
X_binned, n_bins_non_missing, missing_values_bin_idx,
has_missing_values, monotonic_cst,
l2_regularization, min_hessian_to_split,
min_samples_leaf, min_gain_to_split, hessians_are_constant)
self.n_bins_non_missing = n_bins_non_missing
self.max_leaf_nodes = max_leaf_nodes
self.has_missing_values = has_missing_values
self.monotonic_cst = monotonic_cst
self.l2_regularization = l2_regularization
self.n_features = X_binned.shape[1]
self.max_depth = max_depth
self.min_samples_leaf = min_samples_leaf
self.X_binned = X_binned
self.min_gain_to_split = min_gain_to_split
self.shrinkage = shrinkage
self.splittable_nodes = []
self.finalized_leaves = []
self.total_find_split_time = 0. # time spent finding the best splits
self.total_compute_hist_time = 0. # time spent computing histograms
self.total_apply_split_time = 0. # time spent splitting nodes
self._initialize_root(gradients, hessians, hessians_are_constant)
self.n_nodes = 1
def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth,
min_samples_leaf, min_gain_to_split,
l2_regularization, min_hessian_to_split):
"""Validate parameters passed to __init__.
Also validate parameters passed to splitter.
"""
if X_binned.dtype != np.uint8:
raise NotImplementedError(
"X_binned must be of type uint8.")
if not X_binned.flags.f_contiguous:
raise ValueError(
"X_binned should be passed as Fortran contiguous "
"array for maximum efficiency.")
if max_leaf_nodes is not None and max_leaf_nodes <= 1:
raise ValueError('max_leaf_nodes={} should not be'
' smaller than 2'.format(max_leaf_nodes))
if max_depth is not None and max_depth < 1:
raise ValueError('max_depth={} should not be'
' smaller than 1'.format(max_depth))
if min_samples_leaf < 1:
raise ValueError('min_samples_leaf={} should '
'not be smaller than 1'.format(min_samples_leaf))
if min_gain_to_split < 0:
raise ValueError('min_gain_to_split={} '
'must be positive.'.format(min_gain_to_split))
if l2_regularization < 0:
raise ValueError('l2_regularization={} must be '
'positive.'.format(l2_regularization))
if min_hessian_to_split < 0:
raise ValueError('min_hessian_to_split={} '
'must be positive.'.format(min_hessian_to_split))
def grow(self):
"""Grow the tree, from root to leaves."""
while self.splittable_nodes:
self.split_next()
self._apply_shrinkage()
def _apply_shrinkage(self):
"""Multiply leaves values by shrinkage parameter.
This must be done at the very end of the growing process. If this were
done during the growing process e.g. in finalize_leaf(), then a leaf
would be shrunk but its sibling would potentially not be (if it's a
non-leaf), which would lead to a wrong computation of the 'middle'
value needed to enforce the monotonic constraints.
"""
for leaf in self.finalized_leaves:
leaf.value *= self.shrinkage
def _initialize_root(self, gradients, hessians, hessians_are_constant):
"""Initialize root node and finalize it if needed."""
n_samples = self.X_binned.shape[0]
depth = 0
sum_gradients = sum_parallel(gradients)
if self.histogram_builder.hessians_are_constant:
sum_hessians = hessians[0] * n_samples
else:
sum_hessians = sum_parallel(hessians)
self.root = TreeNode(
depth=depth,
sample_indices=self.splitter.partition,
sum_gradients=sum_gradients,
sum_hessians=sum_hessians,
value=0
)
self.root.partition_start = 0
self.root.partition_stop = n_samples
if self.root.n_samples < 2 * self.min_samples_leaf:
# Do not even bother computing any splitting statistics.
self._finalize_leaf(self.root)
return
if sum_hessians < self.splitter.min_hessian_to_split:
self._finalize_leaf(self.root)
return
self.root.histograms = self.histogram_builder.compute_histograms_brute(
self.root.sample_indices)
self._compute_best_split_and_push(self.root)
def _compute_best_split_and_push(self, node):
"""Compute the best possible split (SplitInfo) of a given node.
Also push it in the heap of splittable nodes if gain isn't zero.
The gain of a node is 0 if either all the leaves are pure
(best gain = 0), or if no split would satisfy the constraints,
(min_hessians_to_split, min_gain_to_split, min_samples_leaf)
"""
node.split_info = self.splitter.find_node_split(
node.n_samples, node.histograms, node.sum_gradients,
node.sum_hessians, node.value, node.children_lower_bound,
node.children_upper_bound)
if node.split_info.gain <= 0: # no valid split
self._finalize_leaf(node)
else:
heappush(self.splittable_nodes, node)
def split_next(self):
"""Split the node with highest potential gain.
Returns
-------
left : TreeNode
The resulting left child.
right : TreeNode
The resulting right child.
"""
# Consider the node with the highest loss reduction (a.k.a. gain)
node = heappop(self.splittable_nodes)
tic = time()
(sample_indices_left,
sample_indices_right,
right_child_pos) = self.splitter.split_indices(node.split_info,
node.sample_indices)
self.total_apply_split_time += time() - tic
depth = node.depth + 1
n_leaf_nodes = len(self.finalized_leaves) + len(self.splittable_nodes)
n_leaf_nodes += 2
left_child_node = TreeNode(depth,
sample_indices_left,
node.split_info.sum_gradient_left,
node.split_info.sum_hessian_left,
parent=node,
value=node.split_info.value_left,
)
right_child_node = TreeNode(depth,
sample_indices_right,
node.split_info.sum_gradient_right,
node.split_info.sum_hessian_right,
parent=node,
value=node.split_info.value_right,
)
left_child_node.sibling = right_child_node
right_child_node.sibling = left_child_node
node.right_child = right_child_node
node.left_child = left_child_node
# set start and stop indices
left_child_node.partition_start = node.partition_start
left_child_node.partition_stop = node.partition_start + right_child_pos
right_child_node.partition_start = left_child_node.partition_stop
right_child_node.partition_stop = node.partition_stop
if not self.has_missing_values[node.split_info.feature_idx]:
# If no missing values are encountered at fit time, then samples
# with missing values during predict() will go to whichever child
# has the most samples.
node.split_info.missing_go_to_left = (
left_child_node.n_samples > right_child_node.n_samples)
self.n_nodes += 2
if (self.max_leaf_nodes is not None
and n_leaf_nodes == self.max_leaf_nodes):
self._finalize_leaf(left_child_node)
self._finalize_leaf(right_child_node)
self._finalize_splittable_nodes()
return left_child_node, right_child_node
if self.max_depth is not None and depth == self.max_depth:
self._finalize_leaf(left_child_node)
self._finalize_leaf(right_child_node)
return left_child_node, right_child_node
if left_child_node.n_samples < self.min_samples_leaf * 2:
self._finalize_leaf(left_child_node)
if right_child_node.n_samples < self.min_samples_leaf * 2:
self._finalize_leaf(right_child_node)
if self.with_monotonic_cst:
# Set value bounds for respecting monotonic constraints
# See test_nodes_values() for details
if (self.monotonic_cst[node.split_info.feature_idx] ==
MonotonicConstraint.NO_CST):
lower_left = lower_right = node.children_lower_bound
upper_left = upper_right = node.children_upper_bound
else:
mid = (left_child_node.value + right_child_node.value) / 2
if (self.monotonic_cst[node.split_info.feature_idx] ==
MonotonicConstraint.POS):
lower_left, upper_left = node.children_lower_bound, mid
lower_right, upper_right = mid, node.children_upper_bound
else: # NEG
lower_left, upper_left = mid, node.children_upper_bound
lower_right, upper_right = node.children_lower_bound, mid
left_child_node.set_children_bounds(lower_left, upper_left)
right_child_node.set_children_bounds(lower_right, upper_right)
# Compute histograms of children, and compute their best possible split
# (if needed)
should_split_left = not left_child_node.is_leaf
should_split_right = not right_child_node.is_leaf
if should_split_left or should_split_right:
# We will compute the histograms of both nodes even if one of them
# is a leaf, since computing the second histogram is very cheap
# (using histogram subtraction).
n_samples_left = left_child_node.sample_indices.shape[0]
n_samples_right = right_child_node.sample_indices.shape[0]
if n_samples_left < n_samples_right:
smallest_child = left_child_node
largest_child = right_child_node
else:
smallest_child = right_child_node
largest_child = left_child_node
# We use the brute O(n_samples) method on the child that has the
# smallest number of samples, and the subtraction trick O(n_bins)
# on the other one.
tic = time()
smallest_child.histograms = \
self.histogram_builder.compute_histograms_brute(
smallest_child.sample_indices)
largest_child.histograms = \
self.histogram_builder.compute_histograms_subtraction(
node.histograms, smallest_child.histograms)
self.total_compute_hist_time += time() - tic
tic = time()
if should_split_left:
self._compute_best_split_and_push(left_child_node)
if should_split_right:
self._compute_best_split_and_push(right_child_node)
self.total_find_split_time += time() - tic
return left_child_node, right_child_node
def _finalize_leaf(self, node):
"""Make node a leaf of the tree being grown."""
node.is_leaf = True
self.finalized_leaves.append(node)
def _finalize_splittable_nodes(self):
"""Transform all splittable nodes into leaves.
Used when some constraint is met e.g. maximum number of leaves or
maximum depth."""
while len(self.splittable_nodes) > 0:
node = self.splittable_nodes.pop()
self._finalize_leaf(node)
def make_predictor(self, bin_thresholds=None):
"""Make a TreePredictor object out of the current tree.
Parameters
----------
bin_thresholds : array-like of floats, optional (default=None)
The actual threshold values of each bin.
Returns
-------
A TreePredictor object.
"""
predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE)
_fill_predictor_node_array(predictor_nodes, self.root,
bin_thresholds, self.n_bins_non_missing)
return TreePredictor(predictor_nodes)
def _fill_predictor_node_array(predictor_nodes, grower_node,
bin_thresholds, n_bins_non_missing,
next_free_idx=0):
"""Helper used in make_predictor to set the TreePredictor fields."""
node = predictor_nodes[next_free_idx]
node['count'] = grower_node.n_samples
node['depth'] = grower_node.depth
if grower_node.split_info is not None:
node['gain'] = grower_node.split_info.gain
else:
node['gain'] = -1
node['value'] = grower_node.value
if grower_node.is_leaf:
# Leaf node
node['is_leaf'] = True
return next_free_idx + 1
else:
# Decision node
split_info = grower_node.split_info
feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx
node['feature_idx'] = feature_idx
node['bin_threshold'] = bin_idx
node['missing_go_to_left'] = split_info.missing_go_to_left
if split_info.bin_idx == n_bins_non_missing[feature_idx] - 1:
# Split is on the last non-missing bin: it's a "split on nans". All
# nans go to the right, the rest go to the left.
node['threshold'] = np.inf
elif bin_thresholds is not None:
node['threshold'] = bin_thresholds[feature_idx][bin_idx]
next_free_idx += 1
node['left'] = next_free_idx
next_free_idx = _fill_predictor_node_array(
predictor_nodes, grower_node.left_child,
bin_thresholds=bin_thresholds,
n_bins_non_missing=n_bins_non_missing,
next_free_idx=next_free_idx)
node['right'] = next_free_idx
return _fill_predictor_node_array(
predictor_nodes, grower_node.right_child,
bin_thresholds=bin_thresholds,
n_bins_non_missing=n_bins_non_missing,
next_free_idx=next_free_idx)
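
The ``split_next`` method above relies on the identity that a parent's histogram is the element-wise sum of its two children's histograms, which is why only the smaller child needs a brute O(n_samples) pass while the sibling's histogram is recovered by subtraction in O(n_bins). A minimal, self-contained numpy sketch of that identity (toy arrays only, not the actual HistogramBuilder):

import numpy as np

# 8 samples of one feature, already mapped to 4 bins, with their gradients.
binned_feature = np.array([0, 1, 1, 2, 3, 0, 2, 2], dtype=np.uint8)
gradients = np.array([0.5, -1.0, 0.2, 0.3, -0.7, 0.1, 0.4, -0.2])

def brute_histogram(sample_indices, n_bins=4):
    """Sum the gradients per bin with one pass over the given samples."""
    hist = np.zeros(n_bins)
    for i in sample_indices:
        hist[binned_feature[i]] += gradients[i]
    return hist

parent_indices = np.arange(8)
left_indices = np.array([0, 1, 5])         # smaller child: brute pass
right_indices = np.array([2, 3, 4, 6, 7])  # larger child: subtraction trick

hist_right = brute_histogram(parent_indices) - brute_histogram(left_indices)
assert np.allclose(hist_right, brute_histogram(right_indices))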

View file

@@ -0,0 +1,426 @@
"""
This module contains the loss classes.
Specific losses are used for regression, binary classification or multiclass
classification.
"""
# Author: Nicolas Hug
from abc import ABC, abstractmethod
import numpy as np
from scipy.special import expit, logsumexp, xlogy
from .common import Y_DTYPE
from .common import G_H_DTYPE
from ._loss import _update_gradients_least_squares
from ._loss import _update_gradients_hessians_least_squares
from ._loss import _update_gradients_least_absolute_deviation
from ._loss import _update_gradients_hessians_least_absolute_deviation
from ._loss import _update_gradients_hessians_binary_crossentropy
from ._loss import _update_gradients_hessians_categorical_crossentropy
from ._loss import _update_gradients_hessians_poisson
from ...utils.stats import _weighted_percentile
class BaseLoss(ABC):
"""Base class for a loss."""
def __init__(self, hessians_are_constant):
self.hessians_are_constant = hessians_are_constant
def __call__(self, y_true, raw_predictions, sample_weight):
"""Return the weighted average loss"""
return np.average(self.pointwise_loss(y_true, raw_predictions),
weights=sample_weight)
@abstractmethod
def pointwise_loss(self, y_true, raw_predictions):
"""Return loss value for each input"""
# This variable indicates whether the loss requires the leaves values to
# be updated once the tree has been trained. The trees are trained to
# predict a Newton-Raphson step (see grower._finalize_leaf()). But for
# some losses (e.g. least absolute deviation) we need to adjust the tree
# values to account for the "line search" of the gradient descent
# procedure. See the original paper Greedy Function Approximation: A
# Gradient Boosting Machine by Friedman
# (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory.
need_update_leaves_values = False
def init_gradients_and_hessians(self, n_samples, prediction_dim,
sample_weight):
"""Return initial gradients and hessians.
Unless hessians are constant, arrays are initialized with undefined
values.
Parameters
----------
n_samples : int
The number of samples passed to `fit()`.
prediction_dim : int
The dimension of a raw prediction, i.e. the number of trees
built at each iteration. Equals 1 for regression and binary
classification, or K where K is the number of classes for
multiclass classification.
sample_weight : array-like of shape (n_samples,), default=None
Weights of training data.
Returns
-------
gradients : ndarray, shape (prediction_dim, n_samples)
The initial gradients. The array is not initialized.
hessians : ndarray, shape (prediction_dim, n_samples)
If hessians are constant (e.g. for `LeastSquares` loss, the
array is initialized to ``1``. Otherwise, the array is allocated
without being initialized.
"""
shape = (prediction_dim, n_samples)
gradients = np.empty(shape=shape, dtype=G_H_DTYPE)
if self.hessians_are_constant:
# If the hessians are constant, we consider they are equal to 1.
# - This is correct for the half LS loss
# - For LAD loss, hessians are actually 0, but they are always
# ignored anyway.
hessians = np.ones(shape=(1, 1), dtype=G_H_DTYPE)
else:
hessians = np.empty(shape=shape, dtype=G_H_DTYPE)
return gradients, hessians
@abstractmethod
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
"""Return initial predictions (before the first iteration).
Parameters
----------
y_train : ndarray, shape (n_samples,)
The target training values.
sample_weight : array-like of shape (n_samples,), default=None
Weights of training data.
prediction_dim : int
The dimension of one prediction: 1 for binary classification and
regression, n_classes for multiclass classification.
Returns
-------
baseline_prediction : float or ndarray, shape (1, prediction_dim)
The baseline prediction.
"""
@abstractmethod
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
"""Update gradients and hessians arrays, inplace.
The gradients (resp. hessians) are the first (resp. second) order
derivatives of the loss for each sample with respect to the
predictions of model, evaluated at iteration ``i - 1``.
Parameters
----------
gradients : ndarray, shape (prediction_dim, n_samples)
The gradients (treated as OUT array).
hessians : ndarray, shape (prediction_dim, n_samples) or \
(1,)
The hessians (treated as OUT array).
y_true : ndarray, shape (n_samples,)
The true target values of each training sample.
raw_predictions : ndarray, shape (prediction_dim, n_samples)
The raw_predictions (i.e. values from the trees) of the tree
ensemble at iteration ``i - 1``.
sample_weight : array-like of shape (n_samples,), default=None
Weights of training data.
"""
class LeastSquares(BaseLoss):
"""Least squares loss, for regression.
For a given sample x_i, least squares loss is defined as::
loss(x_i) = 0.5 * (y_true_i - raw_pred_i)**2
This actually computes the half least squares loss to simplify
the computation of the gradients and get a unit hessian (and be consistent
with what is done in LightGBM).
"""
def __init__(self, sample_weight):
# If sample weights are provided, the hessians and gradients
# are multiplied by sample_weight, which means the hessians are
# equal to sample weights.
super().__init__(hessians_are_constant=sample_weight is None)
def pointwise_loss(self, y_true, raw_predictions):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
loss = 0.5 * np.power(y_true - raw_predictions, 2)
return loss
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
return np.average(y_train, weights=sample_weight)
@staticmethod
def inverse_link_function(raw_predictions):
return raw_predictions
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
gradients = gradients.reshape(-1)
if sample_weight is None:
_update_gradients_least_squares(gradients, y_true, raw_predictions)
else:
hessians = hessians.reshape(-1)
_update_gradients_hessians_least_squares(gradients, hessians,
y_true, raw_predictions,
sample_weight)
class LeastAbsoluteDeviation(BaseLoss):
"""Least absolute deviation, for regression.
For a given sample x_i, the loss is defined as::
loss(x_i) = |y_true_i - raw_pred_i|
"""
def __init__(self, sample_weight):
# If sample weights are provided, the hessians and gradients
# are multiplied by sample_weight, which means the hessians are
# equal to sample weights.
super().__init__(hessians_are_constant=sample_weight is None)
# This variable indicates whether the loss requires the leaves values to
# be updated once the tree has been trained. The trees are trained to
# predict a Newton-Raphson step (see grower._finalize_leaf()). But for
# some losses (e.g. least absolute deviation) we need to adjust the tree
# values to account for the "line search" of the gradient descent
# procedure. See the original paper Greedy Function Approximation: A
# Gradient Boosting Machine by Friedman
# (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory.
need_update_leaves_values = True
def pointwise_loss(self, y_true, raw_predictions):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
loss = np.abs(y_true - raw_predictions)
return loss
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
if sample_weight is None:
return np.median(y_train)
else:
return _weighted_percentile(y_train, sample_weight, 50)
@staticmethod
def inverse_link_function(raw_predictions):
return raw_predictions
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
gradients = gradients.reshape(-1)
if sample_weight is None:
_update_gradients_least_absolute_deviation(gradients, y_true,
raw_predictions)
else:
hessians = hessians.reshape(-1)
_update_gradients_hessians_least_absolute_deviation(
gradients, hessians, y_true, raw_predictions, sample_weight)
def update_leaves_values(self, grower, y_true, raw_predictions,
sample_weight):
# Update the values predicted by the tree with
# median(y_true - raw_predictions).
# See note about need_update_leaves_values in BaseLoss.
# TODO: ideally this should be computed in parallel over the leaves
# using something similar to _update_raw_predictions(), but this
# requires a cython version of median()
for leaf in grower.finalized_leaves:
indices = leaf.sample_indices
if sample_weight is None:
median_res = np.median(y_true[indices]
- raw_predictions[indices])
else:
median_res = _weighted_percentile(y_true[indices]
- raw_predictions[indices],
sample_weight=sample_weight,
percentile=50)
leaf.value = grower.shrinkage * median_res
# Note that the regularization is ignored here
class Poisson(BaseLoss):
"""Poisson deviance loss with log-link, for regression.
For a given sample x_i, Poisson deviance loss is defined as::
loss(x_i) = y_true_i * log(y_true_i/exp(raw_pred_i))
- y_true_i + exp(raw_pred_i)
This actually computes half the Poisson deviance to simplify
the computation of the gradients.
"""
def __init__(self, sample_weight):
super().__init__(hessians_are_constant=False)
inverse_link_function = staticmethod(np.exp)
def pointwise_loss(self, y_true, raw_predictions):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
# TODO: For speed, we could remove the constant xlogy(y_true, y_true)
# Advantage of this form: minimum of zero at raw_predictions = y_true.
loss = (xlogy(y_true, y_true) - y_true * (raw_predictions + 1)
+ np.exp(raw_predictions))
return loss
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
y_pred = np.average(y_train, weights=sample_weight)
eps = np.finfo(y_train.dtype).eps
y_pred = np.clip(y_pred, eps, None)
return np.log(y_pred)
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
gradients = gradients.reshape(-1)
hessians = hessians.reshape(-1)
_update_gradients_hessians_poisson(gradients, hessians,
y_true, raw_predictions,
sample_weight)
class BinaryCrossEntropy(BaseLoss):
"""Binary cross-entropy loss, for binary classification.
For a given sample x_i, the binary cross-entropy loss is defined as the
negative log-likelihood of the model which can be expressed as::
loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i
See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,
section 4.4.1 (about logistic regression).
"""
def __init__(self, sample_weight):
super().__init__(hessians_are_constant=False)
inverse_link_function = staticmethod(expit)
def pointwise_loss(self, y_true, raw_predictions):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
# logaddexp(0, x) = log(1 + exp(x))
loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions
return loss
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
if prediction_dim > 2:
raise ValueError(
"loss='binary_crossentropy' is not defined for multiclass"
" classification with n_classes=%d, use"
" loss='categorical_crossentropy' instead" % prediction_dim)
proba_positive_class = np.average(y_train, weights=sample_weight)
eps = np.finfo(y_train.dtype).eps
proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps)
# log(x / (1 - x)) is the inverse of the sigmoid (the logit), i.e. the link
# function of the Binomial model.
return np.log(proba_positive_class / (1 - proba_positive_class))
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
gradients = gradients.reshape(-1)
hessians = hessians.reshape(-1)
_update_gradients_hessians_binary_crossentropy(
gradients, hessians, y_true, raw_predictions, sample_weight)
def predict_proba(self, raw_predictions):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
proba = np.empty((raw_predictions.shape[0], 2), dtype=Y_DTYPE)
proba[:, 1] = expit(raw_predictions)
proba[:, 0] = 1 - proba[:, 1]
return proba
class CategoricalCrossEntropy(BaseLoss):
"""Categorical cross-entropy loss, for multiclass classification.
For a given sample x_i, the categorical cross-entropy loss is defined as
the negative log-likelihood of the model and generalizes the binary
cross-entropy to more than 2 classes.
"""
def __init__(self, sample_weight):
super().__init__(hessians_are_constant=False)
def pointwise_loss(self, y_true, raw_predictions):
one_hot_true = np.zeros_like(raw_predictions)
prediction_dim = raw_predictions.shape[0]
for k in range(prediction_dim):
one_hot_true[k, :] = (y_true == k)
loss = (logsumexp(raw_predictions, axis=0) -
(one_hot_true * raw_predictions).sum(axis=0))
return loss
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
init_value = np.zeros(shape=(prediction_dim, 1), dtype=Y_DTYPE)
eps = np.finfo(y_train.dtype).eps
for k in range(prediction_dim):
proba_kth_class = np.average(y_train == k,
weights=sample_weight)
proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps)
init_value[k, :] += np.log(proba_kth_class)
return init_value
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
_update_gradients_hessians_categorical_crossentropy(
gradients, hessians, y_true, raw_predictions, sample_weight)
def predict_proba(self, raw_predictions):
# TODO: This could be done in parallel
# compute softmax (using exp(log(softmax)))
proba = np.exp(raw_predictions -
logsumexp(raw_predictions, axis=0)[np.newaxis, :])
return proba.T
_LOSSES = {
'least_squares': LeastSquares,
'least_absolute_deviation': LeastAbsoluteDeviation,
'binary_crossentropy': BinaryCrossEntropy,
'categorical_crossentropy': CategoricalCrossEntropy,
'poisson': Poisson,
}
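
A short usage sketch of the loss API defined above, showing one gradient update for the half least-squares loss. The private import paths are the same ones used by the tests further below; this is illustrative only and not part of the module:

import numpy as np
from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE
from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES

y_train = np.array([0.0, 1.0, 2.0, 3.0], dtype=Y_DTYPE)
loss = _LOSSES['least_squares'](sample_weight=None)

# Baseline prediction (the model before the first tree): the target mean.
baseline = loss.get_baseline_prediction(y_train, None, prediction_dim=1)
raw_predictions = np.full((1, y_train.shape[0]), baseline, dtype=Y_DTYPE)

# Allocate the gradient/hessian buffers and fill them in-place.
gradients, hessians = loss.init_gradients_and_hessians(
    n_samples=y_train.shape[0], prediction_dim=1, sample_weight=None)
loss.update_gradients_and_hessians(gradients, hessians, y_train,
                                   raw_predictions, None)

# For the half least-squares loss the gradient is raw_pred - y_true.
assert np.allclose(gradients[0], raw_predictions[0] - y_train)
print(loss(y_train, raw_predictions, None))  # weighted average loss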

View file

@@ -0,0 +1,86 @@
"""
This module contains the TreePredictor class which is used for prediction.
"""
# Author: Nicolas Hug
import numpy as np
from .common import Y_DTYPE
from ._predictor import _predict_from_numeric_data
from ._predictor import _predict_from_binned_data
from ._predictor import _compute_partial_dependence
class TreePredictor:
"""Tree class used for predictions.
Parameters
----------
nodes : ndarray of PREDICTOR_RECORD_DTYPE
The nodes of the tree.
"""
def __init__(self, nodes):
self.nodes = nodes
def get_n_leaf_nodes(self):
"""Return number of leaves."""
return int(self.nodes['is_leaf'].sum())
def get_max_depth(self):
"""Return maximum depth among all leaves."""
return int(self.nodes['depth'].max())
def predict(self, X):
"""Predict raw values for non-binned data.
Parameters
----------
X : ndarray, shape (n_samples, n_features)
The input samples.
Returns
-------
y : ndarray, shape (n_samples,)
The raw predicted values.
"""
out = np.empty(X.shape[0], dtype=Y_DTYPE)
_predict_from_numeric_data(self.nodes, X, out)
return out
def predict_binned(self, X, missing_values_bin_idx):
"""Predict raw values for binned data.
Parameters
----------
X : ndarray, shape (n_samples, n_features)
The input samples.
missing_values_bin_idx : uint8
Index of the bin that is used for missing values. This is the
index of the last bin and is always equal to max_bins (as passed
to the GBDT classes), or equivalently to n_bins - 1.
Returns
-------
y : ndarray, shape (n_samples,)
The raw predicted values.
"""
out = np.empty(X.shape[0], dtype=Y_DTYPE)
_predict_from_binned_data(self.nodes, X, missing_values_bin_idx, out)
return out
def compute_partial_dependence(self, grid, target_features, out):
"""Fast partial dependence computation.
Parameters
----------
grid : ndarray, shape (n_samples, n_target_features)
The grid points on which the partial dependence should be
evaluated.
target_features : ndarray, shape (n_target_features)
The set of target features for which the partial dependence
should be evaluated.
out : ndarray, shape (n_samples)
The value of the partial dependence function on each grid
point.
"""
_compute_partial_dependence(self.nodes, grid, target_features, out)
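
As a concrete illustration, a hand-built stump can be fed to TreePredictor directly. The field names below are the ones filled in by _fill_predictor_node_array in the grower module above; the module path for TreePredictor and the assumption that PREDICTOR_RECORD_DTYPE is exposed by the common module are inferred from the other private imports in this commit, so treat this as a sketch rather than documented API:

import numpy as np
from sklearn.ensemble._hist_gradient_boosting.common import (
    PREDICTOR_RECORD_DTYPE, X_DTYPE)  # assumed to live in .common
from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor

# Root at index 0 splits feature 0 at threshold 0.5; its children are the
# leaves at indices 1 (left, value -1) and 2 (right, value +1).
nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE)
nodes[0]['feature_idx'] = 0
nodes[0]['threshold'] = 0.5
nodes[0]['left'], nodes[0]['right'] = 1, 2
nodes[1]['is_leaf'], nodes[1]['value'] = True, -1.0
nodes[2]['is_leaf'], nodes[2]['value'] = True, 1.0

predictor = TreePredictor(nodes)
X = np.array([[0.0], [1.0]], dtype=X_DTYPE)
print(predictor.predict(X))          # expected: [-1.  1.]
print(predictor.get_n_leaf_nodes())  # 2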

View file

@@ -0,0 +1,314 @@
import numpy as np
from numpy.testing import assert_array_equal, assert_allclose
import pytest
from sklearn.ensemble._hist_gradient_boosting.binning import (
_BinMapper,
_find_binning_thresholds as _find_binning_thresholds_orig,
_map_to_bins
)
from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import ALMOST_INF
DATA = np.random.RandomState(42).normal(
loc=[0, 10], scale=[1, 0.01], size=(int(1e6), 2)
).astype(X_DTYPE)
def _find_binning_thresholds(data, max_bins=255, subsample=int(2e5),
random_state=None):
# Just a redef to avoid having to pass arguments all the time (as the
# function is private we don't use default values for parameters)
return _find_binning_thresholds_orig(data, max_bins, subsample,
random_state)
def test_find_binning_thresholds_regular_data():
data = np.linspace(0, 10, 1001).reshape(-1, 1)
bin_thresholds = _find_binning_thresholds(data, max_bins=10)
assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9])
assert len(bin_thresholds) == 1
bin_thresholds = _find_binning_thresholds(data, max_bins=5)
assert_allclose(bin_thresholds[0], [2, 4, 6, 8])
assert len(bin_thresholds) == 1
def test_find_binning_thresholds_small_regular_data():
data = np.linspace(0, 10, 11).reshape(-1, 1)
bin_thresholds = _find_binning_thresholds(data, max_bins=5)
assert_allclose(bin_thresholds[0], [2, 4, 6, 8])
bin_thresholds = _find_binning_thresholds(data, max_bins=10)
assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9])
bin_thresholds = _find_binning_thresholds(data, max_bins=11)
assert_allclose(bin_thresholds[0], np.arange(10) + .5)
bin_thresholds = _find_binning_thresholds(data, max_bins=255)
assert_allclose(bin_thresholds[0], np.arange(10) + .5)
def test_find_binning_thresholds_random_data():
bin_thresholds = _find_binning_thresholds(DATA, max_bins=255,
random_state=0)
assert len(bin_thresholds) == 2
for i in range(len(bin_thresholds)):
assert bin_thresholds[i].shape == (254,) # 255 - 1
assert bin_thresholds[i].dtype == DATA.dtype
assert_allclose(bin_thresholds[0][[64, 128, 192]],
np.array([-0.7, 0.0, 0.7]), atol=1e-1)
assert_allclose(bin_thresholds[1][[64, 128, 192]],
np.array([9.99, 10.00, 10.01]), atol=1e-2)
def test_find_binning_thresholds_low_n_bins():
bin_thresholds = _find_binning_thresholds(DATA, max_bins=128,
random_state=0)
assert len(bin_thresholds) == 2
for i in range(len(bin_thresholds)):
assert bin_thresholds[i].shape == (127,) # 128 - 1
assert bin_thresholds[i].dtype == DATA.dtype
@pytest.mark.parametrize('n_bins', (2, 257))
def test_invalid_n_bins(n_bins):
err_msg = (
'n_bins={} should be no smaller than 3 and no larger than 256'
.format(n_bins))
with pytest.raises(ValueError, match=err_msg):
_BinMapper(n_bins=n_bins).fit(DATA)
def test_bin_mapper_n_features_transform():
mapper = _BinMapper(n_bins=42, random_state=42).fit(DATA)
err_msg = 'This estimator was fitted with 2 features but 4 got passed'
with pytest.raises(ValueError, match=err_msg):
mapper.transform(np.repeat(DATA, 2, axis=1))
@pytest.mark.parametrize('max_bins', [16, 128, 255])
def test_map_to_bins(max_bins):
bin_thresholds = _find_binning_thresholds(DATA, max_bins=max_bins,
random_state=0)
binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order='F')
last_bin_idx = max_bins
_map_to_bins(DATA, bin_thresholds, last_bin_idx, binned)
assert binned.shape == DATA.shape
assert binned.dtype == np.uint8
assert binned.flags.f_contiguous
min_indices = DATA.argmin(axis=0)
max_indices = DATA.argmax(axis=0)
for feature_idx, min_idx in enumerate(min_indices):
assert binned[min_idx, feature_idx] == 0
for feature_idx, max_idx in enumerate(max_indices):
assert binned[max_idx, feature_idx] == max_bins - 1
@pytest.mark.parametrize("max_bins", [5, 10, 42])
def test_bin_mapper_random_data(max_bins):
n_samples, n_features = DATA.shape
expected_count_per_bin = n_samples // max_bins
tol = int(0.05 * expected_count_per_bin)
# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
mapper = _BinMapper(n_bins=n_bins, random_state=42).fit(DATA)
binned = mapper.transform(DATA)
assert binned.shape == (n_samples, n_features)
assert binned.dtype == np.uint8
assert_array_equal(binned.min(axis=0), np.array([0, 0]))
assert_array_equal(binned.max(axis=0),
np.array([max_bins - 1, max_bins - 1]))
assert len(mapper.bin_thresholds_) == n_features
for bin_thresholds_feature in mapper.bin_thresholds_:
assert bin_thresholds_feature.shape == (max_bins - 1,)
assert bin_thresholds_feature.dtype == DATA.dtype
assert np.all(mapper.n_bins_non_missing_ == max_bins)
# Check that the binned data is approximately balanced across bins.
for feature_idx in range(n_features):
for bin_idx in range(max_bins):
count = (binned[:, feature_idx] == bin_idx).sum()
assert abs(count - expected_count_per_bin) < tol
@pytest.mark.parametrize("n_samples, max_bins", [
(5, 5),
(5, 10),
(5, 11),
(42, 255)
])
def test_bin_mapper_small_random_data(n_samples, max_bins):
data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1)
assert len(np.unique(data)) == n_samples
# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
mapper = _BinMapper(n_bins=n_bins, random_state=42)
binned = mapper.fit_transform(data)
assert binned.shape == data.shape
assert binned.dtype == np.uint8
assert_array_equal(binned.ravel()[np.argsort(data.ravel())],
np.arange(n_samples))
@pytest.mark.parametrize("max_bins, n_distinct, multiplier", [
(5, 5, 1),
(5, 5, 3),
(255, 12, 42),
])
def test_bin_mapper_identity_repeated_values(max_bins, n_distinct, multiplier):
data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1)
# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
binned = _BinMapper(n_bins=n_bins).fit_transform(data)
assert_array_equal(data, binned)
@pytest.mark.parametrize('n_distinct', [2, 7, 42])
def test_bin_mapper_repeated_values_invariance(n_distinct):
rng = np.random.RandomState(42)
distinct_values = rng.normal(size=n_distinct)
assert len(np.unique(distinct_values)) == n_distinct
repeated_indices = rng.randint(low=0, high=n_distinct, size=1000)
data = distinct_values[repeated_indices]
rng.shuffle(data)
assert_array_equal(np.unique(data), np.sort(distinct_values))
data = data.reshape(-1, 1)
mapper_1 = _BinMapper(n_bins=n_distinct + 1)
binned_1 = mapper_1.fit_transform(data)
assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct))
# Adding more bins to the mapper yields the same results (same thresholds)
mapper_2 = _BinMapper(n_bins=min(256, n_distinct * 3) + 1)
binned_2 = mapper_2.fit_transform(data)
assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0])
assert_array_equal(binned_1, binned_2)
@pytest.mark.parametrize("max_bins, scale, offset", [
(3, 2, -1),
(42, 1, 0),
(255, 0.3, 42),
])
def test_bin_mapper_identity_small(max_bins, scale, offset):
data = np.arange(max_bins).reshape(-1, 1) * scale + offset
# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
binned = _BinMapper(n_bins=n_bins).fit_transform(data)
assert_array_equal(binned, np.arange(max_bins).reshape(-1, 1))
@pytest.mark.parametrize('max_bins_small, max_bins_large', [
(2, 2),
(3, 3),
(4, 4),
(42, 42),
(255, 255),
(5, 17),
(42, 255),
])
def test_bin_mapper_idempotence(max_bins_small, max_bins_large):
assert max_bins_large >= max_bins_small
data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1)
mapper_small = _BinMapper(n_bins=max_bins_small + 1)
mapper_large = _BinMapper(n_bins=max_bins_large + 1)
binned_small = mapper_small.fit_transform(data)
binned_large = mapper_large.fit_transform(binned_small)
assert_array_equal(binned_small, binned_large)
@pytest.mark.parametrize('n_bins', [10, 100, 256])
@pytest.mark.parametrize('diff', [-5, 0, 5])
def test_n_bins_non_missing(n_bins, diff):
# Check that n_bins_non_missing is n_unique_values when
# there are not a lot of unique values, else n_bins - 1.
n_unique_values = n_bins + diff
X = list(range(n_unique_values)) * 2
X = np.array(X).reshape(-1, 1)
mapper = _BinMapper(n_bins=n_bins).fit(X)
assert np.all(mapper.n_bins_non_missing_ == min(
n_bins - 1, n_unique_values))
def test_subsample():
# Make sure bin thresholds are different when applying subsampling
mapper_no_subsample = _BinMapper(subsample=None, random_state=0).fit(DATA)
mapper_subsample = _BinMapper(subsample=256, random_state=0).fit(DATA)
for feature in range(DATA.shape[1]):
assert not np.allclose(mapper_no_subsample.bin_thresholds_[feature],
mapper_subsample.bin_thresholds_[feature],
rtol=1e-4)
@pytest.mark.parametrize(
'n_bins, n_bins_non_missing, X_trans_expected', [
(256, [4, 2, 2], [[0, 0, 0], # 255 <=> missing value
[255, 255, 0],
[1, 0, 0],
[255, 1, 1],
[2, 1, 1],
[3, 0, 0]]),
(3, [2, 2, 2], [[0, 0, 0], # 2 <=> missing value
[2, 2, 0],
[0, 0, 0],
[2, 1, 1],
[1, 1, 1],
[1, 0, 0]])])
def test_missing_values_support(n_bins, n_bins_non_missing, X_trans_expected):
# check for missing values: make sure nans are mapped to the last bin
# and that the _BinMapper attributes are correct
X = [[1, 1, 0],
[np.nan, np.nan, 0],
[2, 1, 0],
[np.nan, 2, 1],
[3, 2, 1],
[4, 1, 0]]
X = np.array(X)
mapper = _BinMapper(n_bins=n_bins)
mapper.fit(X)
assert_array_equal(mapper.n_bins_non_missing_, n_bins_non_missing)
for feature_idx in range(X.shape[1]):
assert len(mapper.bin_thresholds_[feature_idx]) == \
n_bins_non_missing[feature_idx] - 1
assert mapper.missing_values_bin_idx_ == n_bins - 1
X_trans = mapper.transform(X)
assert_array_equal(X_trans, X_trans_expected)
def test_infinite_values():
# Make sure infinite values are properly handled.
bin_mapper = _BinMapper()
X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1)
bin_mapper.fit(X)
assert_allclose(bin_mapper.bin_thresholds_[0], [-np.inf, .5, ALMOST_INF])
assert bin_mapper.n_bins_non_missing_ == [4]
expected_binned_X = np.array([0, 1, 2, 3]).reshape(-1, 1)
assert_array_equal(bin_mapper.transform(X), expected_binned_X)
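
To summarise the convention exercised by these tests: n_bins always includes one bin reserved for missing values, so with n_bins=5 at most 4 bins hold non-missing values and NaNs are mapped to bin n_bins - 1. A small sketch using the same private _BinMapper imported above (illustrative only):

import numpy as np
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper

X = np.array([[1.0], [2.0], [np.nan], [3.0], [4.0]])
mapper = _BinMapper(n_bins=5).fit(X)

print(mapper.n_bins_non_missing_)      # [4]
print(mapper.missing_values_bin_idx_)  # 4, i.e. n_bins - 1
print(mapper.transform(X).ravel())     # [0 1 4 2 3]: nan goes to the last bin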

View file

@@ -0,0 +1,223 @@
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification, make_regression
import numpy as np
import pytest
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.utils import (
get_equivalent_estimator)
@pytest.mark.parametrize('seed', range(5))
@pytest.mark.parametrize('min_samples_leaf', (1, 20))
@pytest.mark.parametrize('n_samples, max_leaf_nodes', [
(255, 4096),
(1000, 8),
])
def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
max_leaf_nodes):
# Make sure sklearn has the same predictions as lightgbm for easy targets.
#
# In particular when the size of the trees are bound and the number of
# samples is large enough, the structure of the prediction trees found by
# LightGBM and sklearn should be exactly identical.
#
# Notes:
# - Several candidate splits may have equal gains when the number of
# samples in a node is low (and because of float errors). Therefore the
# predictions on the test set might differ if the structure of the tree
# is not exactly the same. To avoid this issue we only compare the
# predictions on the test set when the number of samples is large enough
# and max_leaf_nodes is low enough.
# - To ignore discrepancies caused by small differences in the binning
#   strategy, data is pre-binned if n_samples > 255.
# - We don't check the least_absolute_deviation loss here. This is because
# LightGBM's computation of the median (used for the initial value of
# raw_prediction) is a bit off (they'll e.g. return midpoints when there
# is no need to). Since these tests only run 1 iteration, the
# discrepancy between the initial values leads to biggish differences in
# the predictions. These differences are much smaller with more
# iterations.
pytest.importorskip("lightgbm")
rng = np.random.RandomState(seed=seed)
n_samples = n_samples
max_iter = 1
max_bins = 255
X, y = make_regression(n_samples=n_samples, n_features=5,
n_informative=5, random_state=0)
if n_samples > 255:
# bin data and convert it to float32 so that the estimator doesn't
# treat it as pre-binned
X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
est_sklearn = HistGradientBoostingRegressor(
max_iter=max_iter,
max_bins=max_bins,
learning_rate=1,
early_stopping=False,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes)
est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')
est_lightgbm.fit(X_train, y_train)
est_sklearn.fit(X_train, y_train)
# We need X to be treated as numerical data, not pre-binned data.
X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)
pred_lightgbm = est_lightgbm.predict(X_train)
pred_sklearn = est_sklearn.predict(X_train)
# less than 1% of the predictions are different up to the 3rd decimal
assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-3) < .011
if max_leaf_nodes < 10 and n_samples >= 1000:
pred_lightgbm = est_lightgbm.predict(X_test)
pred_sklearn = est_sklearn.predict(X_test)
# less than 1% of the predictions are different up to the 4th decimal
assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-4) < .01
@pytest.mark.parametrize('seed', range(5))
@pytest.mark.parametrize('min_samples_leaf', (1, 20))
@pytest.mark.parametrize('n_samples, max_leaf_nodes', [
(255, 4096),
(1000, 8),
])
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
max_leaf_nodes):
# Same as test_same_predictions_regression but for classification
pytest.importorskip("lightgbm")
rng = np.random.RandomState(seed=seed)
n_samples = n_samples
max_iter = 1
max_bins = 255
X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
n_informative=5, n_redundant=0, random_state=0)
if n_samples > 255:
# bin data and convert it to float32 so that the estimator doesn't
# treat it as pre-binned
X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
est_sklearn = HistGradientBoostingClassifier(
loss='binary_crossentropy',
max_iter=max_iter,
max_bins=max_bins,
learning_rate=1,
early_stopping=False,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes)
est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')
est_lightgbm.fit(X_train, y_train)
est_sklearn.fit(X_train, y_train)
# We need X to be treated as numerical data, not pre-binned data.
X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)
pred_lightgbm = est_lightgbm.predict(X_train)
pred_sklearn = est_sklearn.predict(X_train)
assert np.mean(pred_sklearn == pred_lightgbm) > .89
acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
acc_sklearn = accuracy_score(y_train, pred_sklearn)
np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)
if max_leaf_nodes < 10 and n_samples >= 1000:
pred_lightgbm = est_lightgbm.predict(X_test)
pred_sklearn = est_sklearn.predict(X_test)
assert np.mean(pred_sklearn == pred_lightgbm) > .89
acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
acc_sklearn = accuracy_score(y_test, pred_sklearn)
np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
@pytest.mark.parametrize('seed', range(5))
@pytest.mark.parametrize('min_samples_leaf', (1, 20))
@pytest.mark.parametrize('n_samples, max_leaf_nodes', [
(255, 4096),
(10000, 8),
])
def test_same_predictions_multiclass_classification(
seed, min_samples_leaf, n_samples, max_leaf_nodes):
# Same as test_same_predictions_regression but for multiclass classification
pytest.importorskip("lightgbm")
rng = np.random.RandomState(seed=seed)
n_samples = n_samples
max_iter = 1
max_bins = 255
lr = 1
X, y = make_classification(n_samples=n_samples, n_classes=3, n_features=5,
n_informative=5, n_redundant=0,
n_clusters_per_class=1, random_state=0)
if n_samples > 255:
# bin data and convert it to float32 so that the estimator doesn't
# treat it as pre-binned
X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
est_sklearn = HistGradientBoostingClassifier(
loss='categorical_crossentropy',
max_iter=max_iter,
max_bins=max_bins,
learning_rate=lr,
early_stopping=False,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes)
est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')
est_lightgbm.fit(X_train, y_train)
est_sklearn.fit(X_train, y_train)
# We need X to be treated as numerical data, not pre-binned data.
X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)
pred_lightgbm = est_lightgbm.predict(X_train)
pred_sklearn = est_sklearn.predict(X_train)
assert np.mean(pred_sklearn == pred_lightgbm) > .89
proba_lightgbm = est_lightgbm.predict_proba(X_train)
proba_sklearn = est_sklearn.predict_proba(X_train)
# assert more than 75% of the predicted probabilities are the same up to
# the second decimal
assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75
acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
acc_sklearn = accuracy_score(y_train, pred_sklearn)
np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
if max_leaf_nodes < 10 and n_samples >= 1000:
pred_lightgbm = est_lightgbm.predict(X_test)
pred_sklearn = est_sklearn.predict(X_test)
assert np.mean(pred_sklearn == pred_lightgbm) > .89
proba_lightgbm = est_lightgbm.predict_proba(X_train)
proba_sklearn = est_sklearn.predict_proba(X_train)
# assert more than 75% of the predicted probabilities are the same up
# to the second decimal
assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75
acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
acc_sklearn = accuracy_score(y_test, pred_sklearn)
np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
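
For reference, a minimal fit/predict sketch of the estimators exercised above and in the test file below, using only parameters that appear in these tests (explicit experimental import, built-in early stopping on a small validation split). This is illustrative usage, not part of the test suite:

import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=500, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

gb = HistGradientBoostingRegressor(max_iter=100,
                                   early_stopping=True,
                                   validation_fraction=0.1,
                                   n_iter_no_change=5,
                                   random_state=0)
gb.fit(X_train, y_train)
# n_iter_ may be smaller than max_iter if early stopping kicked in.
print(gb.n_iter_, gb.score(X_test, y_test))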

View file

@@ -0,0 +1,746 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from sklearn.datasets import make_classification, make_regression
from sklearn.datasets import make_low_rank_matrix
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.base import clone, BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_poisson_deviance
from sklearn.dummy import DummyRegressor
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES
from sklearn.ensemble._hist_gradient_boosting.loss import LeastSquares
from sklearn.ensemble._hist_gradient_boosting.loss import BinaryCrossEntropy
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.utils import shuffle
X_classification, y_classification = make_classification(random_state=0)
X_regression, y_regression = make_regression(random_state=0)
def _make_dumb_dataset(n_samples):
"""Make a dumb dataset to test early stopping."""
rng = np.random.RandomState(42)
X_dumb = rng.randn(n_samples, 1)
y_dumb = (X_dumb[:, 0] > 0).astype('int64')
return X_dumb, y_dumb
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
@pytest.mark.parametrize(
'params, err_msg',
[({'loss': 'blah'}, 'Loss blah is not supported for'),
({'learning_rate': 0}, 'learning_rate=0 must be strictly positive'),
({'learning_rate': -1}, 'learning_rate=-1 must be strictly positive'),
({'max_iter': 0}, 'max_iter=0 must not be smaller than 1'),
({'max_leaf_nodes': 0}, 'max_leaf_nodes=0 should not be smaller than 2'),
({'max_leaf_nodes': 1}, 'max_leaf_nodes=1 should not be smaller than 2'),
({'max_depth': 0}, 'max_depth=0 should not be smaller than 1'),
({'min_samples_leaf': 0}, 'min_samples_leaf=0 should not be smaller'),
({'l2_regularization': -1}, 'l2_regularization=-1 must be positive'),
({'max_bins': 1}, 'max_bins=1 should be no smaller than 2 and no larger'),
({'max_bins': 256}, 'max_bins=256 should be no smaller than 2 and no'),
({'n_iter_no_change': -1}, 'n_iter_no_change=-1 must be positive'),
({'validation_fraction': -1}, 'validation_fraction=-1 must be strictly'),
({'validation_fraction': 0}, 'validation_fraction=0 must be strictly'),
({'tol': -1}, 'tol=-1 must not be smaller than 0')]
)
def test_init_parameters_validation(GradientBoosting, X, y, params, err_msg):
with pytest.raises(ValueError, match=err_msg):
GradientBoosting(**params).fit(X, y)
def test_invalid_classification_loss():
binary_clf = HistGradientBoostingClassifier(loss="binary_crossentropy")
err_msg = ("loss='binary_crossentropy' is not defined for multiclass "
"classification with n_classes=3, use "
"loss='categorical_crossentropy' instead")
with pytest.raises(ValueError, match=err_msg):
binary_clf.fit(np.zeros(shape=(3, 2)), np.arange(3))
@pytest.mark.parametrize(
'scoring, validation_fraction, early_stopping, n_iter_no_change, tol', [
('neg_mean_squared_error', .1, True, 5, 1e-7), # use scorer
('neg_mean_squared_error', None, True, 5, 1e-1), # use scorer on train
(None, .1, True, 5, 1e-7), # same with default scorer
(None, None, True, 5, 1e-1),
('loss', .1, True, 5, 1e-7), # use loss
('loss', None, True, 5, 1e-1), # use loss on training data
(None, None, False, 5, None), # no early stopping
])
def test_early_stopping_regression(scoring, validation_fraction,
early_stopping, n_iter_no_change, tol):
max_iter = 200
X, y = make_regression(n_samples=50, random_state=0)
gb = HistGradientBoostingRegressor(
verbose=1, # just for coverage
min_samples_leaf=5, # easier to overfit fast
scoring=scoring,
tol=tol,
early_stopping=early_stopping,
validation_fraction=validation_fraction,
max_iter=max_iter,
n_iter_no_change=n_iter_no_change,
random_state=0
)
gb.fit(X, y)
if early_stopping:
assert n_iter_no_change <= gb.n_iter_ < max_iter
else:
assert gb.n_iter_ == max_iter
@pytest.mark.parametrize('data', (
make_classification(n_samples=30, random_state=0),
make_classification(n_samples=30, n_classes=3, n_clusters_per_class=1,
random_state=0)
))
@pytest.mark.parametrize(
'scoring, validation_fraction, early_stopping, n_iter_no_change, tol', [
('accuracy', .1, True, 5, 1e-7), # use scorer
('accuracy', None, True, 5, 1e-1), # use scorer on training data
(None, .1, True, 5, 1e-7), # same with default scorer
(None, None, True, 5, 1e-1),
('loss', .1, True, 5, 1e-7), # use loss
('loss', None, True, 5, 1e-1), # use loss on training data
(None, None, False, 5, None), # no early stopping
])
def test_early_stopping_classification(data, scoring, validation_fraction,
early_stopping, n_iter_no_change, tol):
max_iter = 50
X, y = data
gb = HistGradientBoostingClassifier(
verbose=1, # just for coverage
min_samples_leaf=5, # easier to overfit fast
scoring=scoring,
tol=tol,
early_stopping=early_stopping,
validation_fraction=validation_fraction,
max_iter=max_iter,
n_iter_no_change=n_iter_no_change,
random_state=0
)
gb.fit(X, y)
if early_stopping is True:
assert n_iter_no_change <= gb.n_iter_ < max_iter
else:
assert gb.n_iter_ == max_iter
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, *_make_dumb_dataset(10000)),
(HistGradientBoostingClassifier, *_make_dumb_dataset(10001)),
(HistGradientBoostingRegressor, *_make_dumb_dataset(10000)),
(HistGradientBoostingRegressor, *_make_dumb_dataset(10001))
])
def test_early_stopping_default(GradientBoosting, X, y):
# Test that early stopping is enabled by default if and only if there
# are more than 10000 samples
gb = GradientBoosting(max_iter=10, n_iter_no_change=2, tol=1e-1)
gb.fit(X, y)
if X.shape[0] > 10000:
assert gb.n_iter_ < gb.max_iter
else:
assert gb.n_iter_ == gb.max_iter
@pytest.mark.parametrize(
'scores, n_iter_no_change, tol, stopping',
[
([], 1, 0.001, False), # not enough iterations
([1, 1, 1], 5, 0.001, False), # not enough iterations
([1, 1, 1, 1, 1], 5, 0.001, False), # not enough iterations
([1, 2, 3, 4, 5, 6], 5, 0.001, False), # significant improvement
([1, 2, 3, 4, 5, 6], 5, 0., False), # significant improvement
([1, 2, 3, 4, 5, 6], 5, 0.999, False), # significant improvement
([1, 2, 3, 4, 5, 6], 5, 5 - 1e-5, False), # significant improvement
([1] * 6, 5, 0., True), # no significant improvement
([1] * 6, 5, 0.001, True), # no significant improvement
([1] * 6, 5, 5, True), # no significant improvement
]
)
def test_should_stop(scores, n_iter_no_change, tol, stopping):
gbdt = HistGradientBoostingClassifier(
n_iter_no_change=n_iter_no_change, tol=tol
)
assert gbdt._should_stop(scores) == stopping
def test_least_absolute_deviation():
# For coverage only.
X, y = make_regression(n_samples=500, random_state=0)
gbdt = HistGradientBoostingRegressor(loss='least_absolute_deviation',
random_state=0)
gbdt.fit(X, y)
assert gbdt.score(X, y) > .9
@pytest.mark.parametrize('y', [([1., -2., 0.]), ([0., 0., 0.])])
def test_poisson_y_positive(y):
# Test that ValueError is raised if either one y_i < 0 or sum(y_i) <= 0.
err_msg = r"loss='poisson' requires non-negative y and sum\(y\) > 0."
gbdt = HistGradientBoostingRegressor(loss='poisson', random_state=0)
with pytest.raises(ValueError, match=err_msg):
gbdt.fit(np.zeros(shape=(len(y), 1)), y)
def test_poisson():
# For Poisson distributed target, Poisson loss should give better results
# than least squares measured in Poisson deviance as metric.
rng = np.random.RandomState(42)
n_train, n_test, n_features = 500, 100, 100
X = make_low_rank_matrix(n_samples=n_train+n_test, n_features=n_features,
random_state=rng)
# We create a log-linear Poisson model and downscale coef as it will get
# exponentiated.
coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0)
y = rng.poisson(lam=np.exp(X @ coef))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test,
random_state=rng)
gbdt_pois = HistGradientBoostingRegressor(loss='poisson', random_state=rng)
gbdt_ls = HistGradientBoostingRegressor(loss='least_squares',
random_state=rng)
gbdt_pois.fit(X_train, y_train)
gbdt_ls.fit(X_train, y_train)
dummy = DummyRegressor(strategy="mean").fit(X_train, y_train)
for X, y in [(X_train, y_train), (X_test, y_test)]:
metric_pois = mean_poisson_deviance(y, gbdt_pois.predict(X))
# least_squares might produce non-positive predictions => clip
metric_ls = mean_poisson_deviance(y, np.clip(gbdt_ls.predict(X), 1e-15,
None))
metric_dummy = mean_poisson_deviance(y, dummy.predict(X))
assert metric_pois < metric_ls
assert metric_pois < metric_dummy
def test_binning_train_validation_are_separated():
# Make sure training and validation data are binned separately.
# See issue 13926
rng = np.random.RandomState(0)
validation_fraction = .2
gb = HistGradientBoostingClassifier(
early_stopping=True,
validation_fraction=validation_fraction,
random_state=rng
)
gb.fit(X_classification, y_classification)
mapper_training_data = gb.bin_mapper_
# Note that since the data is small there is no subsampling and the
# random_state doesn't matter
mapper_whole_data = _BinMapper(random_state=0)
mapper_whole_data.fit(X_classification)
n_samples = X_classification.shape[0]
assert np.all(mapper_training_data.n_bins_non_missing_ ==
int((1 - validation_fraction) * n_samples))
assert np.all(mapper_training_data.n_bins_non_missing_ !=
mapper_whole_data.n_bins_non_missing_)
def test_missing_values_trivial():
# sanity check for missing values support. With only one feature and
# y == isnan(X), the gbdt is supposed to reach perfect accuracy on the
# training set.
n_samples = 100
n_features = 1
rng = np.random.RandomState(0)
X = rng.normal(size=(n_samples, n_features))
mask = rng.binomial(1, .5, size=X.shape).astype(bool)
X[mask] = np.nan
y = mask.ravel()
gb = HistGradientBoostingClassifier()
gb.fit(X, y)
assert gb.score(X, y) == pytest.approx(1)
@pytest.mark.parametrize('problem', ('classification', 'regression'))
@pytest.mark.parametrize(
'missing_proportion, expected_min_score_classification, '
'expected_min_score_regression', [
(.1, .97, .89),
(.2, .93, .81),
(.5, .79, .52)])
def test_missing_values_resilience(problem, missing_proportion,
expected_min_score_classification,
expected_min_score_regression):
# Make sure the estimators can deal with missing values and still yield
# decent predictions
rng = np.random.RandomState(0)
n_samples = 1000
n_features = 2
if problem == 'regression':
X, y = make_regression(n_samples=n_samples, n_features=n_features,
n_informative=n_features, random_state=rng)
gb = HistGradientBoostingRegressor()
expected_min_score = expected_min_score_regression
else:
X, y = make_classification(n_samples=n_samples, n_features=n_features,
n_informative=n_features, n_redundant=0,
n_repeated=0, random_state=rng)
gb = HistGradientBoostingClassifier()
expected_min_score = expected_min_score_classification
mask = rng.binomial(1, missing_proportion, size=X.shape).astype(bool)
X[mask] = np.nan
gb.fit(X, y)
assert gb.score(X, y) > expected_min_score
@pytest.mark.parametrize('data', [
make_classification(random_state=0, n_classes=2),
make_classification(random_state=0, n_classes=3, n_informative=3)
], ids=['binary_crossentropy', 'categorical_crossentropy'])
def test_zero_division_hessians(data):
# non regression test for issue #14018
# make sure we avoid zero division errors when computing the leaves values.
# If the learning rate is too high, the raw predictions are bad and will
# saturate the softmax (or sigmoid in binary classif). This leads to
# probabilities being exactly 0 or 1, gradients being constant, and
# hessians being zero.
X, y = data
gb = HistGradientBoostingClassifier(learning_rate=100, max_iter=10)
gb.fit(X, y)
def test_small_trainset():
# Make sure that the small trainset is stratified and has the expected
# length (10k samples)
n_samples = 20000
original_distrib = {0: 0.1, 1: 0.2, 2: 0.3, 3: 0.4}
rng = np.random.RandomState(42)
X = rng.randn(n_samples).reshape(n_samples, 1)
y = [[class_] * int(prop * n_samples) for (class_, prop)
in original_distrib.items()]
y = shuffle(np.concatenate(y))
gb = HistGradientBoostingClassifier()
# Compute the small training set
X_small, y_small, _ = gb._get_small_trainset(X, y, seed=42,
sample_weight_train=None)
# Compute the class distribution in the small training set
unique, counts = np.unique(y_small, return_counts=True)
small_distrib = {class_: count / 10000 for (class_, count)
in zip(unique, counts)}
# Test that the small training set has the expected length
assert X_small.shape[0] == 10000
assert y_small.shape[0] == 10000
# Test that the class distributions in the whole dataset and in the small
# training set are identical
assert small_distrib == pytest.approx(original_distrib)
def test_missing_values_minmax_imputation():
# Compare the built-in missing value handling of Histogram GBC with an
# a-priori missing value imputation strategy that should yield the same
# results in terms of decision function.
#
# Each feature (containing NaNs) is replaced by 2 features:
# - one where the nans are replaced by min(feature) - 1
# - one where the nans are replaced by max(feature) + 1
# A split where nans go to the left has an equivalent split in the
# first (min) feature, and a split where nans go to the right has an
# equivalent split in the second (max) feature.
#
# Assuming the data is such that there is never a tie to select the best
# feature to split on during training, the learned decision trees should be
# strictly equivalent (learn a sequence of splits that encode the same
# decision function).
#
# The MinMaxImputer transformer is meant to be a toy implementation of the
# "Missing In Attributes" (MIA) missing value handling for decision trees
# https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305
# The implementation of MIA as an imputation transformer was suggested by
# "Remark 3" in https://arxiv.org/abs/1902.06931
class MinMaxImputer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
mm = MinMaxScaler().fit(X)
self.data_min_ = mm.data_min_
self.data_max_ = mm.data_max_
return self
def transform(self, X):
X_min, X_max = X.copy(), X.copy()
for feature_idx in range(X.shape[1]):
nan_mask = np.isnan(X[:, feature_idx])
X_min[nan_mask, feature_idx] = self.data_min_[feature_idx] - 1
X_max[nan_mask, feature_idx] = self.data_max_[feature_idx] + 1
return np.concatenate([X_min, X_max], axis=1)
def make_missing_value_data(n_samples=int(1e4), seed=0):
rng = np.random.RandomState(seed)
X, y = make_regression(n_samples=n_samples, n_features=4,
random_state=rng)
# Pre-bin the data to ensure a deterministic handling by the 2
# strategies and also make it easier to insert np.nan in a structured
# way:
X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X)
# First feature has missing values completely at random:
rnd_mask = rng.rand(X.shape[0]) > 0.9
X[rnd_mask, 0] = np.nan
# Second and third features have missing values for extreme values
# (censoring missingness):
low_mask = X[:, 1] == 0
X[low_mask, 1] = np.nan
high_mask = X[:, 2] == X[:, 2].max()
X[high_mask, 2] = np.nan
# Make the last feature nan pattern very informative:
y_max = np.percentile(y, 70)
y_max_mask = y >= y_max
y[y_max_mask] = y_max
X[y_max_mask, 3] = np.nan
# Check that there is at least one missing value in each feature:
for feature_idx in range(X.shape[1]):
assert any(np.isnan(X[:, feature_idx]))
# Let's use a test set to check that the learned decision function is
# the same as evaluated on unseen data. Otherwise it could just be the
# case that we find two independent ways to overfit the training set.
return train_test_split(X, y, random_state=rng)
# n_samples needs to be large enough to minimize the likelihood of having
# several candidate splits with the same gain value in a given tree.
X_train, X_test, y_train, y_test = make_missing_value_data(
n_samples=int(1e4), seed=0)
# Use a small number of leaf nodes and iterations so as to keep the models
# under-fitting, which minimizes the likelihood of ties when training them.
gbm1 = HistGradientBoostingRegressor(max_iter=100,
max_leaf_nodes=5,
random_state=0)
gbm1.fit(X_train, y_train)
gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1))
gbm2.fit(X_train, y_train)
# Check that the models reach the same score:
assert gbm1.score(X_train, y_train) == \
pytest.approx(gbm2.score(X_train, y_train))
assert gbm1.score(X_test, y_test) == \
pytest.approx(gbm2.score(X_test, y_test))
# Check the individual prediction match as a finer grained
# decision function check.
assert_allclose(gbm1.predict(X_train), gbm2.predict(X_train))
assert_allclose(gbm1.predict(X_test), gbm2.predict(X_test))
def test_infinite_values():
# Basic test for infinite values
X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1)
y = np.array([0, 0, 1, 1])
gbdt = HistGradientBoostingRegressor(min_samples_leaf=1)
gbdt.fit(X, y)
np.testing.assert_allclose(gbdt.predict(X), y, atol=1e-4)
def test_consistent_lengths():
X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1)
y = np.array([0, 0, 1, 1])
sample_weight = np.array([.1, .3, .1])
gbdt = HistGradientBoostingRegressor()
with pytest.raises(ValueError,
match=r"sample_weight.shape == \(3,\), expected"):
gbdt.fit(X, y, sample_weight)
with pytest.raises(ValueError,
match="Found input variables with inconsistent number"):
gbdt.fit(X, y[1:])
def test_infinite_values_missing_values():
# High level test making sure that inf and nan values are properly handled
# when both are present. This is similar to
# test_split_on_nan_with_infinite_values() in test_grower.py, though we
# cannot check the predictions for binned values here.
X = np.asarray([-np.inf, 0, 1, np.inf, np.nan]).reshape(-1, 1)
y_isnan = np.isnan(X.ravel())
y_isinf = X.ravel() == np.inf
stump_clf = HistGradientBoostingClassifier(min_samples_leaf=1, max_iter=1,
learning_rate=1, max_depth=2)
assert stump_clf.fit(X, y_isinf).score(X, y_isinf) == 1
assert stump_clf.fit(X, y_isnan).score(X, y_isnan) == 1
def test_crossentropy_binary_problem():
# categorical_crossentropy should only be used if there are more than two
# classes present. PR #14869
X = [[1], [0]]
y = [0, 1]
gbrt = HistGradientBoostingClassifier(loss='categorical_crossentropy')
with pytest.raises(ValueError,
match="'categorical_crossentropy' is not suitable for"):
gbrt.fit(X, y)
@pytest.mark.parametrize("scoring", [None, 'loss'])
def test_string_target_early_stopping(scoring):
# Regression test for #14709 where the targets need to be encoded before
# computing the score
rng = np.random.RandomState(42)
X = rng.randn(100, 10)
y = np.array(['x'] * 50 + ['y'] * 50, dtype=object)
gbrt = HistGradientBoostingClassifier(n_iter_no_change=10, scoring=scoring)
gbrt.fit(X, y)
def test_zero_sample_weights_regression():
# Make sure setting a sample weight to zero amounts to ignoring the
# corresponding sample
X = [[1, 0],
[1, 0],
[1, 0],
[0, 1]]
y = [0, 0, 1, 0]
# ignore the first 2 training samples by setting their weight to 0
sample_weight = [0, 0, 1, 1]
gb = HistGradientBoostingRegressor(min_samples_leaf=1)
gb.fit(X, y, sample_weight=sample_weight)
assert gb.predict([[1, 0]])[0] > 0.5
def test_zero_sample_weights_classification():
# Make sure setting a sample weight to zero amounts to ignoring the
# corresponding sample
X = [[1, 0],
[1, 0],
[1, 0],
[0, 1]]
y = [0, 0, 1, 0]
# ignore the first 2 training samples by setting their weight to 0
sample_weight = [0, 0, 1, 1]
gb = HistGradientBoostingClassifier(loss='binary_crossentropy',
min_samples_leaf=1)
gb.fit(X, y, sample_weight=sample_weight)
assert_array_equal(gb.predict([[1, 0]]), [1])
X = [[1, 0],
[1, 0],
[1, 0],
[0, 1],
[1, 1]]
y = [0, 0, 1, 0, 2]
# ignore the first 2 training samples by setting their weight to 0
sample_weight = [0, 0, 1, 1, 1]
gb = HistGradientBoostingClassifier(loss='categorical_crossentropy',
min_samples_leaf=1)
gb.fit(X, y, sample_weight=sample_weight)
assert_array_equal(gb.predict([[1, 0]]), [1])
@pytest.mark.parametrize('problem', (
'regression',
'binary_classification',
'multiclass_classification'
))
@pytest.mark.parametrize('duplication', ('half', 'all'))
def test_sample_weight_effect(problem, duplication):
# High level test to make sure that duplicating a sample is equivalent to
# giving it a weight of 2.
# The test would fail for n_samples > 255 because binning does not take
# sample weights into account. Keeping n_samples <= 255 makes sure only
# unique values are used, so sample weights have no effect on binning.
n_samples = 255
n_features = 2
if problem == 'regression':
X, y = make_regression(n_samples=n_samples, n_features=n_features,
n_informative=n_features, random_state=0)
Klass = HistGradientBoostingRegressor
else:
n_classes = 2 if problem == 'binary_classification' else 3
X, y = make_classification(n_samples=n_samples, n_features=n_features,
n_informative=n_features, n_redundant=0,
n_clusters_per_class=1,
n_classes=n_classes, random_state=0)
Klass = HistGradientBoostingClassifier
# This test can't pass if min_samples_leaf > 1 because that would force 2
# samples to be in the same node in est_sw, while these samples would be
# free to be separate in est_dup: est_dup would just group together the
# duplicated samples.
est = Klass(min_samples_leaf=1)
# Create dataset with duplicate and corresponding sample weights
if duplication == 'half':
lim = n_samples // 2
else:
lim = n_samples
X_dup = np.r_[X, X[:lim]]
y_dup = np.r_[y, y[:lim]]
sample_weight = np.ones(shape=(n_samples))
sample_weight[:lim] = 2
est_sw = clone(est).fit(X, y, sample_weight=sample_weight)
est_dup = clone(est).fit(X_dup, y_dup)
# checking raw_predict is stricter than just predict for classification
assert np.allclose(est_sw._raw_predict(X_dup),
est_dup._raw_predict(X_dup))
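# A minimal sketch of the equivalence checked above on a tiny dataset, under
# the same assumptions (unique feature values so binning is unaffected,
# min_samples_leaf=1): duplicating the first sample should yield the same raw
# predictions as giving it a sample weight of 2.
def _duplication_vs_weight_sketch():
    X = np.array([[0.], [1.], [2.], [3.]])
    y = np.array([0., 0., 1., 1.])
    est_dup = HistGradientBoostingRegressor(min_samples_leaf=1).fit(
        np.r_[X, X[:1]], np.r_[y, y[:1]])
    est_sw = HistGradientBoostingRegressor(min_samples_leaf=1).fit(
        X, y, sample_weight=[2, 1, 1, 1])
    return np.allclose(est_sw._raw_predict(X), est_dup._raw_predict(X))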
@pytest.mark.parametrize('loss_name', ('least_squares',
'least_absolute_deviation'))
def test_sum_hessians_are_sample_weight(loss_name):
# For losses with constant hessians, the sum_hessians field of the
# histograms must be equal to the sum of the sample weights of the samples
# in the corresponding bin.
rng = np.random.RandomState(0)
n_samples = 1000
n_features = 2
X, y = make_regression(n_samples=n_samples, n_features=n_features,
random_state=rng)
bin_mapper = _BinMapper()
X_binned = bin_mapper.fit_transform(X)
sample_weight = rng.normal(size=n_samples)
loss = _LOSSES[loss_name](sample_weight=sample_weight)
gradients, hessians = loss.init_gradients_and_hessians(
n_samples=n_samples, prediction_dim=1, sample_weight=sample_weight)
raw_predictions = rng.normal(size=(1, n_samples))
loss.update_gradients_and_hessians(gradients, hessians, y,
raw_predictions, sample_weight)
# build sum_sample_weight which contains the sum of the sample weights at
# each bin (for each feature). This must be equal to the sum_hessians
# field of the corresponding histogram
sum_sw = np.zeros(shape=(n_features, bin_mapper.n_bins))
for feature_idx in range(n_features):
for sample_idx in range(n_samples):
sum_sw[feature_idx, X_binned[sample_idx, feature_idx]] += (
sample_weight[sample_idx])
# Build histogram
grower = TreeGrower(X_binned, gradients[0], hessians[0],
n_bins=bin_mapper.n_bins)
histograms = grower.histogram_builder.compute_histograms_brute(
grower.root.sample_indices)
for feature_idx in range(n_features):
for bin_idx in range(bin_mapper.n_bins):
assert histograms[feature_idx, bin_idx]['sum_hessians'] == (
pytest.approx(sum_sw[feature_idx, bin_idx], rel=1e-5))
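# A minimal numpy-only sketch of the property above: with a unit hessian per
# sample (constant-hessian losses), the weighted hessian accumulated in a bin
# is simply the sum of the sample weights of the samples falling in that bin.
# The bin assignment here is assumed, not computed by a _BinMapper.
def _sum_hessians_per_bin_sketch():
    bins = np.array([0, 0, 1, 1, 1])
    sample_weight = np.array([.5, 1.5, 1., 2., .5])
    unit_hessians = np.ones_like(sample_weight)
    # -> array([2. , 3.5]), i.e. the per-bin sums of the sample weights
    return np.bincount(bins, weights=unit_hessians * sample_weight)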
def test_max_depth_max_leaf_nodes():
# Non-regression test for
# https://github.com/scikit-learn/scikit-learn/issues/16179
# there was a bug when the max_depth and the max_leaf_nodes criteria were
# met at the same time, which would lead to max_leaf_nodes not being
# respected.
X, y = make_classification(random_state=0)
est = HistGradientBoostingClassifier(max_depth=2, max_leaf_nodes=3,
max_iter=1).fit(X, y)
tree = est._predictors[0][0]
assert tree.get_max_depth() == 2
assert tree.get_n_leaf_nodes() == 3 # would be 4 prior to bug fix
def test_early_stopping_on_test_set_with_warm_start():
# Non-regression test for #16661 where the second fit used to fail with
# warm_start=True, early_stopping on, and no validation set
X, y = make_classification(random_state=0)
gb = HistGradientBoostingClassifier(
max_iter=1, scoring='loss', warm_start=True, early_stopping=True,
n_iter_no_change=1, validation_fraction=None)
gb.fit(X, y)
# does not raise on second call
gb.set_params(max_iter=2)
gb.fit(X, y)
@pytest.mark.parametrize('Est', (HistGradientBoostingClassifier,
HistGradientBoostingRegressor))
def test_single_node_trees(Est):
# Make sure it's still possible to build single-node trees. In that case
# the value of the root is set to 0. That's a correct value: if the tree is
# single-node that's because min_gain_to_split is not respected right from
# the root, so we don't want the tree to have any impact on the
# predictions.
X, y = make_classification(random_state=0)
y[:] = 1 # constant target will lead to a single root node
est = Est(max_iter=20)
est.fit(X, y)
assert all(len(predictor[0].nodes) == 1 for predictor in est._predictors)
assert all(predictor[0].nodes[0]['value'] == 0
for predictor in est._predictors)
# Still gives correct predictions thanks to the baseline prediction
assert_allclose(est.predict(X), y)
@pytest.mark.parametrize('Est, loss, X, y', [
(
HistGradientBoostingClassifier,
BinaryCrossEntropy(sample_weight=None),
X_classification,
y_classification
),
(
HistGradientBoostingRegressor,
LeastSquares(sample_weight=None),
X_regression,
y_regression
)
])
def test_custom_loss(Est, loss, X, y):
est = Est(loss=loss, max_iter=20)
est.fit(X, y)

View file

@ -0,0 +1,399 @@
import numpy as np
import pytest
from pytest import approx
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
def _make_training_data(n_bins=256, constant_hessian=True):
rng = np.random.RandomState(42)
n_samples = 10000
# Generate some test data directly binned so as to test the grower code
# independently of the binning logic.
X_binned = rng.randint(0, n_bins - 1, size=(n_samples, 2),
dtype=X_BINNED_DTYPE)
X_binned = np.asfortranarray(X_binned)
def true_decision_function(input_features):
"""Ground truth decision function
This is a very simple yet asymmetric decision tree. Therefore the
grower code should have no trouble recovering the decision function
from 10000 training samples.
"""
if input_features[0] <= n_bins // 2:
return -1
else:
return -1 if input_features[1] <= n_bins // 3 else 1
target = np.array([true_decision_function(x) for x in X_binned],
dtype=Y_DTYPE)
# Assume a square loss applied to an initial model that always predicts 0
# (hardcoded for this test):
all_gradients = target.astype(G_H_DTYPE)
shape_hessians = 1 if constant_hessian else all_gradients.shape
all_hessians = np.ones(shape=shape_hessians, dtype=G_H_DTYPE)
return X_binned, all_gradients, all_hessians
def _check_children_consistency(parent, left, right):
# Make sure the samples are correctly dispatched from a parent to its
# children
assert parent.left_child is left
assert parent.right_child is right
# each sample from the parent is propagated to one of the two children
assert (len(left.sample_indices) + len(right.sample_indices)
== len(parent.sample_indices))
assert (set(left.sample_indices).union(set(right.sample_indices))
== set(parent.sample_indices))
# samples are sent either to the left or the right node, never to both
assert (set(left.sample_indices).intersection(set(right.sample_indices))
== set())
@pytest.mark.parametrize(
'n_bins, constant_hessian, stopping_param, shrinkage',
[
(11, True, "min_gain_to_split", 0.5),
(11, False, "min_gain_to_split", 1.),
(11, True, "max_leaf_nodes", 1.),
(11, False, "max_leaf_nodes", 0.1),
(42, True, "max_leaf_nodes", 0.01),
(42, False, "max_leaf_nodes", 1.),
(256, True, "min_gain_to_split", 1.),
(256, True, "max_leaf_nodes", 0.1),
]
)
def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage):
X_binned, all_gradients, all_hessians = _make_training_data(
n_bins=n_bins, constant_hessian=constant_hessian)
n_samples = X_binned.shape[0]
if stopping_param == "max_leaf_nodes":
stopping_param = {"max_leaf_nodes": 3}
else:
stopping_param = {"min_gain_to_split": 0.01}
grower = TreeGrower(X_binned, all_gradients, all_hessians,
n_bins=n_bins, shrinkage=shrinkage,
min_samples_leaf=1, **stopping_param)
# The root node is not yet split, but the best possible split has
# already been evaluated:
assert grower.root.left_child is None
assert grower.root.right_child is None
root_split = grower.root.split_info
assert root_split.feature_idx == 0
assert root_split.bin_idx == n_bins // 2
assert len(grower.splittable_nodes) == 1
# Calling split_next() applies the next split and computes the best split
# for each of the two newly introduced child nodes.
left_node, right_node = grower.split_next()
# All training samples have been split between the two nodes, approximately
# 50%/50%
_check_children_consistency(grower.root, left_node, right_node)
assert len(left_node.sample_indices) > 0.4 * n_samples
assert len(left_node.sample_indices) < 0.6 * n_samples
if grower.min_gain_to_split > 0:
# The left node is too pure: there is no gain to split it further.
assert left_node.split_info.gain < grower.min_gain_to_split
assert left_node in grower.finalized_leaves
# The right node can still be split further, this time on feature #1
split_info = right_node.split_info
assert split_info.gain > 1.
assert split_info.feature_idx == 1
assert split_info.bin_idx == n_bins // 3
assert right_node.left_child is None
assert right_node.right_child is None
# The right split has not been applied yet. Let's do it now:
assert len(grower.splittable_nodes) == 1
right_left_node, right_right_node = grower.split_next()
_check_children_consistency(right_node, right_left_node, right_right_node)
assert len(right_left_node.sample_indices) > 0.1 * n_samples
assert len(right_left_node.sample_indices) < 0.2 * n_samples
assert len(right_right_node.sample_indices) > 0.2 * n_samples
assert len(right_right_node.sample_indices) < 0.4 * n_samples
# All the leaves are pure; it is not possible to split any further:
assert not grower.splittable_nodes
grower._apply_shrinkage()
# Check the values of the leaves:
assert grower.root.left_child.value == approx(shrinkage)
assert grower.root.right_child.left_child.value == approx(shrinkage)
assert grower.root.right_child.right_child.value == approx(-shrinkage,
rel=1e-3)
def test_predictor_from_grower():
# Build a tree on the toy 3-leaf dataset to extract the predictor.
n_bins = 256
X_binned, all_gradients, all_hessians = _make_training_data(
n_bins=n_bins)
grower = TreeGrower(X_binned, all_gradients, all_hessians,
n_bins=n_bins, shrinkage=1.,
max_leaf_nodes=3, min_samples_leaf=5)
grower.grow()
assert grower.n_nodes == 5 # (2 decision nodes + 3 leaves)
# Check that the node structure can be converted into a predictor
# object to perform predictions at scale
predictor = grower.make_predictor()
assert predictor.nodes.shape[0] == 5
assert predictor.nodes['is_leaf'].sum() == 3
# Probe some predictions for each leaf of the tree
# each group of 3 samples corresponds to a condition in _make_training_data
input_data = np.array([
[0, 0],
[42, 99],
[128, 254],
[129, 0],
[129, 85],
[254, 85],
[129, 86],
[129, 254],
[242, 100],
], dtype=np.uint8)
missing_values_bin_idx = n_bins - 1
predictions = predictor.predict_binned(input_data, missing_values_bin_idx)
expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1]
assert np.allclose(predictions, expected_targets)
# Check that training set can be recovered exactly:
predictions = predictor.predict_binned(X_binned, missing_values_bin_idx)
assert np.allclose(predictions, -all_gradients)
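# A minimal sketch of why the exact recovery check above holds: with the half
# squared error loss and an initial prediction of 0, the gradient of each
# sample is (prediction - target) = -target, so a tree that isolates every
# target value in a pure leaf (shrinkage=1) predicts -gradient, i.e. the target.
def _gradient_is_minus_target_sketch(target):
    initial_prediction = np.zeros_like(target)
    gradients = initial_prediction - target  # half squared error gradient
    return np.allclose(-gradients, target)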
@pytest.mark.parametrize(
'n_samples, min_samples_leaf, n_bins, constant_hessian, noise',
[
(11, 10, 7, True, 0),
(13, 10, 42, False, 0),
(56, 10, 255, True, 0.1),
(101, 3, 7, True, 0),
(200, 42, 42, False, 0),
(300, 55, 255, True, 0.1),
(300, 301, 255, True, 0.1),
]
)
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins,
constant_hessian, noise):
rng = np.random.RandomState(seed=0)
# data = linear target, 3 features, 1 irrelevant.
X = rng.normal(size=(n_samples, 3))
y = X[:, 0] - X[:, 1]
if noise:
y_scale = y.std()
y += rng.normal(scale=noise, size=n_samples) * y_scale
mapper = _BinMapper(n_bins=n_bins)
X = mapper.fit_transform(X)
all_gradients = y.astype(G_H_DTYPE)
shape_hessian = 1 if constant_hessian else all_gradients.shape
all_hessians = np.ones(shape=shape_hessian, dtype=G_H_DTYPE)
grower = TreeGrower(X, all_gradients, all_hessians,
n_bins=n_bins, shrinkage=1.,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=n_samples)
grower.grow()
predictor = grower.make_predictor(
bin_thresholds=mapper.bin_thresholds_)
if n_samples >= min_samples_leaf:
for node in predictor.nodes:
if node['is_leaf']:
assert node['count'] >= min_samples_leaf
else:
assert predictor.nodes.shape[0] == 1
assert predictor.nodes[0]['is_leaf']
assert predictor.nodes[0]['count'] == n_samples
@pytest.mark.parametrize('n_samples, min_samples_leaf', [
(99, 50),
(100, 50)])
def test_min_samples_leaf_root(n_samples, min_samples_leaf):
# Make sure root node isn't split if n_samples is not at least twice
# min_samples_leaf
rng = np.random.RandomState(seed=0)
n_bins = 256
# data = linear target, 3 features, 1 irrelevant.
X = rng.normal(size=(n_samples, 3))
y = X[:, 0] - X[:, 1]
mapper = _BinMapper(n_bins=n_bins)
X = mapper.fit_transform(X)
all_gradients = y.astype(G_H_DTYPE)
all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
grower = TreeGrower(X, all_gradients, all_hessians,
n_bins=n_bins, shrinkage=1.,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=n_samples)
grower.grow()
if n_samples >= min_samples_leaf * 2:
assert len(grower.finalized_leaves) >= 2
else:
assert len(grower.finalized_leaves) == 1
def assert_is_stump(grower):
# To assert that stumps are created when max_depth=1
for leaf in (grower.root.left_child, grower.root.right_child):
assert leaf.left_child is None
assert leaf.right_child is None
@pytest.mark.parametrize('max_depth', [1, 2, 3])
def test_max_depth(max_depth):
# Make sure max_depth parameter works as expected
rng = np.random.RandomState(seed=0)
n_bins = 256
n_samples = 1000
# data = linear target, 3 features, 1 irrelevant.
X = rng.normal(size=(n_samples, 3))
y = X[:, 0] - X[:, 1]
mapper = _BinMapper(n_bins=n_bins)
X = mapper.fit_transform(X)
all_gradients = y.astype(G_H_DTYPE)
all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
grower = TreeGrower(X, all_gradients, all_hessians, max_depth=max_depth)
grower.grow()
depth = max(leaf.depth for leaf in grower.finalized_leaves)
assert depth == max_depth
if max_depth == 1:
assert_is_stump(grower)
def test_input_validation():
X_binned, all_gradients, all_hessians = _make_training_data()
X_binned_float = X_binned.astype(np.float32)
with pytest.raises(NotImplementedError,
match="X_binned must be of type uint8"):
TreeGrower(X_binned_float, all_gradients, all_hessians)
X_binned_C_array = np.ascontiguousarray(X_binned)
with pytest.raises(
ValueError,
match="X_binned should be passed as Fortran contiguous array"):
TreeGrower(X_binned_C_array, all_gradients, all_hessians)
def test_init_parameters_validation():
X_binned, all_gradients, all_hessians = _make_training_data()
with pytest.raises(ValueError,
match="min_gain_to_split=-1 must be positive"):
TreeGrower(X_binned, all_gradients, all_hessians,
min_gain_to_split=-1)
with pytest.raises(ValueError,
match="min_hessian_to_split=-1 must be positive"):
TreeGrower(X_binned, all_gradients, all_hessians,
min_hessian_to_split=-1)
def test_missing_value_predict_only():
# Make sure that missing values are supported at predict time even if they
# were not encountered in the training data: the missing values are
# assigned to whichever child has the most samples.
rng = np.random.RandomState(0)
n_samples = 100
X_binned = rng.randint(0, 256, size=(n_samples, 1), dtype=np.uint8)
X_binned = np.asfortranarray(X_binned)
gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
hessians = np.ones(shape=1, dtype=G_H_DTYPE)
grower = TreeGrower(X_binned, gradients, hessians, min_samples_leaf=5,
has_missing_values=False)
grower.grow()
predictor = grower.make_predictor()
# go from root to a leaf, always following node with the most samples.
# That's the path nans are supposed to take
node = predictor.nodes[0]
while not node['is_leaf']:
left = predictor.nodes[node['left']]
right = predictor.nodes[node['right']]
node = left if left['count'] > right['count'] else right
prediction_main_path = node['value']
# now build X_test with only nans, and make sure all predictions are equal
# to prediction_main_path
all_nans = np.full(shape=(n_samples, 1), fill_value=np.nan)
assert np.all(predictor.predict(all_nans) == prediction_main_path)
def test_split_on_nan_with_infinite_values():
# Make sure the split on nan situations are respected even when there are
# samples with +inf values (we set the threshold to +inf when we have a
# split on nan so this test makes sure this does not introduce edge-case
# bugs). We need to use the private API so that we can also test
# predict_binned().
X = np.array([0, 1, np.inf, np.nan, np.nan]).reshape(-1, 1)
# the gradient values will force a split on nan situation
gradients = np.array([0, 0, 0, 100, 100], dtype=G_H_DTYPE)
hessians = np.ones(shape=1, dtype=G_H_DTYPE)
bin_mapper = _BinMapper()
X_binned = bin_mapper.fit_transform(X)
n_bins_non_missing = 3
has_missing_values = True
grower = TreeGrower(X_binned, gradients, hessians,
n_bins_non_missing=n_bins_non_missing,
has_missing_values=has_missing_values,
min_samples_leaf=1)
grower.grow()
predictor = grower.make_predictor(
bin_thresholds=bin_mapper.bin_thresholds_
)
# sanity check: this was a split on nan
assert predictor.nodes[0]['threshold'] == np.inf
assert predictor.nodes[0]['bin_threshold'] == n_bins_non_missing - 1
# Make sure in particular that the +inf sample is mapped to the left child
# Note that lightgbm "fails" here and will assign the inf sample to the
# right child, even though it's a "split on nan" situation.
predictions = predictor.predict(X)
predictions_binned = predictor.predict_binned(
X_binned, missing_values_bin_idx=bin_mapper.missing_values_bin_idx_)
np.testing.assert_allclose(predictions, -gradients)
np.testing.assert_allclose(predictions_binned, -gradients)

View file

@ -0,0 +1,202 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose
from numpy.testing import assert_array_equal
from sklearn.ensemble._hist_gradient_boosting.histogram import (
_build_histogram_naive,
_build_histogram,
_build_histogram_no_hessian,
_build_histogram_root_no_hessian,
_build_histogram_root,
_subtract_histograms
)
from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
@pytest.mark.parametrize(
'build_func', [_build_histogram_naive, _build_histogram])
def test_build_histogram(build_func):
binned_feature = np.array([0, 2, 0, 1, 2, 0, 2, 1], dtype=X_BINNED_DTYPE)
# Small sample_indices (below unrolling threshold)
ordered_gradients = np.array([0, 1, 3], dtype=G_H_DTYPE)
ordered_hessians = np.array([1, 1, 2], dtype=G_H_DTYPE)
sample_indices = np.array([0, 2, 3], dtype=np.uint32)
hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE)
build_func(0, sample_indices, binned_feature, ordered_gradients,
ordered_hessians, hist)
hist = hist[0]
assert_array_equal(hist['count'], [2, 1, 0])
assert_allclose(hist['sum_gradients'], [1, 3, 0])
assert_allclose(hist['sum_hessians'], [2, 2, 0])
# Larger sample_indices (above unrolling threshold)
sample_indices = np.array([0, 2, 3, 6, 7], dtype=np.uint32)
ordered_gradients = np.array([0, 1, 3, 0, 1], dtype=G_H_DTYPE)
ordered_hessians = np.array([1, 1, 2, 1, 0], dtype=G_H_DTYPE)
hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE)
build_func(0, sample_indices, binned_feature, ordered_gradients,
ordered_hessians, hist)
hist = hist[0]
assert_array_equal(hist['count'], [2, 2, 1])
assert_allclose(hist['sum_gradients'], [1, 4, 0])
assert_allclose(hist['sum_hessians'], [2, 2, 1])
def test_histogram_sample_order_independence():
# Make sure the order of the samples has no impact on the histogram
# computations
rng = np.random.RandomState(42)
n_sub_samples = 100
n_samples = 1000
n_bins = 256
binned_feature = rng.randint(0, n_bins - 1, size=n_samples,
dtype=X_BINNED_DTYPE)
sample_indices = rng.choice(np.arange(n_samples, dtype=np.uint32),
n_sub_samples, replace=False)
ordered_gradients = rng.randn(n_sub_samples).astype(G_H_DTYPE)
hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_build_histogram_no_hessian(0, sample_indices, binned_feature,
ordered_gradients, hist_gc)
ordered_hessians = rng.exponential(size=n_sub_samples).astype(G_H_DTYPE)
hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_build_histogram(0, sample_indices, binned_feature,
ordered_gradients, ordered_hessians, hist_ghc)
permutation = rng.permutation(n_sub_samples)
hist_gc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_build_histogram_no_hessian(0, sample_indices[permutation],
binned_feature, ordered_gradients[permutation],
hist_gc_perm)
hist_ghc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_build_histogram(0, sample_indices[permutation], binned_feature,
ordered_gradients[permutation],
ordered_hessians[permutation], hist_ghc_perm)
hist_gc = hist_gc[0]
hist_ghc = hist_ghc[0]
hist_gc_perm = hist_gc_perm[0]
hist_ghc_perm = hist_ghc_perm[0]
assert_allclose(hist_gc['sum_gradients'], hist_gc_perm['sum_gradients'])
assert_array_equal(hist_gc['count'], hist_gc_perm['count'])
assert_allclose(hist_ghc['sum_gradients'], hist_ghc_perm['sum_gradients'])
assert_allclose(hist_ghc['sum_hessians'], hist_ghc_perm['sum_hessians'])
assert_array_equal(hist_ghc['count'], hist_ghc_perm['count'])
@pytest.mark.parametrize("constant_hessian", [True, False])
def test_unrolled_equivalent_to_naive(constant_hessian):
# Make sure the different unrolled histogram computations give the same
# results as the naive one.
rng = np.random.RandomState(42)
n_samples = 10
n_bins = 5
sample_indices = np.arange(n_samples).astype(np.uint32)
binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8)
ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
if constant_hessian:
ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
else:
ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
hist_gc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
hist_ghc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
hist_naive = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_build_histogram_root_no_hessian(0, binned_feature,
ordered_gradients, hist_gc_root)
_build_histogram_root(0, binned_feature, ordered_gradients,
ordered_hessians, hist_ghc_root)
_build_histogram_no_hessian(0, sample_indices, binned_feature,
ordered_gradients, hist_gc)
_build_histogram(0, sample_indices, binned_feature,
ordered_gradients, ordered_hessians, hist_ghc)
_build_histogram_naive(0, sample_indices, binned_feature,
ordered_gradients, ordered_hessians, hist_naive)
hist_naive = hist_naive[0]
hist_gc_root = hist_gc_root[0]
hist_ghc_root = hist_ghc_root[0]
hist_gc = hist_gc[0]
hist_ghc = hist_ghc[0]
for hist in (hist_gc_root, hist_ghc_root, hist_gc, hist_ghc):
assert_array_equal(hist['count'], hist_naive['count'])
assert_allclose(hist['sum_gradients'], hist_naive['sum_gradients'])
for hist in (hist_ghc_root, hist_ghc):
assert_allclose(hist['sum_hessians'], hist_naive['sum_hessians'])
for hist in (hist_gc_root, hist_gc):
assert_array_equal(hist['sum_hessians'], np.zeros(n_bins))
@pytest.mark.parametrize("constant_hessian", [True, False])
def test_hist_subtraction(constant_hessian):
# Make sure the histogram subtraction trick gives the same result as the
# classical method.
rng = np.random.RandomState(42)
n_samples = 10
n_bins = 5
sample_indices = np.arange(n_samples).astype(np.uint32)
binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8)
ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
if constant_hessian:
ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
else:
ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
hist_parent = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
if constant_hessian:
_build_histogram_no_hessian(0, sample_indices, binned_feature,
ordered_gradients, hist_parent)
else:
_build_histogram(0, sample_indices, binned_feature,
ordered_gradients, ordered_hessians, hist_parent)
mask = rng.randint(0, 2, n_samples).astype(bool)
sample_indices_left = sample_indices[mask]
ordered_gradients_left = ordered_gradients[mask]
ordered_hessians_left = ordered_hessians[mask]
hist_left = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
if constant_hessian:
_build_histogram_no_hessian(0, sample_indices_left,
binned_feature, ordered_gradients_left,
hist_left)
else:
_build_histogram(0, sample_indices_left, binned_feature,
ordered_gradients_left, ordered_hessians_left,
hist_left)
sample_indices_right = sample_indices[~mask]
ordered_gradients_right = ordered_gradients[~mask]
ordered_hessians_right = ordered_hessians[~mask]
hist_right = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
if constant_hessian:
_build_histogram_no_hessian(0, sample_indices_right,
binned_feature, ordered_gradients_right,
hist_right)
else:
_build_histogram(0, sample_indices_right, binned_feature,
ordered_gradients_right, ordered_hessians_right,
hist_right)
hist_left_sub = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
hist_right_sub = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_subtract_histograms(0, n_bins, hist_parent, hist_right, hist_left_sub)
_subtract_histograms(0, n_bins, hist_parent, hist_left, hist_right_sub)
for key in ('count', 'sum_hessians', 'sum_gradients'):
assert_allclose(hist_left[key], hist_left_sub[key], rtol=1e-6)
assert_allclose(hist_right[key], hist_right_sub[key], rtol=1e-6)
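# A minimal numpy sketch of the subtraction trick checked above: since every
# parent sample goes to exactly one child, a child's per-bin statistics can be
# recovered by subtracting the sibling's histogram from the parent's, without
# scanning the child's samples. Only sum_gradients is shown here.
def _histogram_subtraction_sketch():
    binned_feature = np.array([0, 1, 1, 2, 2, 2])
    gradients = np.array([1., -1., 2., .5, -.5, 1.])
    in_left = np.array([True, True, False, True, False, False])
    def hist(mask):
        return np.bincount(binned_feature[mask], weights=gradients[mask],
                           minlength=3)
    hist_right_sub = hist(np.ones_like(in_left)) - hist(in_left)
    return np.allclose(hist_right_sub, hist(~in_left))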

View file

@ -0,0 +1,318 @@
import numpy as np
from numpy.testing import assert_almost_equal
from numpy.testing import assert_allclose
from scipy.optimize import newton
from sklearn.utils import assert_all_finite
from sklearn.utils.fixes import sp_version, parse_version
import pytest
from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES
from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.utils._testing import skip_if_32bit
def get_derivatives_helper(loss):
"""Return get_gradients() and get_hessians() functions for a given loss.
"""
def get_gradients(y_true, raw_predictions):
# create gradients and hessians array, update inplace, and return
gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE)
hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE)
loss.update_gradients_and_hessians(gradients, hessians, y_true,
raw_predictions, None)
return gradients
def get_hessians(y_true, raw_predictions):
# create gradients and hessians array, update inplace, and return
gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE)
hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE)
loss.update_gradients_and_hessians(gradients, hessians, y_true,
raw_predictions, None)
if loss.__class__.__name__ == 'LeastSquares':
# hessians aren't updated because they're constant:
# the value is 1 (and not 2) because the loss is actually a half
# least squares loss.
hessians = np.full_like(raw_predictions, fill_value=1)
elif loss.__class__.__name__ == 'LeastAbsoluteDeviation':
# hessians aren't updated because they're constant
hessians = np.full_like(raw_predictions, fill_value=0)
return hessians
return get_gradients, get_hessians
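# A minimal sketch of why the constant hessian is 1 for 'least_squares': the
# loss implemented is the *half* least squares loss l(y, p) = 0.5 * (y - p)**2,
# whose derivatives w.r.t. the raw prediction p are dl/dp = p - y and
# d2l/dp2 = 1 (it would be 2 for the plain squared error).
def _half_least_squares_sketch(y_true, raw_prediction):
    loss = 0.5 * (y_true - raw_prediction) ** 2
    gradient = raw_prediction - y_true
    hessian = np.ones_like(raw_prediction)
    return loss, gradient, hessian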
@pytest.mark.parametrize('loss, x0, y_true', [
('least_squares', -2., 42),
('least_squares', 117., 1.05),
('least_squares', 0., 0.),
# I don't understand why but y_true == 0 fails :/
# ('binary_crossentropy', 0.3, 0),
('binary_crossentropy', -12, 1),
('binary_crossentropy', 30, 1),
('poisson', 12., 1.),
('poisson', 0., 2.),
('poisson', -22., 10.),
])
@pytest.mark.skipif(sp_version == parse_version('1.2.0'),
reason='bug in scipy 1.2.0, see scipy issue #9608')
@skip_if_32bit
def test_derivatives(loss, x0, y_true):
# Check that gradients are zero when the loss is minimized on a 1D array
# using Halley's method with the first and second order derivatives
# computed by the Loss instance.
loss = _LOSSES[loss](sample_weight=None)
y_true = np.array([y_true], dtype=Y_DTYPE)
x0 = np.array([x0], dtype=Y_DTYPE).reshape(1, 1)
get_gradients, get_hessians = get_derivatives_helper(loss)
def func(x):
return loss.pointwise_loss(y_true, x)
def fprime(x):
return get_gradients(y_true, x)
def fprime2(x):
return get_hessians(y_true, x)
optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2,
maxiter=70, tol=2e-8)
assert np.allclose(loss.inverse_link_function(optimum), y_true)
assert np.allclose(loss.pointwise_loss(y_true, optimum), 0)
assert np.allclose(get_gradients(y_true, optimum), 0, atol=1e-7)
@pytest.mark.parametrize('loss, n_classes, prediction_dim', [
('least_squares', 0, 1),
('least_absolute_deviation', 0, 1),
('binary_crossentropy', 2, 1),
('categorical_crossentropy', 3, 3),
('poisson', 0, 1),
])
@pytest.mark.skipif(Y_DTYPE != np.float64,
reason='Need 64 bits float precision for numerical checks')
def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0):
# Make sure gradients and hessians computed in the loss are correct, by
# comparing with their approximations computed with finite central
# differences.
# See https://en.wikipedia.org/wiki/Finite_difference.
rng = np.random.RandomState(seed)
n_samples = 100
if loss in ('least_squares', 'least_absolute_deviation'):
y_true = rng.normal(size=n_samples).astype(Y_DTYPE)
elif loss == 'poisson':
y_true = rng.poisson(size=n_samples).astype(Y_DTYPE)
else:
y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE)
raw_predictions = rng.normal(
size=(prediction_dim, n_samples)
).astype(Y_DTYPE)
loss = _LOSSES[loss](sample_weight=None)
get_gradients, get_hessians = get_derivatives_helper(loss)
# only take gradients and hessians of first tree / class.
gradients = get_gradients(y_true, raw_predictions)[0, :].ravel()
hessians = get_hessians(y_true, raw_predictions)[0, :].ravel()
# Approximate gradients
# For multiclass loss, we should only change the predictions of one tree
# (here the first), hence the use of offset[0, :] = eps
# As a softmax is computed, offsetting the whole array by a constant would
# have no effect on the probabilities, and thus on the loss
eps = 1e-9
offset = np.zeros_like(raw_predictions)
offset[0, :] = eps
f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset / 2)
f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset / 2)
numerical_gradients = (f_plus_eps - f_minus_eps) / eps
# Approximate hessians
eps = 1e-4 # need big enough eps as we divide by its square
offset[0, :] = eps
f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset)
f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset)
f = loss.pointwise_loss(y_true, raw_predictions)
numerical_hessians = (f_plus_eps + f_minus_eps - 2 * f) / eps**2
assert_allclose(numerical_gradients, gradients, rtol=1e-4, atol=1e-7)
assert_allclose(numerical_hessians, hessians, rtol=1e-4, atol=1e-7)
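# A minimal sketch of the central finite difference formulas used above,
# applied to the half squared error f(x) = 0.5 * (x - y)**2, whose exact
# derivatives are f'(x) = x - y and f''(x) = 1. eps is kept relatively large
# for the hessian because we divide by its square.
def _finite_differences_sketch(x=.3, y=1.2, eps=1e-4):
    def f(x):
        return .5 * (x - y) ** 2
    numerical_gradient = (f(x + eps / 2) - f(x - eps / 2)) / eps
    numerical_hessian = (f(x + eps) + f(x - eps) - 2 * f(x)) / eps ** 2
    return np.allclose([numerical_gradient, numerical_hessian], [x - y, 1.])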
def test_baseline_least_squares():
rng = np.random.RandomState(0)
loss = _LOSSES['least_squares'](sample_weight=None)
y_train = rng.normal(size=100)
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert baseline_prediction.shape == tuple() # scalar
assert baseline_prediction.dtype == y_train.dtype
# Make sure baseline prediction is the mean of all targets
assert_almost_equal(baseline_prediction, y_train.mean())
assert np.allclose(loss.inverse_link_function(baseline_prediction),
baseline_prediction)
def test_baseline_least_absolute_deviation():
rng = np.random.RandomState(0)
loss = _LOSSES['least_absolute_deviation'](sample_weight=None)
y_train = rng.normal(size=100)
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert baseline_prediction.shape == tuple() # scalar
assert baseline_prediction.dtype == y_train.dtype
# Make sure baseline prediction is the median of all targets
assert np.allclose(loss.inverse_link_function(baseline_prediction),
baseline_prediction)
assert baseline_prediction == pytest.approx(np.median(y_train))
def test_baseline_poisson():
rng = np.random.RandomState(0)
loss = _LOSSES['poisson'](sample_weight=None)
y_train = rng.poisson(size=100).astype(np.float64)
# Sanity check, make sure at least one sample is non-zero so we don't take
# log(0)
assert y_train.sum() > 0
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert np.isscalar(baseline_prediction)
assert baseline_prediction.dtype == y_train.dtype
assert_all_finite(baseline_prediction)
# Make sure the baseline prediction is the log of the mean of all targets
assert_almost_equal(np.log(y_train.mean()), baseline_prediction)
# Test baseline for y_true = 0
y_train.fill(0.)
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert_all_finite(baseline_prediction)
def test_baseline_binary_crossentropy():
rng = np.random.RandomState(0)
loss = _LOSSES['binary_crossentropy'](sample_weight=None)
for y_train in (np.zeros(shape=100), np.ones(shape=100)):
y_train = y_train.astype(np.float64)
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert_all_finite(baseline_prediction)
assert np.allclose(loss.inverse_link_function(baseline_prediction),
y_train[0])
# Make sure baseline prediction is equal to link_function(p), where p
# is the proba of the positive class. We want predict_proba() to return p,
# and by definition
# p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction)
# So we want raw_prediction = link_function(p) = log(p / (1 - p))
y_train = rng.randint(0, 2, size=100).astype(np.float64)
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert baseline_prediction.shape == tuple() # scalar
assert baseline_prediction.dtype == y_train.dtype
p = y_train.mean()
assert np.allclose(baseline_prediction, np.log(p / (1 - p)))
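# A minimal numpy sketch of the baseline computation checked above for the
# binary cross-entropy loss (assuming y_train contains both classes): the
# baseline raw prediction is the logit of the positive class frequency, so
# that applying the inverse link (sigmoid) recovers that frequency.
def _binary_baseline_sketch(y_train):
    p = y_train.mean()                    # frequency of the positive class
    baseline = np.log(p / (1 - p))        # link function (logit)
    proba = 1 / (1 + np.exp(-baseline))   # inverse link, equals p
    return baseline, proba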
def test_baseline_categorical_crossentropy():
rng = np.random.RandomState(0)
prediction_dim = 4
loss = _LOSSES['categorical_crossentropy'](sample_weight=None)
for y_train in (np.zeros(shape=100), np.ones(shape=100)):
y_train = y_train.astype(np.float64)
baseline_prediction = loss.get_baseline_prediction(y_train, None,
prediction_dim)
assert baseline_prediction.dtype == y_train.dtype
assert_all_finite(baseline_prediction)
# Same logic as for the above test. Here inverse_link_function = softmax and
# link_function = log
y_train = rng.randint(0, prediction_dim + 1, size=100).astype(np.float32)
baseline_prediction = loss.get_baseline_prediction(y_train, None,
prediction_dim)
assert baseline_prediction.shape == (prediction_dim, 1)
for k in range(prediction_dim):
p = (y_train == k).mean()
assert np.allclose(baseline_prediction[k, :], np.log(p))
@pytest.mark.parametrize('loss, problem', [
('least_squares', 'regression'),
('least_absolute_deviation', 'regression'),
('binary_crossentropy', 'classification'),
('categorical_crossentropy', 'classification'),
('poisson', 'poisson_regression'),
])
@pytest.mark.parametrize('sample_weight', ['ones', 'random'])
def test_sample_weight_multiplies_gradients(loss, problem, sample_weight):
# Make sure that passing sample weights to the gradient and hessians
# computation methods is equivalent to multiplying by the weights.
rng = np.random.RandomState(42)
n_samples = 1000
if loss == 'categorical_crossentropy':
n_classes = prediction_dim = 3
else:
n_classes = prediction_dim = 1
if problem == 'regression':
y_true = rng.normal(size=n_samples).astype(Y_DTYPE)
elif problem == 'poisson_regression':
y_true = rng.poisson(size=n_samples).astype(Y_DTYPE)
else:
y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE)
if sample_weight == 'ones':
sample_weight = np.ones(shape=n_samples, dtype=Y_DTYPE)
else:
sample_weight = rng.normal(size=n_samples).astype(Y_DTYPE)
loss_ = _LOSSES[loss](sample_weight=sample_weight)
baseline_prediction = loss_.get_baseline_prediction(
y_true, None, prediction_dim
)
raw_predictions = np.zeros(shape=(prediction_dim, n_samples),
dtype=baseline_prediction.dtype)
raw_predictions += baseline_prediction
gradients = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE)
hessians = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE)
loss_.update_gradients_and_hessians(gradients, hessians, y_true,
raw_predictions, None)
gradients_sw = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE)
hessians_sw = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE)
loss_.update_gradients_and_hessians(gradients_sw, hessians_sw, y_true,
raw_predictions, sample_weight)
assert np.allclose(gradients * sample_weight, gradients_sw)
assert np.allclose(hessians * sample_weight, hessians_sw)
def test_init_gradient_and_hessians_sample_weight():
# Make sure that passing sample_weight to a loss correctly influences the
# hessians_are_constant attribute, and consequently the shape of the
# hessians array.
prediction_dim = 2
n_samples = 5
sample_weight = None
loss = _LOSSES['least_squares'](sample_weight=sample_weight)
_, hessians = loss.init_gradients_and_hessians(
n_samples=n_samples, prediction_dim=prediction_dim,
sample_weight=None)
assert loss.hessians_are_constant
assert hessians.shape == (1, 1)
sample_weight = np.ones(n_samples)
loss = _LOSSES['least_squares'](sample_weight=sample_weight)
_, hessians = loss.init_gradients_and_hessians(
n_samples=n_samples, prediction_dim=prediction_dim,
sample_weight=sample_weight)
assert not loss.hessians_are_constant
assert hessians.shape == (prediction_dim, n_samples)

View file

@ -0,0 +1,341 @@
import numpy as np
import pytest
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint
from sklearn.ensemble._hist_gradient_boosting.splitting import (
Splitter,
compute_node_value
)
from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
def is_increasing(a):
return (np.diff(a) >= 0.0).all()
def is_decreasing(a):
return (np.diff(a) <= 0.0).all()
def assert_leaves_values_monotonic(predictor, monotonic_cst):
# Make sure leaf values (from left to right) are either all increasing
# or all decreasing (or neither), depending on the monotonic constraint.
nodes = predictor.nodes
def get_leaves_values():
"""get leaves values from left to right"""
values = []
def depth_first_collect_leaf_values(node_idx):
node = nodes[node_idx]
if node['is_leaf']:
values.append(node['value'])
return
depth_first_collect_leaf_values(node['left'])
depth_first_collect_leaf_values(node['right'])
depth_first_collect_leaf_values(0) # start at root (0)
return values
values = get_leaves_values()
if monotonic_cst == MonotonicConstraint.NO_CST:
# some increasing, some decreasing
assert not is_increasing(values) and not is_decreasing(values)
elif monotonic_cst == MonotonicConstraint.POS:
# all increasing
assert is_increasing(values)
else: # NEG
# all decreasing
assert is_decreasing(values)
def assert_children_values_monotonic(predictor, monotonic_cst):
# Make sure siblings values respect the monotonic constraints. Left should
# be lower (resp greater) than right child if constraint is POS (resp.
# NEG).
# Note that this property alone isn't enough to ensure full monotonicity,
# since we also need to guarantee that all the descendants of the left
# child won't be greater (resp. lower) than the right child, or its
# descendants. That's why we need to bound the predicted values (this is
# tested in assert_children_values_bounded)
nodes = predictor.nodes
left_lower = []
left_greater = []
for node in nodes:
if node['is_leaf']:
continue
left_idx = node['left']
right_idx = node['right']
if nodes[left_idx]['value'] < nodes[right_idx]['value']:
left_lower.append(node)
elif nodes[left_idx]['value'] > nodes[right_idx]['value']:
left_greater.append(node)
if monotonic_cst == MonotonicConstraint.NO_CST:
assert left_lower and left_greater
elif monotonic_cst == MonotonicConstraint.POS:
assert left_lower and not left_greater
else: # NEG
assert not left_lower and left_greater
def assert_children_values_bounded(grower, monotonic_cst):
# Make sure that the values of the children of a node are bounded by the
# middle value between that node and its sibling (if there is a monotonic
# constraint).
# As a bonus, we also check that the siblings values are properly ordered
# which is slightly redundant with assert_children_values_monotonic (but
# this check is done on the grower nodes whereas
# assert_children_values_monotonic is done on the predictor nodes)
if monotonic_cst == MonotonicConstraint.NO_CST:
return
def recursively_check_children_node_values(node):
if node.is_leaf:
return
if node is not grower.root and node is node.parent.left_child:
sibling = node.sibling # on the right
middle = (node.value + sibling.value) / 2
if monotonic_cst == MonotonicConstraint.POS:
assert (node.left_child.value <=
node.right_child.value <=
middle)
if not sibling.is_leaf:
assert (middle <=
sibling.left_child.value <=
sibling.right_child.value)
else: # NEG
assert (node.left_child.value >=
node.right_child.value >=
middle)
if not sibling.is_leaf:
assert (middle >=
sibling.left_child.value >=
sibling.right_child.value)
recursively_check_children_node_values(node.left_child)
recursively_check_children_node_values(node.right_child)
recursively_check_children_node_values(grower.root)
@pytest.mark.parametrize('seed', range(3))
@pytest.mark.parametrize('monotonic_cst', (
MonotonicConstraint.NO_CST,
MonotonicConstraint.POS,
MonotonicConstraint.NEG,
))
def test_nodes_values(monotonic_cst, seed):
# Build a single tree with only one feature, and make sure the nodes
# values respect the monotonic constraints.
# Considering the following tree with a monotonic POS constraint, we
# should have:
#
# root
# / \
# 5 10 # middle = 7.5
# / \ / \
# a b c d
#
# a <= b and c <= d (assert_children_values_monotonic)
# a, b <= middle <= c, d (assert_children_values_bounded)
# a <= b <= c <= d (assert_leaves_values_monotonic)
#
# The last one is a consequence of the others, but can't hurt to check
rng = np.random.RandomState(seed)
n_samples = 1000
n_features = 1
X_binned = rng.randint(0, 255, size=(n_samples, n_features),
dtype=np.uint8)
X_binned = np.asfortranarray(X_binned)
gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
hessians = np.ones(shape=1, dtype=G_H_DTYPE)
grower = TreeGrower(X_binned, gradients, hessians,
monotonic_cst=[monotonic_cst],
shrinkage=.1)
grower.grow()
# grow() will shrink the leaf values at the very end. For our comparison
# tests, we need to revert the shrinkage of the leaves, otherwise we would
# compare the value of a leaf (shrunk) with a node (not shrunk) and the
# test would not be correct.
for leaf in grower.finalized_leaves:
    leaf.value /= grower.shrinkage
# The consistency of the bounds can only be checked on the tree grower
# as the node bounds are not copied into the predictor tree. The
# consistency checks on the values of node children and leaves can be
# done either on the grower tree or on the predictor tree. We only
# do those checks on the predictor tree as the latter is derived from
# the former.
predictor = grower.make_predictor()
assert_children_values_monotonic(predictor, monotonic_cst)
assert_children_values_bounded(grower, monotonic_cst)
assert_leaves_values_monotonic(predictor, monotonic_cst)
@pytest.mark.parametrize('seed', range(3))
def test_predictions(seed):
# Train a model with a POS constraint on the first feature and a NEG
# constraint on the second feature, and make sure the constraints are
# respected by checking the predictions.
# test adapted from lightgbm's test_monotone_constraint(), itself inspired
# by https://xgboost.readthedocs.io/en/latest/tutorials/monotonic.html
rng = np.random.RandomState(seed)
n_samples = 1000
f_0 = rng.rand(n_samples) # positive correlation with y
f_1 = rng.rand(n_samples) # negative correlation with y
X = np.c_[f_0, f_1]
noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
y = (5 * f_0 + np.sin(10 * np.pi * f_0) -
5 * f_1 - np.cos(10 * np.pi * f_1) +
noise)
gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1])
gbdt.fit(X, y)
linspace = np.linspace(0, 1, 100)
sin = np.sin(linspace)
constant = np.full_like(linspace, fill_value=.5)
# We now assert the predictions properly respect the constraints, on each
# feature. When testing for a feature we need to set the other one to a
# constant, because the monotonic constraints are only an "all else being
# equal" type of constraint:
# a constraint on the first feature only means that
# x0 < x0' => f(x0, x1) < f(x0', x1)
# while x1 stays constant.
# The constraint does not guarantee that
# x0 < x0' => f(x0, x1) < f(x0', x1')
# First feature (POS)
# assert pred is all increasing when f_0 is all increasing
X = np.c_[linspace, constant]
pred = gbdt.predict(X)
assert is_increasing(pred)
# assert pred actually follows the variations of f_0
X = np.c_[sin, constant]
pred = gbdt.predict(X)
assert np.all((np.diff(pred) >= 0) == (np.diff(sin) >= 0))
# Second feature (NEG)
# assert pred is all decreasing when f_1 is all increasing
X = np.c_[constant, linspace]
pred = gbdt.predict(X)
assert is_decreasing(pred)
# assert pred actually follows the inverse variations of f_1
X = np.c_[constant, sin]
pred = gbdt.predict(X)
assert ((np.diff(pred) <= 0) == (np.diff(sin) >= 0)).all()
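# A minimal usage sketch of the "all else being equal" property discussed
# above: with monotonic_cst=[1, -1], increasing the first feature while
# keeping the second one fixed must not decrease the prediction. The data and
# constant value used here are arbitrary.
def _monotonic_cst_usage_sketch():
    rng = np.random.RandomState(0)
    X = rng.rand(500, 2)
    y = 3 * X[:, 0] - 2 * X[:, 1] + .01 * rng.normal(size=500)
    gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1]).fit(X, y)
    grid = np.linspace(0, 1, 50)
    preds = gbdt.predict(np.c_[grid, np.full_like(grid, .5)])
    return is_increasing(preds)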
def test_input_error():
X = [[1, 2], [2, 3], [3, 4]]
y = [0, 1, 2]
gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, 0, -1])
with pytest.raises(ValueError,
match='monotonic_cst has shape 3 but the input data'):
gbdt.fit(X, y)
for monotonic_cst in ([1, 3], [1, -3]):
gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
with pytest.raises(ValueError,
match='must be None or an array-like of '
'-1, 0 or 1'):
gbdt.fit(X, y)
gbdt = HistGradientBoostingClassifier(monotonic_cst=[0, 1])
with pytest.raises(
ValueError,
match='monotonic constraints are not supported '
'for multiclass classification'
):
gbdt.fit(X, y)
def test_bounded_value_min_gain_to_split():
# The purpose of this test is to show that when computing the gain at a
# given split, the value of the current node should be properly bounded to
# respect the monotonic constraints, because it strongly interacts with
# min_gain_to_split. We build a simple example where gradients are [1, 1,
# 100, 1, 1] (hessians are all ones). The best split happens on the 3rd
# bin, and depending on whether the value of the node is bounded or not,
# the min_gain_to_split constraint is or isn't satisfied.
l2_regularization = 0
min_hessian_to_split = 0
min_samples_leaf = 1
n_bins = n_samples = 5
X_binned = np.arange(n_samples).reshape(-1, 1).astype(X_BINNED_DTYPE)
sample_indices = np.arange(n_samples, dtype=np.uint32)
all_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
all_gradients = np.array([1, 1, 100, 1, 1], dtype=G_H_DTYPE)
sum_gradients = all_gradients.sum()
sum_hessians = all_hessians.sum()
hessians_are_constant = False
builder = HistogramBuilder(X_binned, n_bins, all_gradients,
all_hessians, hessians_are_constant)
n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1],
dtype=np.uint32)
has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
missing_values_bin_idx = n_bins - 1
children_lower_bound, children_upper_bound = -np.inf, np.inf
min_gain_to_split = 2000
splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx,
has_missing_values, monotonic_cst, l2_regularization,
min_hessian_to_split, min_samples_leaf,
min_gain_to_split, hessians_are_constant)
histograms = builder.compute_histograms_brute(sample_indices)
# Since the gradient array is [1, 1, 100, 1, 1]
# the max possible gain happens on the 3rd bin (or equivalently in the 2nd)
# and is equal to about 1307, which is less than min_gain_to_split = 2000, so
# the node is considered unsplittable (gain = -1)
current_lower_bound, current_upper_bound = -np.inf, np.inf
value = compute_node_value(sum_gradients, sum_hessians,
current_lower_bound, current_upper_bound,
l2_regularization)
# the unbounded value is equal to -sum_gradients / sum_hessians
assert value == pytest.approx(-104 / 5)
split_info = splitter.find_node_split(n_samples, histograms,
sum_gradients, sum_hessians, value,
lower_bound=children_lower_bound,
upper_bound=children_upper_bound)
assert split_info.gain == -1 # min_gain_to_split not respected
# here again the max possible gain is on the 3rd bin but we now cap the
# value of the node into [-10, inf].
# This means the gain is now about 2430 which is more than the
# min_gain_to_split constraint.
current_lower_bound, current_upper_bound = -10, np.inf
value = compute_node_value(sum_gradients, sum_hessians,
current_lower_bound, current_upper_bound,
l2_regularization)
assert value == -10
split_info = splitter.find_node_split(n_samples, histograms,
sum_gradients, sum_hessians, value,
lower_bound=children_lower_bound,
upper_bound=children_upper_bound)
assert split_info.gain > min_gain_to_split
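# A minimal numpy sketch of the node value computation discussed above,
# assuming the usual formula value = -sum_gradients / (sum_hessians +
# l2_regularization), clipped to the monotonic bounds: with gradients
# [1, 1, 100, 1, 1] and unit hessians this gives -104 / 5 = -20.8 unbounded,
# and -10 once the lower bound of -10 is applied.
def _bounded_node_value_sketch(lower_bound=-10, l2_regularization=0.):
    sum_gradients, sum_hessians = 104., 5.
    unbounded_value = -sum_gradients / (sum_hessians + l2_regularization)
    bounded_value = np.clip(unbounded_value, lower_bound, np.inf)
    return unbounded_value, bounded_value  # (-20.8, -10.0)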

View file

@ -0,0 +1,76 @@
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import pytest
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor
from sklearn.ensemble._hist_gradient_boosting.common import (
G_H_DTYPE, PREDICTOR_RECORD_DTYPE, ALMOST_INF)
@pytest.mark.parametrize('n_bins', [200, 256])
def test_regression_dataset(n_bins):
X, y = make_regression(n_samples=500, n_features=10, n_informative=5,
random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=42)
mapper = _BinMapper(n_bins=n_bins, random_state=42)
X_train_binned = mapper.fit_transform(X_train)
# Init gradients and hessians to those of the least squares loss
gradients = -y_train.astype(G_H_DTYPE)
hessians = np.ones(1, dtype=G_H_DTYPE)
min_samples_leaf = 10
max_leaf_nodes = 30
grower = TreeGrower(X_train_binned, gradients, hessians,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes, n_bins=n_bins,
n_bins_non_missing=mapper.n_bins_non_missing_)
grower.grow()
predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)
assert r2_score(y_train, predictor.predict(X_train)) > 0.82
assert r2_score(y_test, predictor.predict(X_test)) > 0.67
@pytest.mark.parametrize('threshold, expected_predictions', [
(-np.inf, [0, 1, 1, 1]),
(10, [0, 0, 1, 1]),
(20, [0, 0, 0, 1]),
(ALMOST_INF, [0, 0, 0, 1]),
(np.inf, [0, 0, 0, 0]),
])
def test_infinite_values_and_thresholds(threshold, expected_predictions):
# Make sure infinite values and infinite thresholds are handled properly.
# In particular, if a value is +inf and the threshold is ALMOST_INF the
# sample should go to the right child. If the threshold is inf (split on
# nan), the +inf sample will go to the left child.
X = np.array([-np.inf, 10, 20, np.inf]).reshape(-1, 1)
nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE)
# We just construct a simple tree with 1 root and 2 children
# parent node
nodes[0]['left'] = 1
nodes[0]['right'] = 2
nodes[0]['feature_idx'] = 0
nodes[0]['threshold'] = threshold
# left child
nodes[1]['is_leaf'] = True
nodes[1]['value'] = 0
# right child
nodes[2]['is_leaf'] = True
nodes[2]['value'] = 1
predictor = TreePredictor(nodes)
predictions = predictor.predict(X)
assert np.all(predictions == expected_predictions)

View file

@ -0,0 +1,480 @@
import numpy as np
import pytest
from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint
from sklearn.ensemble._hist_gradient_boosting.splitting import (
Splitter,
compute_node_value
)
from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder
from sklearn.utils._testing import skip_if_32bit
@pytest.mark.parametrize('n_bins', [3, 32, 256])
def test_histogram_split(n_bins):
rng = np.random.RandomState(42)
feature_idx = 0
l2_regularization = 0
min_hessian_to_split = 1e-3
min_samples_leaf = 1
min_gain_to_split = 0.
X_binned = np.asfortranarray(
rng.randint(0, n_bins - 1, size=(int(1e4), 1)), dtype=X_BINNED_DTYPE)
binned_feature = X_binned.T[feature_idx]
sample_indices = np.arange(binned_feature.shape[0], dtype=np.uint32)
ordered_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE)
all_hessians = ordered_hessians
sum_hessians = all_hessians.sum()
hessians_are_constant = False
for true_bin in range(1, n_bins - 2):
for sign in [-1, 1]:
ordered_gradients = np.full_like(binned_feature, sign,
dtype=G_H_DTYPE)
ordered_gradients[binned_feature <= true_bin] *= -1
all_gradients = ordered_gradients
sum_gradients = all_gradients.sum()
builder = HistogramBuilder(X_binned,
n_bins,
all_gradients,
all_hessians,
hessians_are_constant)
n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1],
dtype=np.uint32)
has_missing_values = np.array([False] * X_binned.shape[1],
dtype=np.uint8)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
missing_values_bin_idx = n_bins - 1
splitter = Splitter(X_binned,
n_bins_non_missing,
missing_values_bin_idx,
has_missing_values,
monotonic_cst,
l2_regularization,
min_hessian_to_split,
min_samples_leaf, min_gain_to_split,
hessians_are_constant)
histograms = builder.compute_histograms_brute(sample_indices)
value = compute_node_value(sum_gradients, sum_hessians,
-np.inf, np.inf, l2_regularization)
split_info = splitter.find_node_split(
sample_indices.shape[0], histograms, sum_gradients,
sum_hessians, value)
assert split_info.bin_idx == true_bin
assert split_info.gain >= 0
assert split_info.feature_idx == feature_idx
assert (split_info.n_samples_left + split_info.n_samples_right
== sample_indices.shape[0])
# All hessians are equal to 1, so the hessian sum on each side equals
# the number of samples on that side.
assert split_info.n_samples_left == split_info.sum_hessian_left
@skip_if_32bit
@pytest.mark.parametrize('constant_hessian', [True, False])
def test_gradient_and_hessian_sanity(constant_hessian):
# This test checks that the values of gradients and hessians are
# consistent in different places:
# - in split_info: si.sum_gradient_left + si.sum_gradient_right must be
# equal to the gradient at the node. Same for hessians.
# - in the histograms: summing 'sum_gradients' over the bins must be
# constant across all features, and those sums must be equal to the
# node's gradient. Same for hessians.
rng = np.random.RandomState(42)
n_bins = 10
n_features = 20
n_samples = 500
l2_regularization = 0.
min_hessian_to_split = 1e-3
min_samples_leaf = 1
min_gain_to_split = 0.
X_binned = rng.randint(0, n_bins, size=(n_samples, n_features),
dtype=X_BINNED_DTYPE)
X_binned = np.asfortranarray(X_binned)
sample_indices = np.arange(n_samples, dtype=np.uint32)
all_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
sum_gradients = all_gradients.sum()
if constant_hessian:
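# When hessians are constant, only a single value is stored and the
# per-node sums are obtained as n_samples * that value.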
all_hessians = np.ones(1, dtype=G_H_DTYPE)
sum_hessians = 1 * n_samples
else:
all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
sum_hessians = all_hessians.sum()
builder = HistogramBuilder(X_binned, n_bins, all_gradients,
all_hessians, constant_hessian)
n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1],
dtype=np.uint32)
has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
missing_values_bin_idx = n_bins - 1
splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx,
has_missing_values, monotonic_cst, l2_regularization,
min_hessian_to_split, min_samples_leaf,
min_gain_to_split, constant_hessian)
hists_parent = builder.compute_histograms_brute(sample_indices)
value_parent = compute_node_value(sum_gradients, sum_hessians,
-np.inf, np.inf, l2_regularization)
si_parent = splitter.find_node_split(n_samples, hists_parent,
sum_gradients, sum_hessians,
value_parent)
sample_indices_left, sample_indices_right, _ = splitter.split_indices(
si_parent, sample_indices)
hists_left = builder.compute_histograms_brute(sample_indices_left)
value_left = compute_node_value(si_parent.sum_gradient_left,
si_parent.sum_hessian_left,
-np.inf, np.inf, l2_regularization)
hists_right = builder.compute_histograms_brute(sample_indices_right)
value_right = compute_node_value(si_parent.sum_gradient_right,
si_parent.sum_hessian_right,
-np.inf, np.inf, l2_regularization)
si_left = splitter.find_node_split(n_samples, hists_left,
si_parent.sum_gradient_left,
si_parent.sum_hessian_left,
value_left)
si_right = splitter.find_node_split(n_samples, hists_right,
si_parent.sum_gradient_right,
si_parent.sum_hessian_right,
value_right)
# make sure that si.sum_gradient_left + si.sum_gradient_right have their
# expected value, same for hessians
for si, indices in (
(si_parent, sample_indices),
(si_left, sample_indices_left),
(si_right, sample_indices_right)):
gradient = si.sum_gradient_right + si.sum_gradient_left
expected_gradient = all_gradients[indices].sum()
hessian = si.sum_hessian_right + si.sum_hessian_left
if constant_hessian:
expected_hessian = indices.shape[0] * all_hessians[0]
else:
expected_hessian = all_hessians[indices].sum()
assert np.isclose(gradient, expected_gradient)
assert np.isclose(hessian, expected_hessian)
# make sure sum of gradients in histograms are the same for all features,
# and make sure they're equal to their expected value
hists_parent = np.asarray(hists_parent, dtype=HISTOGRAM_DTYPE)
hists_left = np.asarray(hists_left, dtype=HISTOGRAM_DTYPE)
hists_right = np.asarray(hists_right, dtype=HISTOGRAM_DTYPE)
for hists, indices in (
(hists_parent, sample_indices),
(hists_left, sample_indices_left),
(hists_right, sample_indices_right)):
# note: gradients and hessians have shape (n_features,),
# we're comparing them to *scalars*. This has the benefit of also
# making sure that all the entries are equal across features.
gradients = hists['sum_gradients'].sum(axis=1) # shape = (n_features,)
expected_gradient = all_gradients[indices].sum() # scalar
hessians = hists['sum_hessians'].sum(axis=1)
if constant_hessian:
# with constant hessians, hessian sums are not computed in the
# histograms, so the stored value is 0 rather than the actual hessian
expected_hessian = 0.
else:
expected_hessian = all_hessians[indices].sum()
assert np.allclose(gradients, expected_gradient)
assert np.allclose(hessians, expected_hessian)
def test_split_indices():
# Check that split_indices returns the correct splits and that
# splitter.partition is consistent with what is returned.
rng = np.random.RandomState(421)
n_bins = 5
n_samples = 10
l2_regularization = 0.
min_hessian_to_split = 1e-3
min_samples_leaf = 1
min_gain_to_split = 0.
# split will happen on feature 1 and on bin 3
X_binned = [[0, 0],
[0, 3],
[0, 4],
[0, 0],
[0, 0],
[0, 0],
[0, 0],
[0, 4],
[0, 0],
[0, 4]]
X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE)
sample_indices = np.arange(n_samples, dtype=np.uint32)
all_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
all_hessians = np.ones(1, dtype=G_H_DTYPE)
sum_gradients = all_gradients.sum()
sum_hessians = 1 * n_samples
hessians_are_constant = True
builder = HistogramBuilder(X_binned, n_bins,
all_gradients, all_hessians,
hessians_are_constant)
n_bins_non_missing = np.array([n_bins] * X_binned.shape[1],
dtype=np.uint32)
has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
missing_values_bin_idx = n_bins - 1
splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx,
has_missing_values, monotonic_cst, l2_regularization,
min_hessian_to_split, min_samples_leaf,
min_gain_to_split, hessians_are_constant)
assert np.all(sample_indices == splitter.partition)
histograms = builder.compute_histograms_brute(sample_indices)
value = compute_node_value(sum_gradients, sum_hessians,
-np.inf, np.inf, l2_regularization)
si_root = splitter.find_node_split(n_samples, histograms,
sum_gradients, sum_hessians, value)
# sanity checks for best split
assert si_root.feature_idx == 1
assert si_root.bin_idx == 3
samples_left, samples_right, position_right = splitter.split_indices(
si_root, splitter.partition)
assert set(samples_left) == set([0, 1, 3, 4, 5, 6, 8])
assert set(samples_right) == set([2, 7, 9])
assert list(samples_left) == list(splitter.partition[:position_right])
assert list(samples_right) == list(splitter.partition[position_right:])
# Check that the resulting split indices sizes are consistent with the
# count statistics anticipated when looking for the best split.
assert samples_left.shape[0] == si_root.n_samples_left
assert samples_right.shape[0] == si_root.n_samples_right
def test_min_gain_to_split():
# Try to split a pure node (all gradients are equal, same for hessians)
# with min_gain_to_split = 0 and make sure that the node is not split (best
# possible gain = -1). Note: before the strict inequality comparison, this
# test would fail because the node would be split with a gain of 0.
rng = np.random.RandomState(42)
l2_regularization = 0
min_hessian_to_split = 0
min_samples_leaf = 1
min_gain_to_split = 0.
n_bins = 255
n_samples = 100
X_binned = np.asfortranarray(
rng.randint(0, n_bins, size=(n_samples, 1)), dtype=X_BINNED_DTYPE)
binned_feature = X_binned[:, 0]
sample_indices = np.arange(n_samples, dtype=np.uint32)
all_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE)
all_gradients = np.ones_like(binned_feature, dtype=G_H_DTYPE)
sum_gradients = all_gradients.sum()
sum_hessians = all_hessians.sum()
hessians_are_constant = False
builder = HistogramBuilder(X_binned, n_bins, all_gradients,
all_hessians, hessians_are_constant)
n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1],
dtype=np.uint32)
has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
missing_values_bin_idx = n_bins - 1
splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx,
has_missing_values, monotonic_cst, l2_regularization,
min_hessian_to_split, min_samples_leaf,
min_gain_to_split, hessians_are_constant)
histograms = builder.compute_histograms_brute(sample_indices)
value = compute_node_value(sum_gradients, sum_hessians,
-np.inf, np.inf, l2_regularization)
split_info = splitter.find_node_split(n_samples, histograms,
sum_gradients, sum_hessians, value)
assert split_info.gain == -1
@pytest.mark.parametrize(
'X_binned, all_gradients, has_missing_values, n_bins_non_missing, '
' expected_split_on_nan, expected_bin_idx, expected_go_to_left', [
# basic sanity check with no missing values: given the gradient
# values, the split must occur on bin_idx=3
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], # X_binned
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5], # gradients
False, # no missing values
10, # n_bins_non_missing
False, # don't split on nans
3, # expected_bin_idx
'not_applicable'),
# We replace 2 samples by NaNs (bin_idx=8)
# These 2 samples were mapped to the left node before, so they should
# be mapped to left node again
# Notice how the bin_idx threshold changes from 3 to 1.
([8, 0, 1, 8, 2, 3, 4, 5, 6, 7], # 8 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
8, # n_bins_non_missing
False, # don't split on nans
1, # cut on bin_idx=1
True), # missing values go to left
# same as above, but with non-consecutive missing_values_bin
([9, 0, 1, 9, 2, 3, 4, 5, 6, 7], # 9 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
8, # n_bins_non_missing
False, # don't split on nans
1, # cut on bin_idx=1
True), # missing values go to left
# this time replacing 2 samples that were on the right.
([0, 1, 2, 3, 8, 4, 8, 5, 6, 7], # 8 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
8, # n_bins_non_missing
False, # don't split on nans
3, # cut on bin_idx=3 (like in first case)
False), # missing values go to right
# same as above, but with non-consecutive missing_values_bin
([0, 1, 2, 3, 9, 4, 9, 5, 6, 7], # 9 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
8, # n_bins_non_missing
False, # don't split on nans
3, # cut on bin_idx=3 (like in first case)
False), # missing values go to right
# For the following cases, split_on_nans is True (we replace all of
# the samples with nans, instead of just 2).
([0, 1, 2, 3, 4, 4, 4, 4, 4, 4], # 4 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
4, # n_bins_non_missing
True, # split on nans
3, # cut on bin_idx=3
False), # missing values go to right
# same as above, but with non-consecutive missing_values_bin
([0, 1, 2, 3, 9, 9, 9, 9, 9, 9], # 9 <=> missing
[1, 1, 1, 1, 1, 1, 5, 5, 5, 5],
True, # missing values
4, # n_bins_non_missing
True, # split on nans
3, # cut on bin_idx=3
False), # missing values go to right
([6, 6, 6, 6, 0, 1, 2, 3, 4, 5], # 6 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
6, # n_bins_non_missing
True, # split on nans
5, # cut on bin_idx=5
False), # missing values go to right
# same as above, but with non-consecutive missing_values_bin
([9, 9, 9, 9, 0, 1, 2, 3, 4, 5], # 9 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
6, # n_bins_non_missing
True, # split on nans
5, # cut on bin_idx=5
False), # missing values go to right
]
)
def test_splitting_missing_values(X_binned, all_gradients,
has_missing_values, n_bins_non_missing,
expected_split_on_nan, expected_bin_idx,
expected_go_to_left):
# Make sure missing values are properly supported.
# we build an artificial example with gradients such that the best split
# is on bin_idx=3, when there are no missing values.
# Then we introduce missing values and:
# - make sure the chosen bin is correct (find_best_bin()): it's
# still the same split, even though the index of the bin may change
# - make sure the missing values are mapped to the correct child
# (split_indices())
n_bins = max(X_binned) + 1
n_samples = len(X_binned)
l2_regularization = 0.
min_hessian_to_split = 1e-3
min_samples_leaf = 1
min_gain_to_split = 0.
sample_indices = np.arange(n_samples, dtype=np.uint32)
X_binned = np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1)
X_binned = np.asfortranarray(X_binned)
all_gradients = np.array(all_gradients, dtype=G_H_DTYPE)
has_missing_values = np.array([has_missing_values], dtype=np.uint8)
all_hessians = np.ones(1, dtype=G_H_DTYPE)
sum_gradients = all_gradients.sum()
sum_hessians = 1 * n_samples
hessians_are_constant = True
builder = HistogramBuilder(X_binned, n_bins,
all_gradients, all_hessians,
hessians_are_constant)
n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
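# The splitter assumes that missing values are always mapped to the last
# bin (n_bins - 1); in the "non-consecutive" cases above there is a gap
# between the last non-missing bin and this missing values bin.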
missing_values_bin_idx = n_bins - 1
splitter = Splitter(X_binned, n_bins_non_missing,
missing_values_bin_idx, has_missing_values,
monotonic_cst,
l2_regularization, min_hessian_to_split,
min_samples_leaf, min_gain_to_split,
hessians_are_constant)
histograms = builder.compute_histograms_brute(sample_indices)
value = compute_node_value(sum_gradients, sum_hessians,
-np.inf, np.inf, l2_regularization)
split_info = splitter.find_node_split(n_samples, histograms,
sum_gradients, sum_hessians, value)
assert split_info.bin_idx == expected_bin_idx
if has_missing_values:
assert split_info.missing_go_to_left == expected_go_to_left
split_on_nan = split_info.bin_idx == n_bins_non_missing[0] - 1
assert split_on_nan == expected_split_on_nan
# Make sure the split is properly computed.
# This also make sure missing values are properly assigned to the correct
# child in split_indices()
samples_left, samples_right, _ = splitter.split_indices(
split_info, splitter.partition)
if not expected_split_on_nan:
# When we don't split on nans, the split should always be the same.
assert set(samples_left) == set([0, 1, 2, 3])
assert set(samples_right) == set([4, 5, 6, 7, 8, 9])
else:
# When we split on nans, samples with missing values are always mapped
# to the right child.
missing_samples_indices = np.flatnonzero(
np.array(X_binned) == missing_values_bin_idx)
non_missing_samples_indices = np.flatnonzero(
np.array(X_binned) != missing_values_bin_idx)
assert set(samples_right) == set(missing_samples_indices)
assert set(samples_left) == set(non_missing_samples_indices)

View file

@ -0,0 +1,206 @@
import numpy as np
from numpy.testing import assert_array_equal
from numpy.testing import assert_allclose
import pytest
from sklearn.base import clone
from sklearn.datasets import make_classification, make_regression
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import check_scoring
X_classification, y_classification = make_classification(random_state=0)
X_regression, y_regression = make_regression(random_state=0)
def _assert_predictor_equal(gb_1, gb_2, X):
"""Assert that two HistGBM instances are identical."""
# Check identical nodes for each tree
for (pred_ith_1, pred_ith_2) in zip(gb_1._predictors, gb_2._predictors):
for (predictor_1, predictor_2) in zip(pred_ith_1, pred_ith_2):
assert_array_equal(predictor_1.nodes, predictor_2.nodes)
# Check identical predictions
assert_allclose(gb_1.predict(X), gb_2.predict(X))
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_max_iter_with_warm_start_validation(GradientBoosting, X, y):
# Check that a ValueError is raised when the maximum number of iterations
# is smaller than the number of iterations from the previous fit when warm
# start is True.
estimator = GradientBoosting(max_iter=10, early_stopping=False,
warm_start=True)
estimator.fit(X, y)
estimator.set_params(max_iter=5)
err_msg = ('max_iter=5 must be larger than or equal to n_iter_=10 '
'when warm_start==True')
with pytest.raises(ValueError, match=err_msg):
estimator.fit(X, y)
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_yields_identical_results(GradientBoosting, X, y):
# Make sure that fitting 50 iterations and then 25 with warm start is
# equivalent to fitting 75 iterations.
rng = 42
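# n_iter_no_change is set above max_iter, presumably so that early stopping
# can never cut the fits short: both estimators run for exactly max_iter
# iterations.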
gb_warm_start = GradientBoosting(
n_iter_no_change=100, max_iter=50, random_state=rng, warm_start=True
)
gb_warm_start.fit(X, y).set_params(max_iter=75).fit(X, y)
gb_no_warm_start = GradientBoosting(
n_iter_no_change=100, max_iter=75, random_state=rng, warm_start=False
)
gb_no_warm_start.fit(X, y)
# Check that both predictors are equal
_assert_predictor_equal(gb_warm_start, gb_no_warm_start, X)
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_max_depth(GradientBoosting, X, y):
# Test that it is possible to fit trees of different depths in the ensemble.
gb = GradientBoosting(max_iter=20, min_samples_leaf=1,
warm_start=True, max_depth=2, early_stopping=False)
gb.fit(X, y)
gb.set_params(max_iter=30, max_depth=3, n_iter_no_change=110)
gb.fit(X, y)
# First 20 trees have max_depth == 2
for i in range(20):
assert gb._predictors[i][0].get_max_depth() == 2
# Last 10 trees have max_depth == 3
for i in range(1, 11):
assert gb._predictors[-i][0].get_max_depth() == 3
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
@pytest.mark.parametrize('scoring', (None, 'loss'))
def test_warm_start_early_stopping(GradientBoosting, X, y, scoring):
# Make sure that early stopping occurs after a small number of iterations
# when fitting a second time with warm starting.
n_iter_no_change = 5
gb = GradientBoosting(
n_iter_no_change=n_iter_no_change, max_iter=10000, early_stopping=True,
random_state=42, warm_start=True, tol=1e-3, scoring=scoring,
)
gb.fit(X, y)
n_iter_first_fit = gb.n_iter_
gb.fit(X, y)
n_iter_second_fit = gb.n_iter_
assert 0 < n_iter_second_fit - n_iter_first_fit < n_iter_no_change
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_equal_n_estimators(GradientBoosting, X, y):
# Test if warm start with equal n_estimators does nothing
gb_1 = GradientBoosting(max_depth=2, early_stopping=False)
gb_1.fit(X, y)
gb_2 = clone(gb_1)
gb_2.set_params(max_iter=gb_1.max_iter, warm_start=True,
n_iter_no_change=5)
gb_2.fit(X, y)
# Check that both predictors are equal
_assert_predictor_equal(gb_1, gb_2, X)
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_clear(GradientBoosting, X, y):
# Test if fit clears state.
gb_1 = GradientBoosting(n_iter_no_change=5, random_state=42)
gb_1.fit(X, y)
gb_2 = GradientBoosting(n_iter_no_change=5, random_state=42,
warm_start=True)
gb_2.fit(X, y) # inits state
gb_2.set_params(warm_start=False)
gb_2.fit(X, y) # clears old state and equals est
# Check that both predictors have the same train_score_ and
# validation_score_ attributes
assert_allclose(gb_1.train_score_, gb_2.train_score_)
assert_allclose(gb_1.validation_score_, gb_2.validation_score_)
# Check that both predictors are equal
_assert_predictor_equal(gb_1, gb_2, X)
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
@pytest.mark.parametrize('rng_type', ('none', 'int', 'instance'))
def test_random_seeds_warm_start(GradientBoosting, X, y, rng_type):
# Make sure the seeds for train/val split and small trainset subsampling
# are correctly set in a warm start context.
def _get_rng(rng_type):
# Helper to avoid consuming rngs
if rng_type == 'none':
return None
elif rng_type == 'int':
return 42
else:
return np.random.RandomState(0)
random_state = _get_rng(rng_type)
gb_1 = GradientBoosting(early_stopping=True, max_iter=2,
random_state=random_state)
gb_1.set_params(scoring=check_scoring(gb_1))
gb_1.fit(X, y)
random_seed_1_1 = gb_1._random_seed
gb_1.fit(X, y)
random_seed_1_2 = gb_1._random_seed # clear the old state, different seed
random_state = _get_rng(rng_type)
gb_2 = GradientBoosting(early_stopping=True, max_iter=2,
random_state=random_state, warm_start=True)
gb_2.set_params(scoring=check_scoring(gb_2))
gb_2.fit(X, y) # inits state
random_seed_2_1 = gb_2._random_seed
gb_2.fit(X, y) # clears old state and equals est
random_seed_2_2 = gb_2._random_seed
# Without warm starting, the seeds should be
# * all different if random state is None
# * all equal if random state is an integer
# * different when refitting and equal with a new estimator (because
# the random state is mutated)
if rng_type == 'none':
assert random_seed_1_1 != random_seed_1_2 != random_seed_2_1
elif rng_type == 'int':
assert random_seed_1_1 == random_seed_1_2 == random_seed_2_1
else:
assert random_seed_1_1 == random_seed_2_1 != random_seed_1_2
# With warm starting, the seeds must be equal
assert random_seed_2_1 == random_seed_2_2

View file

@ -0,0 +1,513 @@
# Authors: Nicolas Goix <nicolas.goix@telecom-paristech.fr>
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
# License: BSD 3 clause
import numbers
import numpy as np
from scipy.sparse import issparse
from warnings import warn
from ..tree import ExtraTreeRegressor
from ..utils import (
check_random_state,
check_array,
gen_batches,
get_chunk_n_rows,
)
from ..utils.fixes import _joblib_parallel_args
from ..utils.validation import check_is_fitted, _num_samples
from ..utils.validation import _deprecate_positional_args
from ..base import OutlierMixin
from ._bagging import BaseBagging
__all__ = ["IsolationForest"]
class IsolationForest(OutlierMixin, BaseBagging):
"""
Isolation Forest Algorithm.
Return the anomaly score of each sample using the IsolationForest algorithm.
The IsolationForest 'isolates' observations by randomly selecting a feature
and then randomly selecting a split value between the maximum and minimum
values of the selected feature.
Since recursive partitioning can be represented by a tree structure, the
number of splittings required to isolate a sample is equivalent to the path
length from the root node to the terminating node.
This path length, averaged over a forest of such random trees, is a
measure of normality and our decision function.
Random partitioning produces noticeably shorter paths for anomalies.
Hence, when a forest of random trees collectively produces shorter path
lengths for particular samples, these samples are highly likely to be anomalies.
Read more in the :ref:`User Guide <isolation_forest>`.
.. versionadded:: 0.18
Parameters
----------
n_estimators : int, default=100
The number of base estimators in the ensemble.
max_samples : "auto", int or float, default="auto"
The number of samples to draw from X to train each base estimator.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples.
- If "auto", then `max_samples=min(256, n_samples)`.
If max_samples is larger than the number of samples provided,
all samples will be used for all trees (no sampling).
contamination : 'auto' or float, default='auto'
The amount of contamination of the data set, i.e. the proportion
of outliers in the data set. Used when fitting to define the threshold
on the scores of the samples.
- If 'auto', the threshold is determined as in the
original paper.
- If float, the contamination should be in the range [0, 0.5].
.. versionchanged:: 0.22
The default value of ``contamination`` changed from 0.1
to ``'auto'``.
max_features : int or float, default=1.0
The number of features to draw from X to train each base estimator.
- If int, then draw `max_features` features.
- If float, then draw `max_features * X.shape[1]` features.
bootstrap : bool, default=False
If True, individual trees are fit on random subsets of the training
data sampled with replacement. If False, sampling without replacement
is performed.
n_jobs : int, default=None
The number of jobs to run in parallel for both :meth:`fit` and
:meth:`predict`. ``None`` means 1 unless in a
:obj:`joblib.parallel_backend` context. ``-1`` means using all
processors. See :term:`Glossary <n_jobs>` for more details.
behaviour : str, default='deprecated'
This parameter has no effect, is deprecated, and will be removed.
.. versionadded:: 0.20
``behaviour`` is added in 0.20 for back-compatibility purposes.
.. deprecated:: 0.20
``behaviour='old'`` is deprecated in 0.20 and will not be possible
in 0.22.
.. deprecated:: 0.22
``behaviour`` parameter is deprecated in 0.22 and removed in
0.24.
random_state : int or RandomState, default=None
Controls the pseudo-randomness of the selection of the feature
and split values for each branching step and each tree in the forest.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
verbose : int, default=0
Controls the verbosity of the tree building process.
warm_start : bool, default=False
When set to ``True``, reuse the solution of the previous call to fit
and add more estimators to the ensemble, otherwise, just fit a whole
new forest. See :term:`the Glossary <warm_start>`.
.. versionadded:: 0.21
Attributes
----------
estimators_ : list of ExtraTreeRegressor instances
The collection of fitted sub-estimators.
estimators_samples_ : list of arrays
The subset of drawn samples (i.e., the in-bag samples) for each base
estimator.
max_samples_ : int
The actual number of samples drawn to train each base estimator.
offset_ : float
Offset used to define the decision function from the raw scores. We
have the relation: ``decision_function = score_samples - offset_``.
``offset_`` is defined as follows. When the contamination parameter is
set to "auto", the offset is equal to -0.5 as the scores of inliers are
close to 0 and the scores of outliers are close to -1. When a
contamination parameter different than "auto" is provided, the offset
is defined in such a way that we obtain the expected number of outliers
(samples with decision function < 0) in training.
.. versionadded:: 0.20
estimators_features_ : list of arrays
The subset of drawn features for each base estimator.
Notes
-----
The implementation is based on an ensemble of ExtraTreeRegressor. The
maximum depth of each tree is set to ``ceil(log_2(n))`` where
:math:`n` is the number of samples used to build the tree
(see (Liu et al., 2008) for more details).
References
----------
.. [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest."
Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on.
.. [2] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation-based
anomaly detection." ACM Transactions on Knowledge Discovery from
Data (TKDD) 6.1 (2012): 3.
See Also
--------
sklearn.covariance.EllipticEnvelope : An object for detecting outliers in a
Gaussian distributed dataset.
sklearn.svm.OneClassSVM : Unsupervised Outlier Detection.
Estimate the support of a high-dimensional distribution.
The implementation is based on libsvm.
sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection
using Local Outlier Factor (LOF).
Examples
--------
>>> from sklearn.ensemble import IsolationForest
>>> X = [[-1.1], [0.3], [0.5], [100]]
>>> clf = IsolationForest(random_state=0).fit(X)
>>> clf.predict([[0.1], [0], [90]])
array([ 1, 1, -1])
"""
@_deprecate_positional_args
def __init__(self, *,
n_estimators=100,
max_samples="auto",
contamination="auto",
max_features=1.,
bootstrap=False,
n_jobs=None,
behaviour='deprecated',
random_state=None,
verbose=0,
warm_start=False):
super().__init__(
base_estimator=ExtraTreeRegressor(
max_features=1,
splitter='random',
random_state=random_state),
# note: the max_features above (for the tree) is unrelated to
# self.max_features
bootstrap=bootstrap,
bootstrap_features=False,
n_estimators=n_estimators,
max_samples=max_samples,
max_features=max_features,
warm_start=warm_start,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose)
self.behaviour = behaviour
self.contamination = contamination
def _set_oob_score(self, X, y):
raise NotImplementedError("OOB score not supported by iforest")
def _parallel_args(self):
# ExtraTreeRegressor releases the GIL, so it's more efficient to use
# a thread-based backend rather than a process-based backend so as
# to avoid suffering from communication overhead and extra memory
# copies.
return _joblib_parallel_args(prefer='threads')
def fit(self, X, y=None, sample_weight=None):
"""
Fit estimator.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples. Use ``dtype=np.float32`` for maximum
efficiency. Sparse matrices are also supported, use sparse
``csc_matrix`` for maximum efficiency.
y : Ignored
Not used, present for API consistency by convention.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted.
Returns
-------
self : object
Fitted estimator.
"""
if self.behaviour != 'deprecated':
if self.behaviour == 'new':
warn(
"'behaviour' is deprecated in 0.22 and will be removed "
"in 0.24. You should not pass or set this parameter.",
FutureWarning
)
else:
raise NotImplementedError(
"The old behaviour of IsolationForest is not implemented "
"anymore. Remove the 'behaviour' parameter."
)
X = check_array(X, accept_sparse=['csc'])
if issparse(X):
# Pre-sort indices to avoid that each individual tree of the
# ensemble sorts the indices.
X.sort_indices()
rnd = check_random_state(self.random_state)
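# ExtraTreeRegressor needs a target to fit; since the splits are drawn
# completely at random, a random uniform target is sufficient here.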
y = rnd.uniform(size=X.shape[0])
# ensure that max_samples is in [1, n_samples]:
n_samples = X.shape[0]
if isinstance(self.max_samples, str):
if self.max_samples == 'auto':
max_samples = min(256, n_samples)
else:
raise ValueError('max_samples (%s) is not supported. '
'Valid choices are: "auto", int or '
'float' % self.max_samples)
elif isinstance(self.max_samples, numbers.Integral):
if self.max_samples > n_samples:
warn("max_samples (%s) is greater than the "
"total number of samples (%s). max_samples "
"will be set to n_samples for estimation."
% (self.max_samples, n_samples))
max_samples = n_samples
else:
max_samples = self.max_samples
else: # float
if not 0. < self.max_samples <= 1.:
raise ValueError("max_samples must be in (0, 1], got %r"
% self.max_samples)
max_samples = int(self.max_samples * X.shape[0])
self.max_samples_ = max_samples
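# Height limit from Liu et al. (2008): trees are grown to a depth of at
# most ceil(log2(max_samples)), as described in the Notes section of the
# class docstring.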
max_depth = int(np.ceil(np.log2(max(max_samples, 2))))
super()._fit(X, y, max_samples,
max_depth=max_depth,
sample_weight=sample_weight)
if self.contamination == "auto":
# 0.5 plays a special role as described in the original paper.
# we take the opposite as we consider the opposite of their score.
self.offset_ = -0.5
return self
# else, define offset_ wrt contamination parameter
self.offset_ = np.percentile(self.score_samples(X),
100. * self.contamination)
return self
def predict(self, X):
"""
Predict if a particular sample is an outlier or not.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples. Internally, it will be converted to
``dtype=np.float32`` and if a sparse matrix is provided
to a sparse ``csr_matrix``.
Returns
-------
is_inlier : ndarray of shape (n_samples,)
For each observation, tells whether or not (+1 or -1) it should
be considered as an inlier according to the fitted model.
"""
check_is_fitted(self)
X = check_array(X, accept_sparse='csr')
is_inlier = np.ones(X.shape[0], dtype=int)
is_inlier[self.decision_function(X) < 0] = -1
return is_inlier
def decision_function(self, X):
"""
Average anomaly score of X of the base classifiers.
The anomaly score of an input sample is computed as
the mean anomaly score of the trees in the forest.
The measure of normality of an observation given a tree is the depth
of the leaf containing this observation, which is equivalent to
the number of splittings required to isolate this point. If there are
several observations (n_left) in the leaf, the average path length of
an isolation tree built on n_left samples is added.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples. Internally, it will be converted to
``dtype=np.float32`` and if a sparse matrix is provided
to a sparse ``csr_matrix``.
Returns
-------
scores : ndarray of shape (n_samples,)
The anomaly score of the input samples.
The lower, the more abnormal. Negative scores represent outliers,
positive scores represent inliers.
"""
# We subtract self.offset_ to make 0 be the threshold value for being
# an outlier:
return self.score_samples(X) - self.offset_
def score_samples(self, X):
"""
Opposite of the anomaly score defined in the original paper.
The anomaly score of an input sample is computed as
the mean anomaly score of the trees in the forest.
The measure of normality of an observation given a tree is the depth
of the leaf containing this observation, which is equivalent to
the number of splittings required to isolate this point. If there are
several observations (n_left) in the leaf, the average path length of
an isolation tree built on n_left samples is added.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples.
Returns
-------
scores : ndarray of shape (n_samples,)
The anomaly score of the input samples.
The lower, the more abnormal.
"""
# code structure from ForestClassifier/predict_proba
check_is_fitted(self)
# Check data
X = check_array(X, accept_sparse='csr')
if self.n_features_ != X.shape[1]:
raise ValueError("Number of features of the model must "
"match the input. Model n_features is {0} and "
"input n_features is {1}."
"".format(self.n_features_, X.shape[1]))
# Take the opposite of the scores as bigger is better (here less
# abnormal)
return -self._compute_chunked_score_samples(X)
def _compute_chunked_score_samples(self, X):
n_samples = _num_samples(X)
if self._max_features == X.shape[1]:
subsample_features = False
else:
subsample_features = True
# We get as many rows as possible within our working_memory budget
# (defined by sklearn.get_config()['working_memory']) to store
# self._max_features in each row during computation.
#
# Note:
# - this will get at least 1 row, even if 1 row of score will
# exceed working_memory.
# - this only accounts for temporary memory usage while loading
# the data needed to compute the scores -- the returned scores
# themselves are 1D.
chunk_n_rows = get_chunk_n_rows(row_bytes=16 * self._max_features,
max_n_rows=n_samples)
slices = gen_batches(n_samples, chunk_n_rows)
scores = np.zeros(n_samples, order="f")
for sl in slices:
# compute score on the slices of test samples:
scores[sl] = self._compute_score_samples(X[sl], subsample_features)
return scores
def _compute_score_samples(self, X, subsample_features):
"""
Compute the score of each sample in X going through the extra trees.
Parameters
----------
X : array-like or sparse matrix
Data matrix.
subsample_features : bool
Whether features should be subsampled.
"""
n_samples = X.shape[0]
depths = np.zeros(n_samples, order="f")
for tree, features in zip(self.estimators_, self.estimators_features_):
X_subset = X[:, features] if subsample_features else X
leaves_index = tree.apply(X_subset)
node_indicator = tree.decision_path(X_subset)
n_samples_leaf = tree.tree_.n_node_samples[leaves_index]
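# The path length of a sample is the number of edges from the root to
# its leaf (node_indicator counts nodes, hence the -1.0), plus an
# estimate of the depth of the subtree that would have been grown below
# a leaf containing n_samples_leaf training samples.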
depths += (
np.ravel(node_indicator.sum(axis=1))
+ _average_path_length(n_samples_leaf)
- 1.0
)
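# Anomaly score from Liu et al. (2008): s(x, psi) = 2 ** (-E[h(x)] / c(psi)),
# where E[h(x)] is the mean path length of x over the trees and c(psi) is
# the average path length of an unsuccessful search in a tree built on
# max_samples_ samples. Scores close to 1 flag anomalies; scores well
# below 0.5 flag normal samples.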
scores = 2 ** (
-depths
/ (len(self.estimators_)
* _average_path_length([self.max_samples_]))
)
return scores
def _average_path_length(n_samples_leaf):
"""
The average path length in an n_samples iTree, which is equal to
the average path length of an unsuccessful BST search since the
latter has the same structure as an isolation tree.
Parameters
----------
n_samples_leaf : array-like of shape (n_samples,)
The number of training samples in each test sample leaf, for
each estimator.
Returns
-------
average_path_length : ndarray of shape (n_samples,)
"""
n_samples_leaf = check_array(n_samples_leaf, ensure_2d=False)
n_samples_leaf_shape = n_samples_leaf.shape
n_samples_leaf = n_samples_leaf.reshape((1, -1))
average_path_length = np.zeros(n_samples_leaf.shape)
mask_1 = n_samples_leaf <= 1
mask_2 = n_samples_leaf == 2
not_mask = ~np.logical_or(mask_1, mask_2)
average_path_length[mask_1] = 0.
average_path_length[mask_2] = 1.
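# For n > 2, c(n) = 2 * H(n - 1) - 2 * (n - 1) / n, with
# H(i) ~= ln(i) + Euler's constant (e.g. c(256) is roughly 10.2).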
average_path_length[not_mask] = (
2.0 * (np.log(n_samples_leaf[not_mask] - 1.0) + np.euler_gamma)
- 2.0 * (n_samples_leaf[not_mask] - 1.0) / n_samples_leaf[not_mask]
)
return average_path_length.reshape(n_samples_leaf_shape)

View file

@ -0,0 +1,705 @@
"""Stacking classifier and regressor."""
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: BSD 3 clause
from abc import ABCMeta, abstractmethod
from copy import deepcopy
import numpy as np
from joblib import Parallel, delayed
import scipy.sparse as sparse
from ..base import clone
from ..base import ClassifierMixin, RegressorMixin, TransformerMixin
from ..base import is_classifier, is_regressor
from ..exceptions import NotFittedError
from ..utils._estimator_html_repr import _VisualBlock
from ._base import _fit_single_estimator
from ._base import _BaseHeterogeneousEnsemble
from ..linear_model import LogisticRegression
from ..linear_model import RidgeCV
from ..model_selection import cross_val_predict
from ..model_selection import check_cv
from ..preprocessing import LabelEncoder
from ..utils import Bunch
from ..utils.metaestimators import if_delegate_has_method
from ..utils.multiclass import check_classification_targets
from ..utils.validation import check_is_fitted
from ..utils.validation import column_or_1d
from ..utils.validation import _deprecate_positional_args
class _BaseStacking(TransformerMixin, _BaseHeterogeneousEnsemble,
metaclass=ABCMeta):
"""Base class for stacking method."""
@abstractmethod
def __init__(self, estimators, final_estimator=None, *, cv=None,
stack_method='auto', n_jobs=None, verbose=0,
passthrough=False):
super().__init__(estimators=estimators)
self.final_estimator = final_estimator
self.cv = cv
self.stack_method = stack_method
self.n_jobs = n_jobs
self.verbose = verbose
self.passthrough = passthrough
def _clone_final_estimator(self, default):
if self.final_estimator is not None:
self.final_estimator_ = clone(self.final_estimator)
else:
self.final_estimator_ = clone(default)
def _concatenate_predictions(self, X, predictions):
"""Concatenate the predictions of each first layer learner and
possibly the input dataset `X`.
If `X` is sparse and `self.passthrough` is False, the output of
`transform` will be dense (the predictions). If `X` is sparse
and `self.passthrough` is True, the output of `transform` will
be sparse.
This helper is in charge of ensuring the predictions are 2D arrays and
it will drop one of the probability columns when using probabilities
in the binary case, since p(y|c=0) = 1 - p(y|c=1).
"""
X_meta = []
for est_idx, preds in enumerate(predictions):
# case where the estimator returned a 1D array
if preds.ndim == 1:
X_meta.append(preds.reshape(-1, 1))
else:
if (self.stack_method_[est_idx] == 'predict_proba' and
len(self.classes_) == 2):
# Remove the first column when using probabilities in
# binary classification because both features are perfectly
# collinear.
X_meta.append(preds[:, 1:])
else:
X_meta.append(preds)
if self.passthrough:
X_meta.append(X)
if sparse.issparse(X):
return sparse.hstack(X_meta, format=X.format)
return np.hstack(X_meta)
@staticmethod
def _method_name(name, estimator, method):
if estimator == 'drop':
return None
if method == 'auto':
if getattr(estimator, 'predict_proba', None):
return 'predict_proba'
elif getattr(estimator, 'decision_function', None):
return 'decision_function'
else:
return 'predict'
else:
if not hasattr(estimator, method):
raise ValueError('Underlying estimator {} does not implement '
'the method {}.'.format(name, method))
return method
def fit(self, X, y, sample_weight=None):
"""Fit the estimators.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where `n_samples` is the number of samples and
`n_features` is the number of features.
y : array-like of shape (n_samples,)
Target values.
sample_weight : array-like of shape (n_samples,) or default=None
Sample weights. If None, then samples are equally weighted.
Note that this is supported only if all underlying estimators
support sample weights.
.. versionchanged:: 0.23
when not None, `sample_weight` is passed to all underlying
estimators
Returns
-------
self : object
"""
# all_estimators contains all estimators, both the ones to be fitted
# and the 'drop' placeholders.
names, all_estimators = self._validate_estimators()
self._validate_final_estimator()
stack_method = [self.stack_method] * len(all_estimators)
# Fit the base estimators on the whole training data. Those
# base estimators will be used in transform, predict, and
# predict_proba. They are exposed publicly.
self.estimators_ = Parallel(n_jobs=self.n_jobs)(
delayed(_fit_single_estimator)(clone(est), X, y, sample_weight)
for est in all_estimators if est != 'drop'
)
self.named_estimators_ = Bunch()
est_fitted_idx = 0
for name_est, org_est in zip(names, all_estimators):
if org_est != 'drop':
self.named_estimators_[name_est] = self.estimators_[
est_fitted_idx]
est_fitted_idx += 1
else:
self.named_estimators_[name_est] = 'drop'
# To train the meta-classifier using as much data as possible, we use
# cross-validation to obtain the output of the stacked estimators.
# To ensure that the data provided to each estimator are the same, we
# need to set the random state of the cv if there is one and we need to
# take a copy.
cv = check_cv(self.cv, y=y, classifier=is_classifier(self))
if hasattr(cv, 'random_state') and cv.random_state is None:
cv.random_state = np.random.RandomState()
self.stack_method_ = [
self._method_name(name, est, meth)
for name, est, meth in zip(names, all_estimators, stack_method)
]
fit_params = ({"sample_weight": sample_weight}
if sample_weight is not None
else None)
predictions = Parallel(n_jobs=self.n_jobs)(
delayed(cross_val_predict)(clone(est), X, y, cv=deepcopy(cv),
method=meth, n_jobs=self.n_jobs,
fit_params=fit_params,
verbose=self.verbose)
for est, meth in zip(all_estimators, self.stack_method_)
if est != 'drop'
)
# Only estimators that are not 'drop' are used in transform; the
# corresponding None entries are removed from stack_method_ as well.
self.stack_method_ = [
meth for (meth, est) in zip(self.stack_method_, all_estimators)
if est != 'drop'
]
X_meta = self._concatenate_predictions(X, predictions)
_fit_single_estimator(self.final_estimator_, X_meta, y,
sample_weight=sample_weight)
return self
@property
def n_features_in_(self):
"""Number of features seen during :term:`fit`."""
try:
check_is_fitted(self)
except NotFittedError as nfe:
raise AttributeError(
f"{self.__class__.__name__} object has no attribute "
f"n_features_in_") from nfe
return self.estimators_[0].n_features_in_
def _transform(self, X):
"""Concatenate and return the predictions of the estimators."""
check_is_fitted(self)
predictions = [
getattr(est, meth)(X)
for est, meth in zip(self.estimators_, self.stack_method_)
if est != 'drop'
]
return self._concatenate_predictions(X, predictions)
@if_delegate_has_method(delegate='final_estimator_')
def predict(self, X, **predict_params):
"""Predict target for X.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
**predict_params : dict of str -> obj
Parameters to the `predict` called by the `final_estimator`. Note
that this may be used to return uncertainties from some estimators
with `return_std` or `return_cov`. Be aware that it will only
account for uncertainty in the final estimator.
Returns
-------
y_pred : ndarray of shape (n_samples,) or (n_samples, n_output)
Predicted targets.
"""
check_is_fitted(self)
return self.final_estimator_.predict(
self.transform(X), **predict_params
)
def _sk_visual_block_(self, final_estimator):
names, estimators = zip(*self.estimators)
parallel = _VisualBlock('parallel', estimators, names=names,
dash_wrapped=False)
serial = _VisualBlock('serial', (parallel, final_estimator),
dash_wrapped=False)
return _VisualBlock('serial', [serial])
class StackingClassifier(ClassifierMixin, _BaseStacking):
"""Stack of estimators with a final classifier.
Stacked generalization consists in stacking the output of individual
estimators and using a classifier to compute the final prediction. Stacking
makes it possible to use the strength of each individual estimator by using
their output as the input of a final estimator.
Note that `estimators_` are fitted on the full `X` while `final_estimator_`
is trained using cross-validated predictions of the base estimators using
`cross_val_predict`.
.. versionadded:: 0.22
Read more in the :ref:`User Guide <stacking>`.
Parameters
----------
estimators : list of (str, estimator)
Base estimators which will be stacked together. Each element of the
list is defined as a tuple of string (i.e. name) and an estimator
instance. An estimator can be set to 'drop' using `set_params`.
final_estimator : estimator, default=None
A classifier which will be used to combine the base estimators.
The default classifier is a `LogisticRegression`.
cv : int, cross-validation generator or an iterable, default=None
Determines the cross-validation splitting strategy used in
`cross_val_predict` to train `final_estimator`. Possible inputs for
cv are:
* None, to use the default 5-fold cross validation,
* integer, to specify the number of folds in a (Stratified) KFold,
* An object to be used as a cross-validation generator,
* An iterable yielding train, test splits.
For integer/None inputs, if the estimator is a classifier and y is
either binary or multiclass, `StratifiedKFold` is used. In all other
cases, `KFold` is used.
Refer to the :ref:`User Guide <cross_validation>` for the various
cross-validation strategies that can be used here.
.. note::
A larger number of splits will provide no benefit if the number
of training samples is large enough; it will only increase the
training time. ``cv`` is not used for model evaluation but for
prediction.
stack_method : {'auto', 'predict_proba', 'decision_function', 'predict'}, \
default='auto'
Methods called for each base estimator. It can be:
* if 'auto', it will try to invoke, for each estimator,
`'predict_proba'`, `'decision_function'` or `'predict'` in that
order.
* otherwise, one of `'predict_proba'`, `'decision_function'` or
`'predict'`. If the method is not implemented by the estimator, it
will raise an error.
n_jobs : int, default=None
The number of jobs to run in parallel for `fit` of all `estimators`.
`None` means 1 unless in a `joblib.parallel_backend` context. -1 means
using all processors. See Glossary for more details.
passthrough : bool, default=False
When False, only the predictions of estimators will be used as
training data for `final_estimator`. When True, the
`final_estimator` is trained on the predictions as well as the
original training data.
verbose : int, default=0
Verbosity level.
Attributes
----------
classes_ : ndarray of shape (n_classes,)
Class labels.
estimators_ : list of estimators
The elements of the estimators parameter, having been fitted on the
training data. If an estimator has been set to `'drop'`, it
will not appear in `estimators_`.
named_estimators_ : :class:`~sklearn.utils.Bunch`
Attribute to access any fitted sub-estimators by name.
final_estimator_ : estimator
The classifier which predicts given the output of `estimators_`.
stack_method_ : list of str
The method used by each base estimator.
Notes
-----
When `predict_proba` is used by each estimator (i.e. most of the time for
`stack_method='auto'` or specifically for `stack_method='predict_proba'`),
the first column predicted by each estimator will be dropped in the case
of a binary classification problem, since both features would be perfectly
collinear.
References
----------
.. [1] Wolpert, David H. "Stacked generalization." Neural networks 5.2
(1992): 241-259.
Examples
--------
>>> from sklearn.datasets import load_iris
>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.svm import LinearSVC
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.preprocessing import StandardScaler
>>> from sklearn.pipeline import make_pipeline
>>> from sklearn.ensemble import StackingClassifier
>>> X, y = load_iris(return_X_y=True)
>>> estimators = [
... ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
... ('svr', make_pipeline(StandardScaler(),
... LinearSVC(random_state=42)))
... ]
>>> clf = StackingClassifier(
... estimators=estimators, final_estimator=LogisticRegression()
... )
>>> from sklearn.model_selection import train_test_split
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, stratify=y, random_state=42
... )
>>> clf.fit(X_train, y_train).score(X_test, y_test)
0.9...
"""
@_deprecate_positional_args
def __init__(self, estimators, final_estimator=None, *, cv=None,
stack_method='auto', n_jobs=None, passthrough=False,
verbose=0):
super().__init__(
estimators=estimators,
final_estimator=final_estimator,
cv=cv,
stack_method=stack_method,
n_jobs=n_jobs,
passthrough=passthrough,
verbose=verbose
)
def _validate_final_estimator(self):
self._clone_final_estimator(default=LogisticRegression())
if not is_classifier(self.final_estimator_):
raise ValueError(
"'final_estimator' parameter should be a classifier. Got {}"
.format(self.final_estimator_)
)
def fit(self, X, y, sample_weight=None):
"""Fit the estimators.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where `n_samples` is the number of samples and
`n_features` is the number of features.
y : array-like of shape (n_samples,)
Target values.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted.
Note that this is supported only if all underlying estimators
support sample weights.
Returns
-------
self : object
"""
check_classification_targets(y)
self._le = LabelEncoder().fit(y)
self.classes_ = self._le.classes_
return super().fit(X, self._le.transform(y), sample_weight)
@if_delegate_has_method(delegate='final_estimator_')
def predict(self, X, **predict_params):
"""Predict target for X.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
**predict_params : dict of str -> obj
Parameters to the `predict` called by the `final_estimator`. Note
that this may be used to return uncertainties from some estimators
with `return_std` or `return_cov`. Be aware that it will only
account for uncertainty in the final estimator.
Returns
-------
y_pred : ndarray of shape (n_samples,) or (n_samples, n_output)
Predicted targets.
"""
y_pred = super().predict(X, **predict_params)
return self._le.inverse_transform(y_pred)
@if_delegate_has_method(delegate='final_estimator_')
def predict_proba(self, X):
"""Predict class probabilities for X using
`final_estimator_.predict_proba`.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
Returns
-------
probabilities : ndarray of shape (n_samples, n_classes) or \
list of ndarray of shape (n_output,)
The class probabilities of the input samples.
"""
check_is_fitted(self)
return self.final_estimator_.predict_proba(self.transform(X))
@if_delegate_has_method(delegate='final_estimator_')
def decision_function(self, X):
"""Predict decision function for samples in X using
`final_estimator_.decision_function`.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
Returns
-------
decisions : ndarray of shape (n_samples,), (n_samples, n_classes), \
or (n_samples, n_classes * (n_classes-1) / 2)
The decision function computed the final estimator.
"""
check_is_fitted(self)
return self.final_estimator_.decision_function(self.transform(X))
def transform(self, X):
"""Return class labels or probabilities for X for each estimator.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where `n_samples` is the number of samples and
`n_features` is the number of features.
Returns
-------
y_preds : ndarray of shape (n_samples, n_estimators) or \
(n_samples, n_classes * n_estimators)
Prediction outputs for each estimator.
"""
return self._transform(X)
def _sk_visual_block_(self):
# If final_estimator's default changes then this should be
# updated.
if self.final_estimator is None:
final_estimator = LogisticRegression()
else:
final_estimator = self.final_estimator
return super()._sk_visual_block_(final_estimator)
class StackingRegressor(RegressorMixin, _BaseStacking):
"""Stack of estimators with a final regressor.
Stacked generalization consists in stacking the output of individual
estimators and using a regressor to compute the final prediction. Stacking
makes it possible to use the strength of each individual estimator by using
their output as the input of a final estimator.
Note that `estimators_` are fitted on the full `X` while `final_estimator_`
is trained using cross-validated predictions of the base estimators using
`cross_val_predict`.
.. versionadded:: 0.22
Read more in the :ref:`User Guide <stacking>`.
Parameters
----------
estimators : list of (str, estimator)
Base estimators which will be stacked together. Each element of the
list is defined as a tuple of string (i.e. name) and an estimator
instance. An estimator can be set to 'drop' using `set_params`.
final_estimator : estimator, default=None
A regressor which will be used to combine the base estimators.
The default regressor is a `RidgeCV`.
cv : int, cross-validation generator or an iterable, default=None
Determines the cross-validation splitting strategy used in
`cross_val_predict` to train `final_estimator`. Possible inputs for
cv are:
* None, to use the default 5-fold cross validation,
* integer, to specify the number of folds in a (Stratified) KFold,
* An object to be used as a cross-validation generator,
* An iterable yielding train, test splits.
For integer/None inputs, if the estimator is a classifier and y is
either binary or multiclass, `StratifiedKFold` is used. In all other
cases, `KFold` is used.
Refer to the :ref:`User Guide <cross_validation>` for the various
cross-validation strategies that can be used here.
.. note::
A larger number of splits will provide no benefit if the number
of training samples is large enough; it will only increase the
training time. ``cv`` is not used for model evaluation but for
prediction.
n_jobs : int, default=None
The number of jobs to run in parallel for `fit` of all `estimators`.
`None` means 1 unless in a `joblib.parallel_backend` context. -1 means
using all processors. See Glossary for more details.
passthrough : bool, default=False
When False, only the predictions of estimators will be used as
training data for `final_estimator`. When True, the
`final_estimator` is trained on the predictions as well as the
original training data.
verbose : int, default=0
Verbosity level.
Attributes
----------
estimators_ : list of estimator
The elements of the estimators parameter, having been fitted on the
training data. If an estimator has been set to `'drop'`, it
will not appear in `estimators_`.
named_estimators_ : :class:`~sklearn.utils.Bunch`
Attribute to access any fitted sub-estimators by name.
final_estimator_ : estimator
The fitted regressor which combines the predictions of the base estimators.
References
----------
.. [1] Wolpert, David H. "Stacked generalization." Neural networks 5.2
(1992): 241-259.
Examples
--------
>>> from sklearn.datasets import load_diabetes
>>> from sklearn.linear_model import RidgeCV
>>> from sklearn.svm import LinearSVR
>>> from sklearn.ensemble import RandomForestRegressor
>>> from sklearn.ensemble import StackingRegressor
>>> X, y = load_diabetes(return_X_y=True)
>>> estimators = [
... ('lr', RidgeCV()),
... ('svr', LinearSVR(random_state=42))
... ]
>>> reg = StackingRegressor(
... estimators=estimators,
... final_estimator=RandomForestRegressor(n_estimators=10,
... random_state=42)
... )
>>> from sklearn.model_selection import train_test_split
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, random_state=42
... )
>>> reg.fit(X_train, y_train).score(X_test, y_test)
0.3...
"""
@_deprecate_positional_args
def __init__(self, estimators, final_estimator=None, *, cv=None,
n_jobs=None, passthrough=False, verbose=0):
super().__init__(
estimators=estimators,
final_estimator=final_estimator,
cv=cv,
stack_method="predict",
n_jobs=n_jobs,
passthrough=passthrough,
verbose=verbose
)
def _validate_final_estimator(self):
self._clone_final_estimator(default=RidgeCV())
if not is_regressor(self.final_estimator_):
raise ValueError(
"'final_estimator' parameter should be a regressor. Got {}"
.format(self.final_estimator_)
)
def fit(self, X, y, sample_weight=None):
"""Fit the estimators.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
y : array-like of shape (n_samples,)
Target values.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted.
Note that this is supported only if all underlying estimators
support sample weights.
Returns
-------
self : object
"""
y = column_or_1d(y, warn=True)
return super().fit(X, y, sample_weight)
def transform(self, X):
"""Return the predictions for X for each estimator.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where `n_samples` is the number of samples and
`n_features` is the number of features.
Returns
-------
y_preds : ndarray of shape (n_samples, n_estimators)
Prediction outputs for each estimator.
"""
return self._transform(X)
def _sk_visual_block_(self):
# If final_estimator's default changes then this should be
# updated.
if self.final_estimator is None:
final_estimator = RidgeCV()
else:
final_estimator = self.final_estimator
return super()._sk_visual_block_(final_estimator)
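# Illustrative sketch (not part of this module): `transform` exposes the
# per-estimator predictions that feed `final_estimator_`, one column per base
# estimator that is not dropped. The snippet reuses the public imports shown
# in the class docstring above.
#
#   from sklearn.datasets import load_diabetes
#   from sklearn.linear_model import RidgeCV
#   from sklearn.svm import LinearSVR
#   from sklearn.ensemble import StackingRegressor
#
#   X, y = load_diabetes(return_X_y=True)
#   reg = StackingRegressor(
#       estimators=[('ridge', RidgeCV()),
#                   ('svr', LinearSVR(random_state=42))]
#   ).fit(X, y)
#   reg.transform(X).shape   # (n_samples, 2): one prediction column per estimator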

View file

@ -0,0 +1,495 @@
"""
Soft Voting/Majority Rule classifier and Voting regressor.
This module contains:
- A Soft Voting/Majority Rule classifier for classification estimators.
- A Voting regressor for regression estimators.
"""
# Authors: Sebastian Raschka <se.raschka@gmail.com>,
# Gilles Louppe <g.louppe@gmail.com>,
# Ramil Nugmanov <stsouko@live.ru>
# Mohamed Ali Jamaoui <m.ali.jamaoui@gmail.com>
#
# License: BSD 3 clause
from abc import abstractmethod
import numpy as np
from joblib import Parallel, delayed
from ..base import ClassifierMixin
from ..base import RegressorMixin
from ..base import TransformerMixin
from ..base import clone
from ._base import _fit_single_estimator
from ._base import _BaseHeterogeneousEnsemble
from ..preprocessing import LabelEncoder
from ..utils import Bunch
from ..utils.validation import check_is_fitted
from ..utils.multiclass import check_classification_targets
from ..utils.validation import column_or_1d
from ..utils.validation import _deprecate_positional_args
from ..exceptions import NotFittedError
from ..utils._estimator_html_repr import _VisualBlock
class _BaseVoting(TransformerMixin, _BaseHeterogeneousEnsemble):
"""Base class for voting.
Warning: This class should not be used directly. Use derived classes
instead.
"""
def _log_message(self, name, idx, total):
if not self.verbose:
return None
return '(%d of %d) Processing %s' % (idx, total, name)
@property
def _weights_not_none(self):
"""Get the weights of not `None` estimators."""
if self.weights is None:
return None
return [w for est, w in zip(self.estimators, self.weights)
if est[1] not in (None, 'drop')]
def _predict(self, X):
"""Collect results from clf.predict calls."""
return np.asarray([est.predict(X) for est in self.estimators_]).T
@abstractmethod
def fit(self, X, y, sample_weight=None):
"""Get common fit operations."""
names, clfs = self._validate_estimators()
if (self.weights is not None and
len(self.weights) != len(self.estimators)):
raise ValueError('Number of `estimators` and weights must be equal'
'; got %d weights, %d estimators'
% (len(self.weights), len(self.estimators)))
self.estimators_ = Parallel(n_jobs=self.n_jobs)(
delayed(_fit_single_estimator)(
clone(clf), X, y,
sample_weight=sample_weight,
message_clsname='Voting',
message=self._log_message(names[idx],
idx + 1, len(clfs))
)
for idx, clf in enumerate(clfs) if clf not in (None, 'drop')
)
self.named_estimators_ = Bunch()
# Uses None or 'drop' as placeholder for dropped estimators
est_iter = iter(self.estimators_)
for name, est in self.estimators:
current_est = est if est in (None, 'drop') else next(est_iter)
self.named_estimators_[name] = current_est
return self
@property
def n_features_in_(self):
# For consistency with other estimators we raise an AttributeError so
# that hasattr() fails if the estimator isn't fitted.
try:
check_is_fitted(self)
except NotFittedError as nfe:
raise AttributeError(
"{} object has no n_features_in_ attribute."
.format(self.__class__.__name__)
) from nfe
return self.estimators_[0].n_features_in_
def _sk_visual_block_(self):
names, estimators = zip(*self.estimators)
return _VisualBlock('parallel', estimators, names=names)
class VotingClassifier(ClassifierMixin, _BaseVoting):
"""Soft Voting/Majority Rule classifier for unfitted estimators.
.. versionadded:: 0.17
Read more in the :ref:`User Guide <voting_classifier>`.
Parameters
----------
estimators : list of (str, estimator) tuples
Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones
of those original estimators that will be stored in the class attribute
``self.estimators_``. An estimator can be set to ``'drop'``
using ``set_params``.
.. versionchanged:: 0.21
``'drop'`` is accepted.
.. deprecated:: 0.22
Using ``None`` to drop an estimator is deprecated in 0.22 and
support will be dropped in 0.24. Use the string ``'drop'`` instead.
voting : {'hard', 'soft'}, default='hard'
If 'hard', uses predicted class labels for majority rule voting.
Else if 'soft', predicts the class label based on the argmax of
the sums of the predicted probabilities, which is recommended for
an ensemble of well-calibrated classifiers.
weights : array-like of shape (n_classifiers,), default=None
Sequence of weights (`float` or `int`) to weight the occurrences of
predicted class labels (`hard` voting) or class probabilities
before averaging (`soft` voting). Uses uniform weights if `None`.
n_jobs : int, default=None
The number of jobs to run in parallel for ``fit``.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
.. versionadded:: 0.18
flatten_transform : bool, default=True
Affects shape of transform output only when voting='soft'.
If voting='soft' and flatten_transform=True, transform method returns
matrix with shape (n_samples, n_classifiers * n_classes). If
flatten_transform=False, it returns
(n_classifiers, n_samples, n_classes).
verbose : bool, default=False
If True, the time elapsed while fitting will be printed as it
is completed.
Attributes
----------
estimators_ : list of classifiers
The collection of fitted sub-estimators as defined in ``estimators``
that are not 'drop'.
named_estimators_ : :class:`~sklearn.utils.Bunch`
Attribute to access any fitted sub-estimators by name.
.. versionadded:: 0.20
classes_ : array-like of shape (n_predictions,)
The class labels.
See Also
--------
VotingRegressor: Prediction voting regressor.
Examples
--------
>>> import numpy as np
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.naive_bayes import GaussianNB
>>> from sklearn.ensemble import RandomForestClassifier, VotingClassifier
>>> clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
>>> clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
>>> clf3 = GaussianNB()
>>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
>>> y = np.array([1, 1, 1, 2, 2, 2])
>>> eclf1 = VotingClassifier(estimators=[
... ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')
>>> eclf1 = eclf1.fit(X, y)
>>> print(eclf1.predict(X))
[1 1 1 2 2 2]
>>> np.array_equal(eclf1.named_estimators_.lr.predict(X),
... eclf1.named_estimators_['lr'].predict(X))
True
>>> eclf2 = VotingClassifier(estimators=[
... ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
... voting='soft')
>>> eclf2 = eclf2.fit(X, y)
>>> print(eclf2.predict(X))
[1 1 1 2 2 2]
>>> eclf3 = VotingClassifier(estimators=[
... ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
... voting='soft', weights=[2,1,1],
... flatten_transform=True)
>>> eclf3 = eclf3.fit(X, y)
>>> print(eclf3.predict(X))
[1 1 1 2 2 2]
>>> print(eclf3.transform(X).shape)
(6, 6)
"""
@_deprecate_positional_args
def __init__(self, estimators, *, voting='hard', weights=None,
n_jobs=None, flatten_transform=True, verbose=False):
super().__init__(estimators=estimators)
self.voting = voting
self.weights = weights
self.n_jobs = n_jobs
self.flatten_transform = flatten_transform
self.verbose = verbose
def fit(self, X, y, sample_weight=None):
"""Fit the estimators.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
y : array-like of shape (n_samples,)
Target values.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted.
Note that this is supported only if all underlying estimators
support sample weights.
.. versionadded:: 0.18
Returns
-------
self : object
"""
check_classification_targets(y)
if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
raise NotImplementedError('Multilabel and multi-output'
' classification is not supported.')
if self.voting not in ('soft', 'hard'):
raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
% self.voting)
self.le_ = LabelEncoder().fit(y)
self.classes_ = self.le_.classes_
transformed_y = self.le_.transform(y)
return super().fit(X, transformed_y, sample_weight)
def predict(self, X):
"""Predict class labels for X.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples.
Returns
-------
maj : array-like of shape (n_samples,)
Predicted class labels.
"""
check_is_fitted(self)
if self.voting == 'soft':
maj = np.argmax(self.predict_proba(X), axis=1)
else: # 'hard' voting
predictions = self._predict(X)
maj = np.apply_along_axis(
lambda x: np.argmax(
np.bincount(x, weights=self._weights_not_none)),
axis=1, arr=predictions)
maj = self.le_.inverse_transform(maj)
return maj
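# Illustrative sketch (not part of this module) of the weighted hard-voting
# rule used in `predict` above: with encoded votes [0, 1, 1] from three
# classifiers and weights [3, 1, 1], `np.bincount` sums the weights per label
# and the weighted vote overrides the 2-out-of-3 majority.
#
#   import numpy as np
#   votes = np.array([0, 1, 1])
#   weights = np.array([3.0, 1.0, 1.0])
#   np.bincount(votes, weights=weights)              # array([3., 2.])
#   np.argmax(np.bincount(votes, weights=weights))   # 0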
def _collect_probas(self, X):
"""Collect results from clf.predict calls."""
return np.asarray([clf.predict_proba(X) for clf in self.estimators_])
def _predict_proba(self, X):
"""Predict class probabilities for X in 'soft' voting."""
check_is_fitted(self)
avg = np.average(self._collect_probas(X), axis=0,
weights=self._weights_not_none)
return avg
@property
def predict_proba(self):
"""Compute probabilities of possible outcomes for samples in X.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples.
Returns
-------
avg : array-like of shape (n_samples, n_classes)
Weighted average probability for each class per sample.
"""
if self.voting == 'hard':
raise AttributeError("predict_proba is not available when"
" voting=%r" % self.voting)
return self._predict_proba
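# Illustrative sketch (not part of this module) of the soft-voting average
# computed in `_predict_proba` above: per-classifier probabilities are
# averaged over the classifier axis with the per-classifier weights.
#
#   import numpy as np
#   probas = np.array([[[0.9, 0.1]],    # classifier 1, one sample
#                      [[0.4, 0.6]]])   # classifier 2, same sample
#   np.average(probas, axis=0, weights=[3, 1])   # array([[0.775, 0.225]])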
def transform(self, X):
"""Return class labels or probabilities for X for each estimator.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
Returns
-------
probabilities_or_labels
If `voting='soft'` and `flatten_transform=True`:
returns ndarray of shape (n_samples, n_classifiers *
n_classes), being class probabilities calculated by each
classifier.
If `voting='soft'` and `flatten_transform=False`:
ndarray of shape (n_classifiers, n_samples, n_classes)
If `voting='hard'`:
ndarray of shape (n_samples, n_classifiers), being
class labels predicted by each classifier.
"""
check_is_fitted(self)
if self.voting == 'soft':
probas = self._collect_probas(X)
if not self.flatten_transform:
return probas
return np.hstack(probas)
else:
return self._predict(X)
class VotingRegressor(RegressorMixin, _BaseVoting):
"""Prediction voting regressor for unfitted estimators.
.. versionadded:: 0.21
A voting regressor is an ensemble meta-estimator that fits several base
regressors, each on the whole dataset. Then it averages the individual
predictions to form a final prediction.
Read more in the :ref:`User Guide <voting_regressor>`.
Parameters
----------
estimators : list of (str, estimator) tuples
Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones
of those original estimators that will be stored in the class attribute
``self.estimators_``. An estimator can be set to ``'drop'`` using
``set_params``.
.. versionchanged:: 0.21
``'drop'`` is accepted.
.. deprecated:: 0.22
Using ``None`` to drop an estimator is deprecated in 0.22 and
support will be dropped in 0.24. Use the string ``'drop'`` instead.
weights : array-like of shape (n_regressors,), default=None
Sequence of weights (`float` or `int`) to weight the occurrences of
predicted values before averaging. Uses uniform weights if `None`.
n_jobs : int, default=None
The number of jobs to run in parallel for ``fit``.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
verbose : bool, default=False
If True, the time elapsed while fitting will be printed as it
is completed.
Attributes
----------
estimators_ : list of regressors
The collection of fitted sub-estimators as defined in ``estimators``
that are not 'drop'.
named_estimators_ : Bunch
Attribute to access any fitted sub-estimators by name.
.. versionadded:: 0.20
See Also
--------
VotingClassifier: Soft Voting/Majority Rule classifier.
Examples
--------
>>> import numpy as np
>>> from sklearn.linear_model import LinearRegression
>>> from sklearn.ensemble import RandomForestRegressor
>>> from sklearn.ensemble import VotingRegressor
>>> r1 = LinearRegression()
>>> r2 = RandomForestRegressor(n_estimators=10, random_state=1)
>>> X = np.array([[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36]])
>>> y = np.array([2, 6, 12, 20, 30, 42])
>>> er = VotingRegressor([('lr', r1), ('rf', r2)])
>>> print(er.fit(X, y).predict(X))
[ 3.3 5.7 11.8 19.7 28. 40.3]
"""
@_deprecate_positional_args
def __init__(self, estimators, *, weights=None, n_jobs=None,
verbose=False):
super().__init__(estimators=estimators)
self.weights = weights
self.n_jobs = n_jobs
self.verbose = verbose
def fit(self, X, y, sample_weight=None):
"""Fit the estimators.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
y : array-like of shape (n_samples,)
Target values.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted.
Note that this is supported only if all underlying estimators
support sample weights.
Returns
-------
self : object
Fitted estimator.
"""
y = column_or_1d(y, warn=True)
return super().fit(X, y, sample_weight)
def predict(self, X):
"""Predict regression target for X.
The predicted regression target of an input sample is computed as the
mean predicted regression targets of the estimators in the ensemble.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples.
Returns
-------
y : ndarray of shape (n_samples,)
The predicted values.
"""
check_is_fitted(self)
return np.average(self._predict(X), axis=1,
weights=self._weights_not_none)
def transform(self, X):
"""Return predictions for X for each estimator.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples.
Returns
-------
predictions : ndarray of shape (n_samples, n_regressors)
Values predicted by each regressor.
"""
check_is_fitted(self)
return self._predict(X)
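# Illustrative sketch (not part of this module) of the averaging rule used by
# `VotingRegressor.predict` above: per-estimator predictions are combined
# with a weighted mean along the estimator axis.
#
#   import numpy as np
#   per_estimator = np.array([[10.0, 20.0]])            # (n_samples, n_estimators)
#   np.average(per_estimator, axis=1, weights=[3, 1])   # array([12.5])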

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _bagging # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.ensemble.bagging'
correct_import_path = 'sklearn.ensemble'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_bagging, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)
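# Illustrative note (not part of the generated file): importing through the
# deprecated path keeps working because attribute access is forwarded to the
# private module, but it emits a deprecation warning pointing to the public
# location (the helper above skips the warning when running under pytest).
#
#   from sklearn.ensemble.bagging import BaggingClassifier   # warns
#   from sklearn.ensemble import BaggingClassifier           # preferred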

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _base # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.ensemble.base'
correct_import_path = 'sklearn.ensemble'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_base, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _forest # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.ensemble.forest'
correct_import_path = 'sklearn.ensemble'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_forest, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _gb # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.ensemble.gradient_boosting'
correct_import_path = 'sklearn.ensemble'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_gb, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _iforest # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.ensemble.iforest'
correct_import_path = 'sklearn.ensemble'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_iforest, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,54 @@
import numpy
from numpy.distutils.misc_util import Configuration
def configuration(parent_package="", top_path=None):
config = Configuration("ensemble", parent_package, top_path)
config.add_extension("_gradient_boosting",
sources=["_gradient_boosting.pyx"],
include_dirs=[numpy.get_include()])
config.add_subpackage("tests")
# Histogram-based gradient boosting files
config.add_extension(
"_hist_gradient_boosting._gradient_boosting",
sources=["_hist_gradient_boosting/_gradient_boosting.pyx"],
include_dirs=[numpy.get_include()])
config.add_extension("_hist_gradient_boosting.histogram",
sources=["_hist_gradient_boosting/histogram.pyx"],
include_dirs=[numpy.get_include()])
config.add_extension("_hist_gradient_boosting.splitting",
sources=["_hist_gradient_boosting/splitting.pyx"],
include_dirs=[numpy.get_include()])
config.add_extension("_hist_gradient_boosting._binning",
sources=["_hist_gradient_boosting/_binning.pyx"],
include_dirs=[numpy.get_include()])
config.add_extension("_hist_gradient_boosting._predictor",
sources=["_hist_gradient_boosting/_predictor.pyx"],
include_dirs=[numpy.get_include()])
config.add_extension("_hist_gradient_boosting._loss",
sources=["_hist_gradient_boosting/_loss.pyx"],
include_dirs=[numpy.get_include()])
config.add_extension("_hist_gradient_boosting.common",
sources=["_hist_gradient_boosting/common.pyx"],
include_dirs=[numpy.get_include()])
config.add_extension("_hist_gradient_boosting.utils",
sources=["_hist_gradient_boosting/utils.pyx"],
include_dirs=[numpy.get_include()])
config.add_subpackage("_hist_gradient_boosting.tests")
return config
if __name__ == "__main__":
from numpy.distutils.core import setup
setup(**configuration().todict())
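# Hedged note (assumption about the surrounding build tooling, not part of
# this file): this `configuration` is collected by the parent package's
# numpy.distutils-based setup, which compiles the listed Cython (.pyx)
# extensions, e.g. with an in-place build from the project root:
#
#   python setup.py build_ext --inplace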

View file

@ -0,0 +1,902 @@
"""
Testing for the bagging ensemble module (sklearn.ensemble.bagging).
"""
# Author: Gilles Louppe
# License: BSD 3 clause
import numpy as np
import joblib
from sklearn.base import BaseEstimator
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_raises
from sklearn.utils._testing import assert_warns
from sklearn.utils._testing import assert_warns_message
from sklearn.utils._testing import assert_raise_message
from sklearn.utils._testing import ignore_warnings
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, SVR
from sklearn.random_projection import SparseRandomProjection
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2
from sklearn.utils import check_random_state
from sklearn.preprocessing import FunctionTransformer
from scipy.sparse import csc_matrix, csr_matrix
rng = check_random_state(0)
# also load the iris dataset
# and randomly permute it
iris = load_iris()
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]
# also load the diabetes dataset
# and randomly permute it
diabetes = load_diabetes()
perm = rng.permutation(diabetes.target.size)
diabetes.data = diabetes.data[perm]
diabetes.target = diabetes.target[perm]
# TODO: Remove in 0.24 when DummyClassifier's `strategy` default updates
@ignore_warnings(category=FutureWarning)
def test_classification():
# Check classification for various parameter settings.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(iris.data,
iris.target,
random_state=rng)
grid = ParameterGrid({"max_samples": [0.5, 1.0],
"max_features": [1, 2, 4],
"bootstrap": [True, False],
"bootstrap_features": [True, False]})
for base_estimator in [None,
DummyClassifier(),
Perceptron(),
DecisionTreeClassifier(),
KNeighborsClassifier(),
SVC()]:
for params in grid:
BaggingClassifier(base_estimator=base_estimator,
random_state=rng,
**params).fit(X_train, y_train).predict(X_test)
def test_sparse_classification():
# Check classification for various parameter settings on sparse input.
class CustomSVC(SVC):
"""SVC variant that records the nature of the training set"""
def fit(self, X, y):
super().fit(X, y)
self.data_type_ = type(X)
return self
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(iris.data,
iris.target,
random_state=rng)
parameter_sets = [
{"max_samples": 0.5,
"max_features": 2,
"bootstrap": True,
"bootstrap_features": True},
{"max_samples": 1.0,
"max_features": 4,
"bootstrap": True,
"bootstrap_features": True},
{"max_features": 2,
"bootstrap": False,
"bootstrap_features": True},
{"max_samples": 0.5,
"bootstrap": True,
"bootstrap_features": False},
]
for sparse_format in [csc_matrix, csr_matrix]:
X_train_sparse = sparse_format(X_train)
X_test_sparse = sparse_format(X_test)
for params in parameter_sets:
for f in ['predict', 'predict_proba', 'predict_log_proba', 'decision_function']:
# Trained on sparse format
sparse_classifier = BaggingClassifier(
base_estimator=CustomSVC(decision_function_shape='ovr'),
random_state=1,
**params
).fit(X_train_sparse, y_train)
sparse_results = getattr(sparse_classifier, f)(X_test_sparse)
# Trained on dense format
dense_classifier = BaggingClassifier(
base_estimator=CustomSVC(decision_function_shape='ovr'),
random_state=1,
**params
).fit(X_train, y_train)
dense_results = getattr(dense_classifier, f)(X_test)
assert_array_almost_equal(sparse_results, dense_results)
sparse_type = type(X_train_sparse)
types = [i.data_type_ for i in sparse_classifier.estimators_]
assert all([t == sparse_type for t in types])
def test_regression():
# Check regression for various parameter settings.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50],
diabetes.target[:50],
random_state=rng)
grid = ParameterGrid({"max_samples": [0.5, 1.0],
"max_features": [0.5, 1.0],
"bootstrap": [True, False],
"bootstrap_features": [True, False]})
for base_estimator in [None,
DummyRegressor(),
DecisionTreeRegressor(),
KNeighborsRegressor(),
SVR()]:
for params in grid:
BaggingRegressor(base_estimator=base_estimator,
random_state=rng,
**params).fit(X_train, y_train).predict(X_test)
def test_sparse_regression():
# Check regression for various parameter settings on sparse input.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50],
diabetes.target[:50],
random_state=rng)
class CustomSVR(SVR):
"""SVC variant that records the nature of the training set"""
def fit(self, X, y):
super().fit(X, y)
self.data_type_ = type(X)
return self
parameter_sets = [
{"max_samples": 0.5,
"max_features": 2,
"bootstrap": True,
"bootstrap_features": True},
{"max_samples": 1.0,
"max_features": 4,
"bootstrap": True,
"bootstrap_features": True},
{"max_features": 2,
"bootstrap": False,
"bootstrap_features": True},
{"max_samples": 0.5,
"bootstrap": True,
"bootstrap_features": False},
]
for sparse_format in [csc_matrix, csr_matrix]:
X_train_sparse = sparse_format(X_train)
X_test_sparse = sparse_format(X_test)
for params in parameter_sets:
# Trained on sparse format
sparse_classifier = BaggingRegressor(
base_estimator=CustomSVR(),
random_state=1,
**params
).fit(X_train_sparse, y_train)
sparse_results = sparse_classifier.predict(X_test_sparse)
# Trained on dense format
dense_results = BaggingRegressor(
base_estimator=CustomSVR(),
random_state=1,
**params
).fit(X_train, y_train).predict(X_test)
sparse_type = type(X_train_sparse)
types = [i.data_type_ for i in sparse_classifier.estimators_]
assert_array_almost_equal(sparse_results, dense_results)
assert all([t == sparse_type for t in types])
class DummySizeEstimator(BaseEstimator):
def fit(self, X, y):
self.training_size_ = X.shape[0]
self.training_hash_ = joblib.hash(X)
def test_bootstrap_samples():
# Test that bootstrapping samples generate non-perfect base estimators.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
diabetes.target,
random_state=rng)
base_estimator = DecisionTreeRegressor().fit(X_train, y_train)
# without bootstrap, all trees are perfect on the training set
ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
max_samples=1.0,
bootstrap=False,
random_state=rng).fit(X_train, y_train)
assert (base_estimator.score(X_train, y_train) ==
ensemble.score(X_train, y_train))
# with bootstrap, trees are no longer perfect on the training set
ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
max_samples=1.0,
bootstrap=True,
random_state=rng).fit(X_train, y_train)
assert (base_estimator.score(X_train, y_train) >
ensemble.score(X_train, y_train))
# check that each sampling corresponds to a complete bootstrap resample.
# the size of each bootstrap should be the same as the input data but
# the data should be different (checked using the hash of the data).
ensemble = BaggingRegressor(base_estimator=DummySizeEstimator(),
bootstrap=True).fit(X_train, y_train)
training_hash = []
for estimator in ensemble.estimators_:
assert estimator.training_size_ == X_train.shape[0]
training_hash.append(estimator.training_hash_)
assert len(set(training_hash)) == len(training_hash)
def test_bootstrap_features():
# Test that bootstrapping features may generate duplicate features.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
diabetes.target,
random_state=rng)
ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
max_features=1.0,
bootstrap_features=False,
random_state=rng).fit(X_train, y_train)
for features in ensemble.estimators_features_:
assert diabetes.data.shape[1] == np.unique(features).shape[0]
ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
max_features=1.0,
bootstrap_features=True,
random_state=rng).fit(X_train, y_train)
for features in ensemble.estimators_features_:
assert diabetes.data.shape[1] > np.unique(features).shape[0]
def test_probability():
# Predict probabilities.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(iris.data,
iris.target,
random_state=rng)
with np.errstate(divide="ignore", invalid="ignore"):
# Normal case
ensemble = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
random_state=rng).fit(X_train, y_train)
assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
axis=1),
np.ones(len(X_test)))
assert_array_almost_equal(ensemble.predict_proba(X_test),
np.exp(ensemble.predict_log_proba(X_test)))
# Degenerate case, where some classes are missing
ensemble = BaggingClassifier(base_estimator=LogisticRegression(),
random_state=rng,
max_samples=5).fit(X_train, y_train)
assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
axis=1),
np.ones(len(X_test)))
assert_array_almost_equal(ensemble.predict_proba(X_test),
np.exp(ensemble.predict_log_proba(X_test)))
def test_oob_score_classification():
# Check that oob prediction is a good estimation of the generalization
# error.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(iris.data,
iris.target,
random_state=rng)
for base_estimator in [DecisionTreeClassifier(), SVC()]:
clf = BaggingClassifier(base_estimator=base_estimator,
n_estimators=100,
bootstrap=True,
oob_score=True,
random_state=rng).fit(X_train, y_train)
test_score = clf.score(X_test, y_test)
assert abs(test_score - clf.oob_score_) < 0.1
# Test with few estimators
assert_warns(UserWarning,
BaggingClassifier(base_estimator=base_estimator,
n_estimators=1,
bootstrap=True,
oob_score=True,
random_state=rng).fit,
X_train,
y_train)
def test_oob_score_regression():
# Check that oob prediction is a good estimation of the generalization
# error.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
diabetes.target,
random_state=rng)
clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
n_estimators=50,
bootstrap=True,
oob_score=True,
random_state=rng).fit(X_train, y_train)
test_score = clf.score(X_test, y_test)
assert abs(test_score - clf.oob_score_) < 0.1
# Test with few estimators
assert_warns(UserWarning,
BaggingRegressor(base_estimator=DecisionTreeRegressor(),
n_estimators=1,
bootstrap=True,
oob_score=True,
random_state=rng).fit,
X_train,
y_train)
def test_single_estimator():
# Check singleton ensembles.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
diabetes.target,
random_state=rng)
clf1 = BaggingRegressor(base_estimator=KNeighborsRegressor(),
n_estimators=1,
bootstrap=False,
bootstrap_features=False,
random_state=rng).fit(X_train, y_train)
clf2 = KNeighborsRegressor().fit(X_train, y_train)
assert_array_almost_equal(clf1.predict(X_test), clf2.predict(X_test))
def test_error():
# Test that it gives proper exception on deficient input.
X, y = iris.data, iris.target
base = DecisionTreeClassifier()
# Test max_samples
assert_raises(ValueError,
BaggingClassifier(base, max_samples=-1).fit, X, y)
assert_raises(ValueError,
BaggingClassifier(base, max_samples=0.0).fit, X, y)
assert_raises(ValueError,
BaggingClassifier(base, max_samples=2.0).fit, X, y)
assert_raises(ValueError,
BaggingClassifier(base, max_samples=1000).fit, X, y)
assert_raises(ValueError,
BaggingClassifier(base, max_samples="foobar").fit, X, y)
# Test max_features
assert_raises(ValueError,
BaggingClassifier(base, max_features=-1).fit, X, y)
assert_raises(ValueError,
BaggingClassifier(base, max_features=0.0).fit, X, y)
assert_raises(ValueError,
BaggingClassifier(base, max_features=2.0).fit, X, y)
assert_raises(ValueError,
BaggingClassifier(base, max_features=5).fit, X, y)
assert_raises(ValueError,
BaggingClassifier(base, max_features="foobar").fit, X, y)
# Test support of decision_function
assert not hasattr(BaggingClassifier(base).fit(X, y), 'decision_function')
def test_parallel_classification():
# Check parallel classification.
rng = check_random_state(0)
# Classification
X_train, X_test, y_train, y_test = train_test_split(iris.data,
iris.target,
random_state=rng)
ensemble = BaggingClassifier(DecisionTreeClassifier(),
n_jobs=3,
random_state=0).fit(X_train, y_train)
# predict_proba
ensemble.set_params(n_jobs=1)
y1 = ensemble.predict_proba(X_test)
ensemble.set_params(n_jobs=2)
y2 = ensemble.predict_proba(X_test)
assert_array_almost_equal(y1, y2)
ensemble = BaggingClassifier(DecisionTreeClassifier(),
n_jobs=1,
random_state=0).fit(X_train, y_train)
y3 = ensemble.predict_proba(X_test)
assert_array_almost_equal(y1, y3)
# decision_function
ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'),
n_jobs=3,
random_state=0).fit(X_train, y_train)
ensemble.set_params(n_jobs=1)
decisions1 = ensemble.decision_function(X_test)
ensemble.set_params(n_jobs=2)
decisions2 = ensemble.decision_function(X_test)
assert_array_almost_equal(decisions1, decisions2)
X_err = np.hstack((X_test, np.zeros((X_test.shape[0], 1))))
assert_raise_message(ValueError, "Number of features of the model "
"must match the input. Model n_features is {0} "
"and input n_features is {1} "
"".format(X_test.shape[1], X_err.shape[1]),
ensemble.decision_function, X_err)
ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'),
n_jobs=1,
random_state=0).fit(X_train, y_train)
decisions3 = ensemble.decision_function(X_test)
assert_array_almost_equal(decisions1, decisions3)
def test_parallel_regression():
# Check parallel regression.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
diabetes.target,
random_state=rng)
ensemble = BaggingRegressor(DecisionTreeRegressor(),
n_jobs=3,
random_state=0).fit(X_train, y_train)
ensemble.set_params(n_jobs=1)
y1 = ensemble.predict(X_test)
ensemble.set_params(n_jobs=2)
y2 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y2)
ensemble = BaggingRegressor(DecisionTreeRegressor(),
n_jobs=1,
random_state=0).fit(X_train, y_train)
y3 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y3)
def test_gridsearch():
# Check that bagging ensembles can be grid-searched.
# Transform iris into a binary classification task
X, y = iris.data, iris.target
y[y == 2] = 1
# Grid search with scoring based on decision_function
parameters = {'n_estimators': (1, 2),
'base_estimator__C': (1, 2)}
GridSearchCV(BaggingClassifier(SVC()),
parameters,
scoring="roc_auc").fit(X, y)
def test_base_estimator():
# Check base_estimator and its default values.
rng = check_random_state(0)
# Classification
X_train, X_test, y_train, y_test = train_test_split(iris.data,
iris.target,
random_state=rng)
ensemble = BaggingClassifier(None,
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert isinstance(ensemble.base_estimator_, DecisionTreeClassifier)
ensemble = BaggingClassifier(DecisionTreeClassifier(),
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert isinstance(ensemble.base_estimator_, DecisionTreeClassifier)
ensemble = BaggingClassifier(Perceptron(),
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert isinstance(ensemble.base_estimator_, Perceptron)
# Regression
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
diabetes.target,
random_state=rng)
ensemble = BaggingRegressor(None,
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert isinstance(ensemble.base_estimator_, DecisionTreeRegressor)
ensemble = BaggingRegressor(DecisionTreeRegressor(),
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert isinstance(ensemble.base_estimator_, DecisionTreeRegressor)
ensemble = BaggingRegressor(SVR(),
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert isinstance(ensemble.base_estimator_, SVR)
def test_bagging_with_pipeline():
estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
DecisionTreeClassifier()),
max_features=2)
estimator.fit(iris.data, iris.target)
assert isinstance(estimator[0].steps[-1][1].random_state, int)
class DummyZeroEstimator(BaseEstimator):
def fit(self, X, y):
self.classes_ = np.unique(y)
return self
def predict(self, X):
return self.classes_[np.zeros(X.shape[0], dtype=int)]
def test_bagging_sample_weight_unsupported_but_passed():
estimator = BaggingClassifier(DummyZeroEstimator())
rng = check_random_state(0)
estimator.fit(iris.data, iris.target).predict(iris.data)
assert_raises(ValueError, estimator.fit, iris.data, iris.target,
sample_weight=rng.randint(10, size=(iris.data.shape[0])))
def test_warm_start(random_state=42):
# Test if fitting incrementally with warm start gives an ensemble of the
# right size and the same results as a normal fit.
X, y = make_hastie_10_2(n_samples=20, random_state=1)
clf_ws = None
for n_estimators in [5, 10]:
if clf_ws is None:
clf_ws = BaggingClassifier(n_estimators=n_estimators,
random_state=random_state,
warm_start=True)
else:
clf_ws.set_params(n_estimators=n_estimators)
clf_ws.fit(X, y)
assert len(clf_ws) == n_estimators
clf_no_ws = BaggingClassifier(n_estimators=10, random_state=random_state,
warm_start=False)
clf_no_ws.fit(X, y)
assert (set([tree.random_state for tree in clf_ws]) ==
set([tree.random_state for tree in clf_no_ws]))
def test_warm_start_smaller_n_estimators():
# Test if a warm-started second fit with smaller n_estimators raises an error.
X, y = make_hastie_10_2(n_samples=20, random_state=1)
clf = BaggingClassifier(n_estimators=5, warm_start=True)
clf.fit(X, y)
clf.set_params(n_estimators=4)
assert_raises(ValueError, clf.fit, X, y)
def test_warm_start_equal_n_estimators():
# Test that nothing happens when fitting without increasing n_estimators
X, y = make_hastie_10_2(n_samples=20, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=83)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# modify X to nonsense values, this should not change anything
X_train += 1.
assert_warns_message(UserWarning,
"Warm-start fitting without increasing n_estimators does not",
clf.fit, X_train, y_train)
assert_array_equal(y_pred, clf.predict(X_test))
def test_warm_start_equivalence():
# warm started classifier with 5+5 estimators should be equivalent to
# one classifier with 10 estimators
X, y = make_hastie_10_2(n_samples=20, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
clf_ws = BaggingClassifier(n_estimators=5, warm_start=True,
random_state=3141)
clf_ws.fit(X_train, y_train)
clf_ws.set_params(n_estimators=10)
clf_ws.fit(X_train, y_train)
y1 = clf_ws.predict(X_test)
clf = BaggingClassifier(n_estimators=10, warm_start=False,
random_state=3141)
clf.fit(X_train, y_train)
y2 = clf.predict(X_test)
assert_array_almost_equal(y1, y2)
def test_warm_start_with_oob_score_fails():
# Check using oob_score and warm_start simultaneously fails
X, y = make_hastie_10_2(n_samples=20, random_state=1)
clf = BaggingClassifier(n_estimators=5, warm_start=True, oob_score=True)
assert_raises(ValueError, clf.fit, X, y)
def test_oob_score_removed_on_warm_start():
X, y = make_hastie_10_2(n_samples=2000, random_state=1)
clf = BaggingClassifier(n_estimators=50, oob_score=True)
clf.fit(X, y)
clf.set_params(warm_start=True, oob_score=False, n_estimators=100)
clf.fit(X, y)
assert_raises(AttributeError, getattr, clf, "oob_score_")
def test_oob_score_consistency():
# Make sure OOB scores are identical when random_state, estimator, and
# training data are fixed and fitting is done twice
X, y = make_hastie_10_2(n_samples=200, random_state=1)
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
max_features=0.5, oob_score=True,
random_state=1)
assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_
def test_estimators_samples():
# Check that format of estimators_samples_ is correct and that results
# generated at fit time can be identically reproduced at a later time
# using data saved in object attributes.
X, y = make_hastie_10_2(n_samples=200, random_state=1)
bagging = BaggingClassifier(LogisticRegression(), max_samples=0.5,
max_features=0.5, random_state=1,
bootstrap=False)
bagging.fit(X, y)
# Get relevant attributes
estimators_samples = bagging.estimators_samples_
estimators_features = bagging.estimators_features_
estimators = bagging.estimators_
# Test for correct formatting
assert len(estimators_samples) == len(estimators)
assert len(estimators_samples[0]) == len(X) // 2
assert estimators_samples[0].dtype.kind == 'i'
# Re-fit single estimator to test for consistent sampling
estimator_index = 0
estimator_samples = estimators_samples[estimator_index]
estimator_features = estimators_features[estimator_index]
estimator = estimators[estimator_index]
X_train = (X[estimator_samples])[:, estimator_features]
y_train = y[estimator_samples]
orig_coefs = estimator.coef_
estimator.fit(X_train, y_train)
new_coefs = estimator.coef_
assert_array_almost_equal(orig_coefs, new_coefs)
def test_estimators_samples_deterministic():
# This test is a regression test to check that with a random step
# (e.g. SparseRandomProjection) and a given random state, the results
# generated at fit time can be identically reproduced at a later time using
# data saved in object attributes. Check issue #9524 for full discussion.
iris = load_iris()
X, y = iris.data, iris.target
base_pipeline = make_pipeline(SparseRandomProjection(n_components=2),
LogisticRegression())
clf = BaggingClassifier(base_estimator=base_pipeline,
max_samples=0.5,
random_state=0)
clf.fit(X, y)
pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy()
estimator = clf.estimators_[0]
estimator_sample = clf.estimators_samples_[0]
estimator_feature = clf.estimators_features_[0]
X_train = (X[estimator_sample])[:, estimator_feature]
y_train = y[estimator_sample]
estimator.fit(X_train, y_train)
assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef)
def test_max_samples_consistency():
# Make sure validated max_samples and original max_samples are identical
# when valid integer max_samples supplied by user
max_samples = 100
X, y = make_hastie_10_2(n_samples=2*max_samples, random_state=1)
bagging = BaggingClassifier(KNeighborsClassifier(),
max_samples=max_samples,
max_features=0.5, random_state=1)
bagging.fit(X, y)
assert bagging._max_samples == max_samples
def test_set_oob_score_label_encoding():
# Make sure the oob_score doesn't change when the labels change
# See: https://github.com/scikit-learn/scikit-learn/issues/8933
random_state = 5
X = [[-1], [0], [1]] * 5
Y1 = ['A', 'B', 'C'] * 5
Y2 = [-1, 0, 1] * 5
Y3 = [0, 1, 2] * 5
x1 = BaggingClassifier(oob_score=True,
random_state=random_state).fit(X, Y1).oob_score_
x2 = BaggingClassifier(oob_score=True,
random_state=random_state).fit(X, Y2).oob_score_
x3 = BaggingClassifier(oob_score=True,
random_state=random_state).fit(X, Y3).oob_score_
assert [x1, x2] == [x3, x3]
def replace(X):
X = X.astype('float', copy=True)
X[~np.isfinite(X)] = 0
return X
def test_bagging_regressor_with_missing_inputs():
# Check that BaggingRegressor can accept X with missing/infinite data
X = np.array([
[1, 3, 5],
[2, None, 6],
[2, np.nan, 6],
[2, np.inf, 6],
[2, np.NINF, 6],
])
y_values = [
np.array([2, 3, 3, 3, 3]),
np.array([
[2, 1, 9],
[3, 6, 8],
[3, 6, 8],
[3, 6, 8],
[3, 6, 8],
])
]
for y in y_values:
regressor = DecisionTreeRegressor()
pipeline = make_pipeline(
FunctionTransformer(replace), regressor
)
pipeline.fit(X, y).predict(X)
bagging_regressor = BaggingRegressor(pipeline)
y_hat = bagging_regressor.fit(X, y).predict(X)
assert y.shape == y_hat.shape
# Verify that exceptions can be raised by wrapper regressor
regressor = DecisionTreeRegressor()
pipeline = make_pipeline(regressor)
assert_raises(ValueError, pipeline.fit, X, y)
bagging_regressor = BaggingRegressor(pipeline)
assert_raises(ValueError, bagging_regressor.fit, X, y)
def test_bagging_classifier_with_missing_inputs():
# Check that BaggingClassifier can accept X with missing/infinite data
X = np.array([
[1, 3, 5],
[2, None, 6],
[2, np.nan, 6],
[2, np.inf, 6],
[2, np.NINF, 6],
])
y = np.array([3, 6, 6, 6, 6])
classifier = DecisionTreeClassifier()
pipeline = make_pipeline(
FunctionTransformer(replace), classifier
)
pipeline.fit(X, y).predict(X)
bagging_classifier = BaggingClassifier(pipeline)
bagging_classifier.fit(X, y)
y_hat = bagging_classifier.predict(X)
assert y.shape == y_hat.shape
bagging_classifier.predict_log_proba(X)
bagging_classifier.predict_proba(X)
# Verify that exceptions can be raised by wrapper classifier
classifier = DecisionTreeClassifier()
pipeline = make_pipeline(classifier)
assert_raises(ValueError, pipeline.fit, X, y)
bagging_classifier = BaggingClassifier(pipeline)
assert_raises(ValueError, bagging_classifier.fit, X, y)
def test_bagging_small_max_features():
# Check that Bagging estimator can accept low fractional max_features
X = np.array([[1, 2], [3, 4]])
y = np.array([1, 0])
bagging = BaggingClassifier(LogisticRegression(),
max_features=0.3, random_state=1)
bagging.fit(X, y)
def test_bagging_get_estimators_indices():
# Check that Bagging estimator can generate sample indices properly
# Non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/16436
rng = np.random.RandomState(0)
X = rng.randn(13, 4)
y = np.arange(13)
class MyEstimator(DecisionTreeRegressor):
"""An estimator which stores y indices information at fit."""
def fit(self, X, y):
self._sample_indices = y
clf = BaggingRegressor(base_estimator=MyEstimator(),
n_estimators=1, random_state=0)
clf.fit(X, y)
assert_array_equal(clf.estimators_[0]._sample_indices,
clf.estimators_samples_[0])

View file

@ -0,0 +1,127 @@
"""
Testing for the base module (sklearn.ensemble.base).
"""
# Authors: Gilles Louppe
# License: BSD 3 clause
import numpy as np
from sklearn.utils._testing import assert_raise_message
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble._base import _set_random_states
from sklearn.linear_model import Perceptron
from collections import OrderedDict
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
def test_base():
# Check BaseEnsemble methods.
ensemble = BaggingClassifier(
base_estimator=Perceptron(random_state=None), n_estimators=3)
iris = load_iris()
ensemble.fit(iris.data, iris.target)
ensemble.estimators_ = [] # empty the list and create estimators manually
ensemble._make_estimator()
random_state = np.random.RandomState(3)
ensemble._make_estimator(random_state=random_state)
ensemble._make_estimator(random_state=random_state)
ensemble._make_estimator(append=False)
assert 3 == len(ensemble)
assert 3 == len(ensemble.estimators_)
assert isinstance(ensemble[0], Perceptron)
assert ensemble[0].random_state is None
assert isinstance(ensemble[1].random_state, int)
assert isinstance(ensemble[2].random_state, int)
assert ensemble[1].random_state != ensemble[2].random_state
np_int_ensemble = BaggingClassifier(base_estimator=Perceptron(),
n_estimators=np.int32(3))
np_int_ensemble.fit(iris.data, iris.target)
def test_base_zero_n_estimators():
# Check that instantiating a BaseEnsemble with n_estimators<=0 raises
# a ValueError.
ensemble = BaggingClassifier(base_estimator=Perceptron(),
n_estimators=0)
iris = load_iris()
assert_raise_message(ValueError,
"n_estimators must be greater than zero, got 0.",
ensemble.fit, iris.data, iris.target)
def test_base_not_int_n_estimators():
# Check that instantiating a BaseEnsemble with a string as n_estimators
# raises a ValueError demanding n_estimators to be supplied as an integer.
string_ensemble = BaggingClassifier(base_estimator=Perceptron(),
n_estimators='3')
iris = load_iris()
assert_raise_message(ValueError,
"n_estimators must be an integer",
string_ensemble.fit, iris.data, iris.target)
float_ensemble = BaggingClassifier(base_estimator=Perceptron(),
n_estimators=3.0)
assert_raise_message(ValueError,
"n_estimators must be an integer",
float_ensemble.fit, iris.data, iris.target)
def test_set_random_states():
# Linear Discriminant Analysis doesn't have random state: smoke test
_set_random_states(LinearDiscriminantAnalysis(), random_state=17)
clf1 = Perceptron(random_state=None)
assert clf1.random_state is None
# check that random_state=None still sets an integer random_state
_set_random_states(clf1, None)
assert isinstance(clf1.random_state, int)
# check that fixing random_state results in consistent initialisation
_set_random_states(clf1, 3)
assert isinstance(clf1.random_state, int)
clf2 = Perceptron(random_state=None)
_set_random_states(clf2, 3)
assert clf1.random_state == clf2.random_state
# nested random_state
def make_steps():
return [('sel', SelectFromModel(Perceptron(random_state=None))),
('clf', Perceptron(random_state=None))]
est1 = Pipeline(make_steps())
_set_random_states(est1, 3)
assert isinstance(est1.steps[0][1].estimator.random_state, int)
assert isinstance(est1.steps[1][1].random_state, int)
assert (est1.get_params()['sel__estimator__random_state'] !=
est1.get_params()['clf__random_state'])
# ensure multiple random_state parameters are invariant to get_params()
# iteration order
class AlphaParamPipeline(Pipeline):
def get_params(self, *args, **kwargs):
params = Pipeline.get_params(self, *args, **kwargs).items()
return OrderedDict(sorted(params))
class RevParamPipeline(Pipeline):
def get_params(self, *args, **kwargs):
params = Pipeline.get_params(self, *args, **kwargs).items()
return OrderedDict(sorted(params, reverse=True))
for cls in [AlphaParamPipeline, RevParamPipeline]:
est2 = cls(make_steps())
_set_random_states(est2, 3)
assert (est1.get_params()['sel__estimator__random_state'] ==
est2.get_params()['sel__estimator__random_state'])
assert (est1.get_params()['clf__random_state'] ==
est2.get_params()['clf__random_state'])

View file

@ -0,0 +1,172 @@
import pytest
from sklearn.base import clone
from sklearn.base import ClassifierMixin
from sklearn.base import is_classifier
from sklearn.datasets import make_classification
from sklearn.datasets import make_regression
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor
@pytest.mark.parametrize(
"X, y, estimator",
[(*make_classification(n_samples=10),
StackingClassifier(estimators=[('lr', LogisticRegression()),
('svm', LinearSVC()),
('rf', RandomForestClassifier())])),
(*make_classification(n_samples=10),
VotingClassifier(estimators=[('lr', LogisticRegression()),
('svm', LinearSVC()),
('rf', RandomForestClassifier())])),
(*make_regression(n_samples=10),
StackingRegressor(estimators=[('lr', LinearRegression()),
('svm', LinearSVR()),
('rf', RandomForestRegressor())])),
(*make_regression(n_samples=10),
VotingRegressor(estimators=[('lr', LinearRegression()),
('svm', LinearSVR()),
('rf', RandomForestRegressor())]))],
ids=['stacking-classifier', 'voting-classifier',
'stacking-regressor', 'voting-regressor']
)
def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator):
# check that the behavior of `estimators`, `estimators_`,
# `named_estimators`, `named_estimators_` is consistent across all
# ensemble classes and when using `set_params()`.
# before fit
assert 'svm' in estimator.named_estimators
assert estimator.named_estimators.svm is estimator.estimators[1][1]
assert estimator.named_estimators.svm is estimator.named_estimators['svm']
# check fitted attributes
estimator.fit(X, y)
assert len(estimator.named_estimators) == 3
assert len(estimator.named_estimators_) == 3
assert (sorted(list(estimator.named_estimators_.keys())) ==
sorted(['lr', 'svm', 'rf']))
# check that set_params() does not add a new attribute
estimator_new_params = clone(estimator)
svm_estimator = SVC() if is_classifier(estimator) else SVR()
estimator_new_params.set_params(svm=svm_estimator).fit(X, y)
assert not hasattr(estimator_new_params, 'svm')
assert (estimator_new_params.named_estimators.lr.get_params() ==
estimator.named_estimators.lr.get_params())
assert (estimator_new_params.named_estimators.rf.get_params() ==
estimator.named_estimators.rf.get_params())
# check the behavior when setting and dropping an estimator
estimator_dropped = clone(estimator)
estimator_dropped.set_params(svm='drop')
estimator_dropped.fit(X, y)
assert len(estimator_dropped.named_estimators) == 3
assert estimator_dropped.named_estimators.svm == 'drop'
assert len(estimator_dropped.named_estimators_) == 3
assert (sorted(list(estimator_dropped.named_estimators_.keys())) ==
sorted(['lr', 'svm', 'rf']))
for sub_est in estimator_dropped.named_estimators_:
# check that the correspondence is correct
assert not isinstance(sub_est, type(estimator.named_estimators.svm))
# check that we can set the parameters of the underlying classifier
estimator.set_params(svm__C=10.0)
estimator.set_params(rf__max_depth=5)
assert (estimator.get_params()['svm__C'] ==
estimator.get_params()['svm'].get_params()['C'])
assert (estimator.get_params()['rf__max_depth'] ==
estimator.get_params()['rf'].get_params()['max_depth'])
@pytest.mark.parametrize(
"Ensemble",
[StackingClassifier, VotingClassifier, StackingRegressor, VotingRegressor]
)
def test_ensemble_heterogeneous_estimators_type(Ensemble):
# check that ensemble will fail during validation if the underlying
# estimators are not of the same type (i.e. classifier or regressor)
if issubclass(Ensemble, ClassifierMixin):
X, y = make_classification(n_samples=10)
estimators = [('lr', LinearRegression())]
ensemble_type = 'classifier'
else:
X, y = make_regression(n_samples=10)
estimators = [('lr', LogisticRegression())]
ensemble_type = 'regressor'
ensemble = Ensemble(estimators=estimators)
err_msg = "should be a {}".format(ensemble_type)
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
@pytest.mark.parametrize(
"X, y, Ensemble",
[(*make_classification(n_samples=10), StackingClassifier),
(*make_classification(n_samples=10), VotingClassifier),
(*make_regression(n_samples=10), StackingRegressor),
(*make_regression(n_samples=10), VotingRegressor)]
)
def test_ensemble_heterogeneous_estimators_name_validation(X, y, Ensemble):
# raise an error when the name contains dunder
if issubclass(Ensemble, ClassifierMixin):
estimators = [('lr__', LogisticRegression())]
else:
estimators = [('lr__', LinearRegression())]
ensemble = Ensemble(estimators=estimators)
err_msg = r"Estimator names must not contain __: got \['lr__'\]"
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
# raise an error when the name is not unique
if issubclass(Ensemble, ClassifierMixin):
estimators = [('lr', LogisticRegression()),
('lr', LogisticRegression())]
else:
estimators = [('lr', LinearRegression()),
('lr', LinearRegression())]
ensemble = Ensemble(estimators=estimators)
err_msg = r"Names provided are not unique: \['lr', 'lr'\]"
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
# raise an error when the name conflicts with the parameters
if issubclass(Ensemble, ClassifierMixin):
estimators = [('estimators', LogisticRegression())]
else:
estimators = [('estimators', LinearRegression())]
ensemble = Ensemble(estimators=estimators)
err_msg = "Estimator names conflict with constructor arguments"
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
@pytest.mark.parametrize(
"X, y, estimator",
[(*make_classification(n_samples=10),
StackingClassifier(estimators=[('lr', LogisticRegression())])),
(*make_classification(n_samples=10),
VotingClassifier(estimators=[('lr', LogisticRegression())])),
(*make_regression(n_samples=10),
StackingRegressor(estimators=[('lr', LinearRegression())])),
(*make_regression(n_samples=10),
VotingRegressor(estimators=[('lr', LinearRegression())]))],
ids=['stacking-classifier', 'voting-classifier',
'stacking-regressor', 'voting-regressor']
)
def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator):
# check that we raise a consistent error when all estimators are
# dropped
estimator.set_params(lr='drop')
with pytest.raises(ValueError, match="All estimators are dropped."):
estimator.fit(X, y)

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,343 @@
"""
Testing for the gradient boosting loss functions and initial estimators.
"""
import numpy as np
from numpy.testing import assert_almost_equal
from numpy.testing import assert_allclose
import pytest
from sklearn.utils import check_random_state
from sklearn.utils.stats import _weighted_percentile
from sklearn.ensemble._gb_losses import RegressionLossFunction
from sklearn.ensemble._gb_losses import LeastSquaresError
from sklearn.ensemble._gb_losses import LeastAbsoluteError
from sklearn.ensemble._gb_losses import HuberLossFunction
from sklearn.ensemble._gb_losses import QuantileLossFunction
from sklearn.ensemble._gb_losses import BinomialDeviance
from sklearn.ensemble._gb_losses import MultinomialDeviance
from sklearn.ensemble._gb_losses import ExponentialLoss
from sklearn.ensemble._gb_losses import LOSS_FUNCTIONS
def test_binomial_deviance():
# Check binomial deviance loss.
# Check against alternative definitions in ESLII.
bd = BinomialDeviance(2)
# pred has the same BD for y in {0, 1}
assert (bd(np.array([0.0]), np.array([0.0])) ==
bd(np.array([1.0]), np.array([0.0])))
assert_almost_equal(bd(np.array([1.0, 1.0, 1.0]),
np.array([100.0, 100.0, 100.0])),
0.0)
assert_almost_equal(bd(np.array([1.0, 0.0, 0.0]),
np.array([100.0, -100.0, -100.0])), 0)
# check if same results as alternative definition of deviance (from ESLII)
def alt_dev(y, pred):
return np.mean(np.logaddexp(0.0, -2.0 * (2.0 * y - 1) * pred))
test_data = [(np.array([1.0, 1.0, 1.0]), np.array([100.0, 100.0, 100.0])),
(np.array([0.0, 0.0, 0.0]), np.array([100.0, 100.0, 100.0])),
(np.array([0.0, 0.0, 0.0]),
np.array([-100.0, -100.0, -100.0])),
(np.array([1.0, 1.0, 1.0]),
np.array([-100.0, -100.0, -100.0]))]
for datum in test_data:
assert_almost_equal(bd(*datum), alt_dev(*datum))
    # check the negative gradient against the alternative formula below
def alt_ng(y, pred):
return (2 * y - 1) / (1 + np.exp(2 * (2 * y - 1) * pred))
for datum in test_data:
assert_almost_equal(bd.negative_gradient(*datum), alt_ng(*datum))
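# Note on the comparison above: sklearn's per-sample binomial deviance,
# 2 * (log(1 + exp(pred)) - y * pred), and the ESLII form
# log(1 + exp(-2 * (2*y - 1) * pred)) are not identical for moderate raw
# predictions, but both tend to 0 for a confidently correct prediction and to
# 2 * |pred| for a confidently wrong one, which is why only pred = +/-100 is
# compared. Quick numeric illustration of that reading (a sketch, not library code):
f = 100.0
# y = 1, correct and confident: both ~0
assert_almost_equal(2 * (np.logaddexp(0, f) - 1 * f),
                    np.logaddexp(0, -2 * (2 * 1 - 1) * f))
# y = 0, wrong and confident: both ~2 * |pred| = 200
assert_almost_equal(2 * (np.logaddexp(0, f) - 0 * f),
                    np.logaddexp(0, -2 * (2 * 0 - 1) * f))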
def test_sample_weight_smoke():
rng = check_random_state(13)
y = rng.rand(100)
pred = rng.rand(100)
# least squares
loss = LeastSquaresError(1)
loss_wo_sw = loss(y, pred)
loss_w_sw = loss(y, pred, np.ones(pred.shape[0], dtype=np.float32))
assert_almost_equal(loss_wo_sw, loss_w_sw)
def test_sample_weight_init_estimators():
# Smoke test for init estimators with sample weights.
rng = check_random_state(13)
X = rng.rand(100, 2)
sample_weight = np.ones(100)
reg_y = rng.rand(100)
clf_y = rng.randint(0, 2, size=100)
for Loss in LOSS_FUNCTIONS.values():
if Loss is None:
continue
if issubclass(Loss, RegressionLossFunction):
k = 1
y = reg_y
else:
k = 2
y = clf_y
if Loss.is_multi_class:
# skip multiclass
continue
loss = Loss(k)
init_est = loss.init_estimator()
init_est.fit(X, y)
out = loss.get_init_raw_predictions(X, init_est)
assert out.shape == (y.shape[0], 1)
sw_init_est = loss.init_estimator()
sw_init_est.fit(X, y, sample_weight=sample_weight)
sw_out = loss.get_init_raw_predictions(X, sw_init_est)
assert sw_out.shape == (y.shape[0], 1)
# check if predictions match
assert_allclose(out, sw_out, rtol=1e-2)
def test_weighted_percentile():
y = np.empty(102, dtype=np.float64)
y[:50] = 0
y[-51:] = 2
y[-1] = 100000
y[50] = 1
sw = np.ones(102, dtype=np.float64)
sw[-1] = 0.0
score = _weighted_percentile(y, sw, 50)
assert score == 1
def test_weighted_percentile_equal():
y = np.empty(102, dtype=np.float64)
y.fill(0.0)
sw = np.ones(102, dtype=np.float64)
sw[-1] = 0.0
score = _weighted_percentile(y, sw, 50)
assert score == 0
def test_weighted_percentile_zero_weight():
y = np.empty(102, dtype=np.float64)
y.fill(1.0)
sw = np.ones(102, dtype=np.float64)
sw.fill(0.0)
score = _weighted_percentile(y, sw, 50)
assert score == 1.0
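# For reference, a naive way to obtain the weighted percentiles checked above:
# sort the values, accumulate the sorted weights, and return the first value
# whose cumulative weight reaches the requested fraction of the total weight.
# This is only an illustrative sketch, not the _weighted_percentile helper,
# but it reproduces the three scores asserted above.
def naive_weighted_percentile(values, weights, percentile=50):
    values = np.asarray(values, dtype=np.float64)
    weights = np.asarray(weights, dtype=np.float64)
    order = np.argsort(values)
    cum_weights = np.cumsum(weights[order])
    threshold = (percentile / 100.0) * cum_weights[-1]
    return values[order][np.searchsorted(cum_weights, threshold)]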
def test_quantile_loss_function():
    # Non-regression test for the QuantileLossFunction object
# There was a sign problem when evaluating the function
# for negative values of 'ytrue - ypred'
x = np.asarray([-1.0, 0.0, 1.0])
y_found = QuantileLossFunction(1, 0.9)(x, np.zeros_like(x))
y_expected = np.asarray([0.1, 0.0, 0.9]).mean()
np.testing.assert_allclose(y_found, y_expected)
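# The expected value above is the mean pinball (quantile) loss at alpha=0.9:
# each residual r = y_true - y_pred contributes alpha * r when r > 0 and
# (1 - alpha) * (-r) otherwise, giving 0.1, 0.0 and 0.9 for r = -1, 0, 1.
# Standalone sketch of that formula (illustrative only, not the loss object):
def naive_pinball_loss(y_true, y_pred, alpha):
    r = y_true - y_pred
    return np.mean(np.where(r > 0, alpha * r, (alpha - 1) * r))

assert_allclose(naive_pinball_loss(np.asarray([-1.0, 0.0, 1.0]),
                                   np.zeros(3), alpha=0.9),
                np.asarray([0.1, 0.0, 0.9]).mean())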
def test_sample_weight_deviance():
# Test if deviance supports sample weights.
rng = check_random_state(13)
sample_weight = np.ones(100)
reg_y = rng.rand(100)
clf_y = rng.randint(0, 2, size=100)
mclf_y = rng.randint(0, 3, size=100)
for Loss in LOSS_FUNCTIONS.values():
if Loss is None:
continue
if issubclass(Loss, RegressionLossFunction):
k = 1
y = reg_y
p = reg_y
else:
k = 2
y = clf_y
p = clf_y
if Loss.is_multi_class:
k = 3
y = mclf_y
# one-hot encoding
p = np.zeros((y.shape[0], k), dtype=np.float64)
for i in range(k):
p[:, i] = y == i
loss = Loss(k)
deviance_w_w = loss(y, p, sample_weight)
deviance_wo_w = loss(y, p)
assert deviance_wo_w == deviance_w_w
@pytest.mark.parametrize(
'n_classes, n_samples', [(3, 100), (5, 57), (7, 13)]
)
def test_multinomial_deviance(n_classes, n_samples):
# Check multinomial deviance with and without sample weights.
rng = np.random.RandomState(13)
sample_weight = np.ones(n_samples)
y_true = rng.randint(0, n_classes, size=n_samples)
y_pred = np.zeros((n_samples, n_classes), dtype=np.float64)
for klass in range(y_pred.shape[1]):
y_pred[:, klass] = y_true == klass
loss = MultinomialDeviance(n_classes)
loss_wo_sw = loss(y_true, y_pred)
assert loss_wo_sw > 0
loss_w_sw = loss(y_true, y_pred, sample_weight=sample_weight)
assert loss_wo_sw == pytest.approx(loss_w_sw)
# Multinomial deviance uses weighted average loss rather than
# weighted sum loss, so we make sure that the value remains the same
    # when we divide the weights by 2.
loss_w_sw = loss(y_true, y_pred, sample_weight=0.5 * sample_weight)
assert loss_wo_sw == pytest.approx(loss_w_sw)
def test_mdl_computation_weighted():
raw_predictions = np.array([[1., -1., -.1], [-2., 1., 2.]])
y_true = np.array([0, 1])
weights = np.array([1, 3])
expected_loss = 1.0909323
# MultinomialDeviance loss computation with weights.
loss = MultinomialDeviance(3)
assert (loss(y_true, raw_predictions, weights)
== pytest.approx(expected_loss))
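# Where the 1.0909323 constant above comes from, assuming MultinomialDeviance
# is the weighted average of -log softmax(raw_predictions)[true class]
# (a by-hand sketch, not the library implementation):
raw = np.array([[1., -1., -.1], [-2., 1., 2.]])
proba = np.exp(raw) / np.exp(raw).sum(axis=1, keepdims=True)  # row-wise softmax
nll = -np.log(proba[np.arange(2), np.array([0, 1])])          # per-sample -log p_true
assert np.average(nll, weights=np.array([1, 3])) == pytest.approx(1.0909323)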
@pytest.mark.parametrize('n', [0, 1, 2])
def test_mdl_exception(n):
# Check that MultinomialDeviance throws an exception when n_classes <= 2
err_msg = 'MultinomialDeviance requires more than 2 classes.'
with pytest.raises(ValueError, match=err_msg):
MultinomialDeviance(n)
def test_init_raw_predictions_shapes():
# Make sure get_init_raw_predictions returns float64 arrays with shape
# (n_samples, K) where K is 1 for binary classification and regression, and
# K = n_classes for multiclass classification
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 5))
y = rng.normal(size=n_samples)
for loss in (LeastSquaresError(n_classes=1),
LeastAbsoluteError(n_classes=1),
QuantileLossFunction(n_classes=1),
HuberLossFunction(n_classes=1)):
init_estimator = loss.init_estimator().fit(X, y)
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
assert raw_predictions.shape == (n_samples, 1)
assert raw_predictions.dtype == np.float64
y = rng.randint(0, 2, size=n_samples)
for loss in (BinomialDeviance(n_classes=2),
ExponentialLoss(n_classes=2)):
init_estimator = loss.init_estimator().fit(X, y)
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
assert raw_predictions.shape == (n_samples, 1)
assert raw_predictions.dtype == np.float64
for n_classes in range(3, 5):
y = rng.randint(0, n_classes, size=n_samples)
loss = MultinomialDeviance(n_classes=n_classes)
init_estimator = loss.init_estimator().fit(X, y)
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
assert raw_predictions.shape == (n_samples, n_classes)
assert raw_predictions.dtype == np.float64
def test_init_raw_predictions_values():
# Make sure the get_init_raw_predictions() returns the expected values for
# each loss.
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 5))
y = rng.normal(size=n_samples)
# Least squares loss
loss = LeastSquaresError(n_classes=1)
init_estimator = loss.init_estimator().fit(X, y)
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
# Make sure baseline prediction is the mean of all targets
assert_almost_equal(raw_predictions, y.mean())
# Least absolute and huber loss
for Loss in (LeastAbsoluteError, HuberLossFunction):
loss = Loss(n_classes=1)
init_estimator = loss.init_estimator().fit(X, y)
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
# Make sure baseline prediction is the median of all targets
assert_almost_equal(raw_predictions, np.median(y))
# Quantile loss
for alpha in (.1, .5, .9):
loss = QuantileLossFunction(n_classes=1, alpha=alpha)
init_estimator = loss.init_estimator().fit(X, y)
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
# Make sure baseline prediction is the alpha-quantile of all targets
assert_almost_equal(raw_predictions, np.percentile(y, alpha * 100))
y = rng.randint(0, 2, size=n_samples)
# Binomial deviance
loss = BinomialDeviance(n_classes=2)
init_estimator = loss.init_estimator().fit(X, y)
# Make sure baseline prediction is equal to link_function(p), where p
# is the proba of the positive class. We want predict_proba() to return p,
# and by definition
# p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction)
# So we want raw_prediction = link_function(p) = log(p / (1 - p))
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
p = y.mean()
assert_almost_equal(raw_predictions, np.log(p / (1 - p)))
# Exponential loss
loss = ExponentialLoss(n_classes=2)
init_estimator = loss.init_estimator().fit(X, y)
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
p = y.mean()
assert_almost_equal(raw_predictions, .5 * np.log(p / (1 - p)))
# Multinomial deviance loss
for n_classes in range(3, 5):
y = rng.randint(0, n_classes, size=n_samples)
loss = MultinomialDeviance(n_classes=n_classes)
init_estimator = loss.init_estimator().fit(X, y)
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
for k in range(n_classes):
p = (y == k).mean()
assert_almost_equal(raw_predictions[:, k], np.log(p))
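# All of the checks above invert the loss's link function at the empirical
# class frequencies: for binomial deviance raw = log(p / (1 - p)) so that
# sigmoid(raw) recovers p, the exponential loss uses half that log-odds, and
# multinomial deviance uses raw_k = log(p_k). Tiny check of the binomial
# identity (illustrative only):
from scipy.special import expit  # the sigmoid / inverse link

p = 0.37
assert_almost_equal(expit(np.log(p / (1 - p))), p)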
@pytest.mark.parametrize('seed', range(5))
def test_lad_equals_quantile_50(seed):
# Make sure quantile loss with alpha = .5 is equivalent to LAD
lad = LeastAbsoluteError(n_classes=1)
ql = QuantileLossFunction(n_classes=1, alpha=0.5)
n_samples = 50
rng = np.random.RandomState(seed)
raw_predictions = rng.normal(size=(n_samples))
y_true = rng.normal(size=(n_samples))
lad_loss = lad(y_true, raw_predictions)
ql_loss = ql(y_true, raw_predictions)
assert_almost_equal(lad_loss, 2 * ql_loss)
weights = np.linspace(0, 1, n_samples) ** 2
lad_weighted_loss = lad(y_true, raw_predictions, sample_weight=weights)
ql_weighted_loss = ql(y_true, raw_predictions, sample_weight=weights)
assert_almost_equal(lad_weighted_loss, 2 * ql_weighted_loss)
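# Why the factor of 2 above: with alpha = 0.5 the pinball loss is
# 0.5 * |y_true - pred| for every sample, i.e. exactly half the absolute
# error used by LAD. Quick standalone check of that identity (illustrative only):
residuals = np.array([-3.0, -0.5, 0.0, 2.0])
pinball_05 = np.where(residuals > 0, 0.5 * residuals, -0.5 * residuals)
assert_allclose(np.abs(residuals), 2 * pinball_05)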

View file

@ -0,0 +1,358 @@
"""
Testing for Isolation Forest algorithm (sklearn.ensemble.iforest).
"""
# Authors: Nicolas Goix <nicolas.goix@telecom-paristech.fr>
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
# License: BSD 3 clause
import pytest
import numpy as np
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_raises
from sklearn.utils._testing import assert_warns_message
from sklearn.utils._testing import ignore_warnings
from sklearn.utils._testing import assert_allclose
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import IsolationForest
from sklearn.ensemble._iforest import _average_path_length
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes, load_iris
from sklearn.utils import check_random_state
from sklearn.metrics import roc_auc_score
from scipy.sparse import csc_matrix, csr_matrix
from unittest.mock import Mock, patch
rng = check_random_state(0)
# load the iris dataset
# and randomly permute it
iris = load_iris()
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]
# also load the diabetes dataset
# and randomly permute it
diabetes = load_diabetes()
perm = rng.permutation(diabetes.target.size)
diabetes.data = diabetes.data[perm]
diabetes.target = diabetes.target[perm]
def test_iforest():
"""Check Isolation Forest for various parameter settings."""
X_train = np.array([[0, 1], [1, 2]])
X_test = np.array([[2, 1], [1, 1]])
grid = ParameterGrid({"n_estimators": [3],
"max_samples": [0.5, 1.0, 3],
"bootstrap": [True, False]})
with ignore_warnings():
for params in grid:
IsolationForest(random_state=rng,
**params).fit(X_train).predict(X_test)
def test_iforest_sparse():
"""Check IForest for various parameter settings on sparse input."""
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50],
diabetes.target[:50],
random_state=rng)
grid = ParameterGrid({"max_samples": [0.5, 1.0],
"bootstrap": [True, False]})
for sparse_format in [csc_matrix, csr_matrix]:
X_train_sparse = sparse_format(X_train)
X_test_sparse = sparse_format(X_test)
for params in grid:
# Trained on sparse format
sparse_classifier = IsolationForest(
n_estimators=10, random_state=1, **params).fit(X_train_sparse)
sparse_results = sparse_classifier.predict(X_test_sparse)
# Trained on dense format
dense_classifier = IsolationForest(
n_estimators=10, random_state=1, **params).fit(X_train)
dense_results = dense_classifier.predict(X_test)
assert_array_equal(sparse_results, dense_results)
def test_iforest_error():
"""Test that it gives proper exception on deficient input."""
X = iris.data
# Test max_samples
assert_raises(ValueError,
IsolationForest(max_samples=-1).fit, X)
assert_raises(ValueError,
IsolationForest(max_samples=0.0).fit, X)
assert_raises(ValueError,
IsolationForest(max_samples=2.0).fit, X)
# The dataset has less than 256 samples, explicitly setting
# max_samples > n_samples should result in a warning. If not set
# explicitly there should be no warning
assert_warns_message(UserWarning,
"max_samples will be set to n_samples for estimation",
IsolationForest(max_samples=1000).fit, X)
# note that assert_no_warnings does not apply since it enables a
# PendingDeprecationWarning triggered by scipy.sparse's use of
# np.matrix. See issue #11251.
with pytest.warns(None) as record:
IsolationForest(max_samples='auto').fit(X)
user_warnings = [each for each in record
if issubclass(each.category, UserWarning)]
assert len(user_warnings) == 0
with pytest.warns(None) as record:
IsolationForest(max_samples=np.int64(2)).fit(X)
user_warnings = [each for each in record
if issubclass(each.category, UserWarning)]
assert len(user_warnings) == 0
assert_raises(ValueError, IsolationForest(max_samples='foobar').fit, X)
assert_raises(ValueError, IsolationForest(max_samples=1.5).fit, X)
# test X_test n_features match X_train one:
assert_raises(ValueError, IsolationForest().fit(X).predict, X[:, 1:])
# test that behaviour='old' will raise an error
msg = "The old behaviour of IsolationForest is not implemented anymore."
with pytest.raises(NotImplementedError, match=msg):
IsolationForest(behaviour='old').fit(X)
def test_recalculate_max_depth():
"""Check max_depth recalculation when max_samples is reset to n_samples"""
X = iris.data
clf = IsolationForest().fit(X)
for est in clf.estimators_:
assert est.max_depth == int(np.ceil(np.log2(X.shape[0])))
def test_max_samples_attribute():
X = iris.data
clf = IsolationForest().fit(X)
assert clf.max_samples_ == X.shape[0]
clf = IsolationForest(max_samples=500)
assert_warns_message(UserWarning,
"max_samples will be set to n_samples for estimation",
clf.fit, X)
assert clf.max_samples_ == X.shape[0]
clf = IsolationForest(max_samples=0.4).fit(X)
assert clf.max_samples_ == 0.4*X.shape[0]
def test_iforest_parallel_regression():
"""Check parallel regression."""
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
diabetes.target,
random_state=rng)
ensemble = IsolationForest(n_jobs=3,
random_state=0).fit(X_train)
ensemble.set_params(n_jobs=1)
y1 = ensemble.predict(X_test)
ensemble.set_params(n_jobs=2)
y2 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y2)
ensemble = IsolationForest(n_jobs=1,
random_state=0).fit(X_train)
y3 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y3)
def test_iforest_performance():
"""Test Isolation Forest performs well"""
# Generate train/test data
rng = check_random_state(2)
X = 0.3 * rng.randn(120, 2)
X_train = np.r_[X + 2, X - 2]
X_train = X[:100]
# Generate some abnormal novel observations
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
X_test = np.r_[X[100:], X_outliers]
y_test = np.array([0] * 20 + [1] * 20)
# fit the model
clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)
# predict scores (the lower, the more normal)
y_pred = - clf.decision_function(X_test)
    # check that outliers are ranked above inliers almost perfectly (ROC AUC > 0.98)
assert roc_auc_score(y_test, y_pred) > 0.98
@pytest.mark.parametrize("contamination", [0.25, "auto"])
def test_iforest_works(contamination):
# toy sample (the last two samples are outliers)
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]
# Test IsolationForest
clf = IsolationForest(random_state=rng, contamination=contamination)
clf.fit(X)
decision_func = -clf.decision_function(X)
pred = clf.predict(X)
# assert detect outliers:
assert np.min(decision_func[-2:]) > np.max(decision_func[:-2])
assert_array_equal(pred, 6 * [1] + 2 * [-1])
def test_max_samples_consistency():
# Make sure validated max_samples in iforest and BaseBagging are identical
X = iris.data
clf = IsolationForest().fit(X)
assert clf.max_samples_ == clf._max_samples
def test_iforest_subsampled_features():
    # Non-regression test for #5732, which failed at predict time.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50],
diabetes.target[:50],
random_state=rng)
clf = IsolationForest(max_features=0.8)
clf.fit(X_train, y_train)
clf.predict(X_test)
def test_iforest_average_path_length():
    # Non-regression test for #8549, which used the wrong average path length
    # formula, strictly for the integer case
# Updated to check average path length when input is <= 2 (issue #11839)
result_one = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0
result_two = 2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0
assert_allclose(_average_path_length([0]), [0.0])
assert_allclose(_average_path_length([1]), [0.0])
assert_allclose(_average_path_length([2]), [1.0])
assert_allclose(_average_path_length([5]), [result_one])
assert_allclose(_average_path_length([999]), [result_two])
assert_allclose(
_average_path_length(np.array([1, 2, 5, 999])),
[0.0, 1.0, result_one, result_two],
)
# _average_path_length is increasing
avg_path_length = _average_path_length(np.arange(5))
assert_array_equal(avg_path_length, np.sort(avg_path_length))
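# The constants above follow the expected path length of an unsuccessful
# binary-search-tree lookup used by Isolation Forest:
# c(n) = 2 * H(n - 1) - 2 * (n - 1) / n for n > 2, with the harmonic number
# H(i) approximated by log(i) + Euler's gamma. Minimal sketch of that formula
# (illustrative, not the private helper itself):
def c_path_length(n):
    return 2.0 * (np.log(n - 1.0) + np.euler_gamma) - 2.0 * (n - 1.0) / n

assert_allclose(_average_path_length(np.array([5, 999])),
                [c_path_length(5), c_path_length(999)])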
def test_score_samples():
X_train = [[1, 1], [1, 2], [2, 1]]
clf1 = IsolationForest(contamination=0.1).fit(X_train)
clf2 = IsolationForest().fit(X_train)
assert_array_equal(clf1.score_samples([[2., 2.]]),
clf1.decision_function([[2., 2.]]) + clf1.offset_)
assert_array_equal(clf2.score_samples([[2., 2.]]),
clf2.decision_function([[2., 2.]]) + clf2.offset_)
assert_array_equal(clf1.score_samples([[2., 2.]]),
clf2.score_samples([[2., 2.]]))
def test_iforest_warm_start():
"""Test iterative addition of iTrees to an iForest """
rng = check_random_state(0)
X = rng.randn(20, 2)
# fit first 10 trees
clf = IsolationForest(n_estimators=10, max_samples=20,
random_state=rng, warm_start=True)
clf.fit(X)
# remember the 1st tree
tree_1 = clf.estimators_[0]
# fit another 10 trees
clf.set_params(n_estimators=20)
clf.fit(X)
# expecting 20 fitted trees and no overwritten trees
assert len(clf.estimators_) == 20
assert clf.estimators_[0] is tree_1
# mock get_chunk_n_rows to actually test more than one chunk (here one
# chunk = 3 rows):
@patch(
"sklearn.ensemble._iforest.get_chunk_n_rows",
side_effect=Mock(**{"return_value": 3}),
)
@pytest.mark.parametrize(
"contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]
)
def test_iforest_chunks_works1(
mocked_get_chunk, contamination, n_predict_calls
):
test_iforest_works(contamination)
assert mocked_get_chunk.call_count == n_predict_calls
# idem with chunk_size = 10 rows
@patch(
"sklearn.ensemble._iforest.get_chunk_n_rows",
side_effect=Mock(**{"return_value": 10}),
)
@pytest.mark.parametrize(
"contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]
)
def test_iforest_chunks_works2(
mocked_get_chunk, contamination, n_predict_calls
):
test_iforest_works(contamination)
assert mocked_get_chunk.call_count == n_predict_calls
def test_iforest_deprecation():
iforest = IsolationForest(behaviour='new')
warn_msg = "'behaviour' is deprecated in 0.22 and will be removed in 0.24"
with pytest.warns(FutureWarning, match=warn_msg):
iforest.fit(iris.data)
def test_iforest_with_uniform_data():
"""Test whether iforest predicts inliers when using uniform data"""
# 2-d array of all 1s
X = np.ones((100, 10))
iforest = IsolationForest()
iforest.fit(X)
rng = np.random.RandomState(0)
assert all(iforest.predict(X) == 1)
assert all(iforest.predict(rng.randn(100, 10)) == 1)
assert all(iforest.predict(X + 1) == 1)
assert all(iforest.predict(X - 1) == 1)
# 2-d array where columns contain the same value across rows
X = np.repeat(rng.randn(1, 10), 100, 0)
iforest = IsolationForest()
iforest.fit(X)
assert all(iforest.predict(X) == 1)
assert all(iforest.predict(rng.randn(100, 10)) == 1)
assert all(iforest.predict(np.ones((100, 10))) == 1)
# Single row
X = rng.randn(1, 10)
iforest = IsolationForest()
iforest.fit(X)
assert all(iforest.predict(X) == 1)
assert all(iforest.predict(rng.randn(100, 10)) == 1)
assert all(iforest.predict(np.ones((100, 10))) == 1)

View file

@ -0,0 +1,524 @@
"""Test the stacking classifier and regressor."""
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: BSD 3 clause
import pytest
import numpy as np
import scipy.sparse as sparse
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import RegressorMixin
from sklearn.base import clone
from sklearn.exceptions import ConvergenceWarning
from sklearn.datasets import load_iris
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import make_regression
from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
from sklearn.svm import LinearSVR
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import scale
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.utils._mocking import CheckingClassifier
from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn.utils._testing import ignore_warnings
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
X_diabetes, y_diabetes = load_diabetes(return_X_y=True)
X_iris, y_iris = load_iris(return_X_y=True)
@pytest.mark.parametrize(
"cv", [3, StratifiedKFold(n_splits=3, shuffle=True, random_state=42)]
)
@pytest.mark.parametrize(
"final_estimator", [None, RandomForestClassifier(random_state=42)]
)
@pytest.mark.parametrize("passthrough", [False, True])
def test_stacking_classifier_iris(cv, final_estimator, passthrough):
# prescale the data to avoid convergence warning without using a pipeline
# for later assert
X_train, X_test, y_train, y_test = train_test_split(
scale(X_iris), y_iris, stratify=y_iris, random_state=42
)
estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
clf = StackingClassifier(
estimators=estimators, final_estimator=final_estimator, cv=cv,
passthrough=passthrough
)
clf.fit(X_train, y_train)
clf.predict(X_test)
clf.predict_proba(X_test)
assert clf.score(X_test, y_test) > 0.8
X_trans = clf.transform(X_test)
expected_column_count = 10 if passthrough else 6
assert X_trans.shape[1] == expected_column_count
if passthrough:
assert_allclose(X_test, X_trans[:, -4:])
clf.set_params(lr='drop')
clf.fit(X_train, y_train)
clf.predict(X_test)
clf.predict_proba(X_test)
if final_estimator is None:
# LogisticRegression has decision_function method
clf.decision_function(X_test)
X_trans = clf.transform(X_test)
expected_column_count_drop = 7 if passthrough else 3
assert X_trans.shape[1] == expected_column_count_drop
if passthrough:
assert_allclose(X_test, X_trans[:, -4:])
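# Where the widths checked above come from (illustrative arithmetic): with the
# 3 iris classes, each remaining base estimator contributes one column per
# class to `transform` (predict_proba for the logistic regression,
# decision_function for the LinearSVC), and passthrough appends the 4
# original features.
n_classes, n_features = 3, 4
assert 2 * n_classes == 6 and 2 * n_classes + n_features == 10  # both estimators kept
assert 1 * n_classes == 3 and 1 * n_classes + n_features == 7   # after 'lr' is dropped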
def test_stacking_classifier_drop_column_binary_classification():
# check that a column is dropped in binary classification
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, _ = train_test_split(
scale(X), y, stratify=y, random_state=42
)
# both classifiers implement 'predict_proba' and will both drop one column
estimators = [('lr', LogisticRegression()),
('rf', RandomForestClassifier(random_state=42))]
clf = StackingClassifier(estimators=estimators, cv=3)
clf.fit(X_train, y_train)
X_trans = clf.transform(X_test)
assert X_trans.shape[1] == 2
# LinearSVC does not implement 'predict_proba' and will not drop one column
estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
clf.set_params(estimators=estimators)
clf.fit(X_train, y_train)
X_trans = clf.transform(X_test)
assert X_trans.shape[1] == 2
def test_stacking_classifier_drop_estimator():
# prescale the data to avoid convergence warning without using a pipeline
# for later assert
X_train, X_test, y_train, _ = train_test_split(
scale(X_iris), y_iris, stratify=y_iris, random_state=42
)
estimators = [('lr', 'drop'), ('svc', LinearSVC(random_state=0))]
rf = RandomForestClassifier(n_estimators=10, random_state=42)
clf = StackingClassifier(
estimators=[('svc', LinearSVC(random_state=0))],
final_estimator=rf, cv=5
)
clf_drop = StackingClassifier(
estimators=estimators, final_estimator=rf, cv=5
)
clf.fit(X_train, y_train)
clf_drop.fit(X_train, y_train)
assert_allclose(clf.predict(X_test), clf_drop.predict(X_test))
assert_allclose(clf.predict_proba(X_test), clf_drop.predict_proba(X_test))
assert_allclose(clf.transform(X_test), clf_drop.transform(X_test))
def test_stacking_regressor_drop_estimator():
# prescale the data to avoid convergence warning without using a pipeline
# for later assert
X_train, X_test, y_train, _ = train_test_split(
scale(X_diabetes), y_diabetes, random_state=42
)
estimators = [('lr', 'drop'), ('svr', LinearSVR(random_state=0))]
rf = RandomForestRegressor(n_estimators=10, random_state=42)
reg = StackingRegressor(
estimators=[('svr', LinearSVR(random_state=0))],
final_estimator=rf, cv=5
)
reg_drop = StackingRegressor(
estimators=estimators, final_estimator=rf, cv=5
)
reg.fit(X_train, y_train)
reg_drop.fit(X_train, y_train)
assert_allclose(reg.predict(X_test), reg_drop.predict(X_test))
assert_allclose(reg.transform(X_test), reg_drop.transform(X_test))
@pytest.mark.parametrize(
"cv", [3, KFold(n_splits=3, shuffle=True, random_state=42)]
)
@pytest.mark.parametrize(
"final_estimator, predict_params",
[(None, {}),
(RandomForestRegressor(random_state=42), {}),
(DummyRegressor(), {'return_std': True})]
)
@pytest.mark.parametrize("passthrough", [False, True])
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params,
passthrough):
# prescale the data to avoid convergence warning without using a pipeline
# for later assert
X_train, X_test, y_train, _ = train_test_split(
scale(X_diabetes), y_diabetes, random_state=42
)
estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
reg = StackingRegressor(
estimators=estimators, final_estimator=final_estimator, cv=cv,
passthrough=passthrough
)
reg.fit(X_train, y_train)
result = reg.predict(X_test, **predict_params)
expected_result_length = 2 if predict_params else 1
if predict_params:
assert len(result) == expected_result_length
X_trans = reg.transform(X_test)
expected_column_count = 12 if passthrough else 2
assert X_trans.shape[1] == expected_column_count
if passthrough:
assert_allclose(X_test, X_trans[:, -10:])
reg.set_params(lr='drop')
reg.fit(X_train, y_train)
reg.predict(X_test)
X_trans = reg.transform(X_test)
expected_column_count_drop = 11 if passthrough else 1
assert X_trans.shape[1] == expected_column_count_drop
if passthrough:
assert_allclose(X_test, X_trans[:, -10:])
@pytest.mark.parametrize('fmt', ['csc', 'csr', 'coo'])
def test_stacking_regressor_sparse_passthrough(fmt):
# Check passthrough behavior on a sparse X matrix
X_train, X_test, y_train, _ = train_test_split(
sparse.coo_matrix(scale(X_diabetes)).asformat(fmt),
y_diabetes, random_state=42
)
estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
rf = RandomForestRegressor(n_estimators=10, random_state=42)
clf = StackingRegressor(
estimators=estimators, final_estimator=rf, cv=5, passthrough=True
)
clf.fit(X_train, y_train)
X_trans = clf.transform(X_test)
assert_allclose_dense_sparse(X_test, X_trans[:, -10:])
assert sparse.issparse(X_trans)
assert X_test.format == X_trans.format
@pytest.mark.parametrize('fmt', ['csc', 'csr', 'coo'])
def test_stacking_classifier_sparse_passthrough(fmt):
# Check passthrough behavior on a sparse X matrix
X_train, X_test, y_train, _ = train_test_split(
sparse.coo_matrix(scale(X_iris)).asformat(fmt),
y_iris, random_state=42
)
estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
rf = RandomForestClassifier(n_estimators=10, random_state=42)
clf = StackingClassifier(
estimators=estimators, final_estimator=rf, cv=5, passthrough=True
)
clf.fit(X_train, y_train)
X_trans = clf.transform(X_test)
assert_allclose_dense_sparse(X_test, X_trans[:, -4:])
assert sparse.issparse(X_trans)
assert X_test.format == X_trans.format
def test_stacking_classifier_drop_binary_prob():
    # check that the classifier drops one of the two probability columns for
    # a binary classification problem
# Select only the 2 first classes
X_, y_ = scale(X_iris[:100]), y_iris[:100]
estimators = [
('lr', LogisticRegression()), ('rf', RandomForestClassifier())
]
clf = StackingClassifier(estimators=estimators)
clf.fit(X_, y_)
X_meta = clf.transform(X_)
assert X_meta.shape[1] == 2
class NoWeightRegressor(BaseEstimator, RegressorMixin):
def fit(self, X, y):
self.reg = DummyRegressor()
return self.reg.fit(X, y)
def predict(self, X):
return np.ones(X.shape[0])
class NoWeightClassifier(BaseEstimator, ClassifierMixin):
def fit(self, X, y):
self.clf = DummyClassifier(strategy='stratified')
return self.clf.fit(X, y)
@pytest.mark.parametrize(
"y, params, type_err, msg_err",
[(y_iris,
{'estimators': None},
ValueError, "Invalid 'estimators' attribute,"),
(y_iris,
{'estimators': []},
ValueError, "Invalid 'estimators' attribute,"),
(y_iris,
{'estimators': [('lr', LogisticRegression()),
('svm', SVC(max_iter=5e4))],
'stack_method': 'predict_proba'},
ValueError, 'does not implement the method predict_proba'),
(y_iris,
{'estimators': [('lr', LogisticRegression()),
('cor', NoWeightClassifier())]},
TypeError, 'does not support sample weight'),
(y_iris,
{'estimators': [('lr', LogisticRegression()),
('cor', LinearSVC(max_iter=5e4))],
'final_estimator': NoWeightClassifier()},
TypeError, 'does not support sample weight')]
)
def test_stacking_classifier_error(y, params, type_err, msg_err):
with pytest.raises(type_err, match=msg_err):
clf = StackingClassifier(**params, cv=3)
clf.fit(
scale(X_iris), y, sample_weight=np.ones(X_iris.shape[0])
)
@pytest.mark.parametrize(
"y, params, type_err, msg_err",
[(y_diabetes,
{'estimators': None},
ValueError, "Invalid 'estimators' attribute,"),
(y_diabetes,
{'estimators': []},
ValueError, "Invalid 'estimators' attribute,"),
(y_diabetes,
{'estimators': [('lr', LinearRegression()),
('cor', NoWeightRegressor())]},
TypeError, 'does not support sample weight'),
(y_diabetes,
{'estimators': [('lr', LinearRegression()),
('cor', LinearSVR())],
'final_estimator': NoWeightRegressor()},
TypeError, 'does not support sample weight')]
)
def test_stacking_regressor_error(y, params, type_err, msg_err):
with pytest.raises(type_err, match=msg_err):
reg = StackingRegressor(**params, cv=3)
reg.fit(
scale(X_diabetes), y, sample_weight=np.ones(X_diabetes.shape[0])
)
@pytest.mark.parametrize(
"estimator, X, y",
[(StackingClassifier(
estimators=[('lr', LogisticRegression(random_state=0)),
('svm', LinearSVC(random_state=0))]),
X_iris[:100], y_iris[:100]), # keep only classes 0 and 1
(StackingRegressor(
estimators=[('lr', LinearRegression()),
('svm', LinearSVR(random_state=0))]),
X_diabetes, y_diabetes)],
ids=['StackingClassifier', 'StackingRegressor']
)
def test_stacking_randomness(estimator, X, y):
# checking that fixing the random state of the CV will lead to the same
# results
estimator_full = clone(estimator)
estimator_full.set_params(
cv=KFold(shuffle=True, random_state=np.random.RandomState(0))
)
estimator_drop = clone(estimator)
estimator_drop.set_params(lr='drop')
estimator_drop.set_params(
cv=KFold(shuffle=True, random_state=np.random.RandomState(0))
)
assert_allclose(
estimator_full.fit(X, y).transform(X)[:, 1:],
estimator_drop.fit(X, y).transform(X)
)
# These warnings are raised due to _BaseComposition
@pytest.mark.filterwarnings("ignore:TypeError occurred during set_params")
@pytest.mark.filterwarnings("ignore:Estimator's parameters changed after")
@pytest.mark.parametrize(
"estimator",
[StackingClassifier(
estimators=[('lr', LogisticRegression(random_state=0)),
('tree', DecisionTreeClassifier(random_state=0))]),
StackingRegressor(
estimators=[('lr', LinearRegression()),
('tree', DecisionTreeRegressor(random_state=0))])],
ids=['StackingClassifier', 'StackingRegressor']
)
def test_check_estimators_stacking_estimator(estimator):
check_estimator(estimator)
check_no_attributes_set_in_init(estimator.__class__.__name__, estimator)
def test_stacking_classifier_stratify_default():
# check that we stratify the classes for the default CV
clf = StackingClassifier(
estimators=[('lr', LogisticRegression(max_iter=1e4)),
('svm', LinearSVC(max_iter=1e4))]
)
# since iris is not shuffled, a simple k-fold would not contain the
# 3 classes during training
clf.fit(X_iris, y_iris)
@pytest.mark.parametrize(
"stacker, X, y",
[(StackingClassifier(
estimators=[('lr', LogisticRegression()),
('svm', LinearSVC(random_state=42))],
final_estimator=LogisticRegression(),
cv=KFold(shuffle=True, random_state=42)),
*load_breast_cancer(return_X_y=True)),
(StackingRegressor(
estimators=[('lr', LinearRegression()),
('svm', LinearSVR(random_state=42))],
final_estimator=LinearRegression(),
cv=KFold(shuffle=True, random_state=42)),
X_diabetes, y_diabetes)],
ids=['StackingClassifier', 'StackingRegressor']
)
def test_stacking_with_sample_weight(stacker, X, y):
    # check that sample weights have an influence on the fit
    # note: ConvergenceWarnings are caught since we are not worried about
    # convergence here
n_half_samples = len(y) // 2
total_sample_weight = np.array(
[0.1] * n_half_samples + [0.9] * (len(y) - n_half_samples)
)
X_train, X_test, y_train, _, sample_weight_train, _ = train_test_split(
X, y, total_sample_weight, random_state=42
)
with ignore_warnings(category=ConvergenceWarning):
stacker.fit(X_train, y_train)
y_pred_no_weight = stacker.predict(X_test)
with ignore_warnings(category=ConvergenceWarning):
stacker.fit(X_train, y_train, sample_weight=np.ones(y_train.shape))
y_pred_unit_weight = stacker.predict(X_test)
assert_allclose(y_pred_no_weight, y_pred_unit_weight)
with ignore_warnings(category=ConvergenceWarning):
stacker.fit(X_train, y_train, sample_weight=sample_weight_train)
y_pred_biased = stacker.predict(X_test)
assert np.abs(y_pred_no_weight - y_pred_biased).sum() > 0
def test_stacking_classifier_sample_weight_fit_param():
# check sample_weight is passed to all invocations of fit
stacker = StackingClassifier(
estimators=[
('lr', CheckingClassifier(expected_fit_params=['sample_weight']))
],
final_estimator=CheckingClassifier(
expected_fit_params=['sample_weight']
)
)
stacker.fit(X_iris, y_iris, sample_weight=np.ones(X_iris.shape[0]))
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize(
"stacker, X, y",
[(StackingClassifier(
estimators=[('lr', LogisticRegression()),
('svm', LinearSVC(random_state=42))],
final_estimator=LogisticRegression()),
*load_breast_cancer(return_X_y=True)),
(StackingRegressor(
estimators=[('lr', LinearRegression()),
('svm', LinearSVR(random_state=42))],
final_estimator=LinearRegression()),
X_diabetes, y_diabetes)],
ids=['StackingClassifier', 'StackingRegressor']
)
def test_stacking_cv_influence(stacker, X, y):
# check that the stacking affects the fit of the final estimator but not
# the fit of the base estimators
    # note: ConvergenceWarnings are caught since we are not worried about
    # convergence here
stacker_cv_3 = clone(stacker)
stacker_cv_5 = clone(stacker)
stacker_cv_3.set_params(cv=3)
stacker_cv_5.set_params(cv=5)
stacker_cv_3.fit(X, y)
stacker_cv_5.fit(X, y)
# the base estimators should be identical
for est_cv_3, est_cv_5 in zip(stacker_cv_3.estimators_,
stacker_cv_5.estimators_):
assert_allclose(est_cv_3.coef_, est_cv_5.coef_)
# the final estimator should be different
with pytest.raises(AssertionError, match='Not equal'):
assert_allclose(stacker_cv_3.final_estimator_.coef_,
stacker_cv_5.final_estimator_.coef_)
@pytest.mark.parametrize("make_dataset, Stacking, Estimator", [
(make_classification, StackingClassifier, LogisticRegression),
(make_regression, StackingRegressor, LinearRegression)
])
def test_stacking_without_n_features_in(make_dataset, Stacking, Estimator):
# Stacking supports estimators without `n_features_in_`. Regression test
# for #17353
class MyEstimator(Estimator):
"""Estimator without n_features_in_"""
def fit(self, X, y):
super().fit(X, y)
del self.n_features_in_
X, y = make_dataset(random_state=0, n_samples=100)
stacker = Stacking(estimators=[('lr', MyEstimator())])
msg = f"{Stacking.__name__} object has no attribute n_features_in_"
with pytest.raises(AttributeError, match=msg):
stacker.n_features_in_
# Does not raise
stacker.fit(X, y)
msg = "'MyEstimator' object has no attribute 'n_features_in_'"
with pytest.raises(AttributeError, match=msg):
stacker.n_features_in_

View file

@ -0,0 +1,574 @@
"""Testing for the VotingClassifier and VotingRegressor"""
import pytest
import re
import numpy as np
from sklearn.utils._testing import assert_almost_equal, assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_raise_message
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.datasets import make_multilabel_classification
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.dummy import DummyRegressor
# Load datasets
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target
X_r, y_r = datasets.load_diabetes(return_X_y=True)
@pytest.mark.parametrize(
"params, err_msg",
[({'estimators': []},
"Invalid 'estimators' attribute, 'estimators' should be a list of"),
({'estimators': [('lr', LogisticRegression())], 'voting': 'error'},
r"Voting must be 'soft' or 'hard'; got \(voting='error'\)"),
({'estimators': [('lr', LogisticRegression())], 'weights': [1, 2]},
"Number of `estimators` and weights must be equal")]
)
def test_voting_classifier_estimator_init(params, err_msg):
ensemble = VotingClassifier(**params)
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
def test_predictproba_hardvoting():
eclf = VotingClassifier(estimators=[('lr1', LogisticRegression()),
('lr2', LogisticRegression())],
voting='hard')
msg = "predict_proba is not available when voting='hard'"
with pytest.raises(AttributeError, match=msg):
eclf.predict_proba
assert not hasattr(eclf, "predict_proba")
eclf.fit(X, y)
assert not hasattr(eclf, "predict_proba")
def test_notfitted():
eclf = VotingClassifier(estimators=[('lr1', LogisticRegression()),
('lr2', LogisticRegression())],
voting='soft')
ereg = VotingRegressor([('dr', DummyRegressor())])
msg = ("This %s instance is not fitted yet. Call \'fit\'"
" with appropriate arguments before using this estimator.")
assert_raise_message(NotFittedError, msg % 'VotingClassifier',
eclf.predict, X)
assert_raise_message(NotFittedError, msg % 'VotingClassifier',
eclf.predict_proba, X)
assert_raise_message(NotFittedError, msg % 'VotingClassifier',
eclf.transform, X)
assert_raise_message(NotFittedError, msg % 'VotingRegressor',
ereg.predict, X_r)
assert_raise_message(NotFittedError, msg % 'VotingRegressor',
ereg.transform, X_r)
def test_majority_label_iris():
"""Check classification by majority label on dataset iris."""
clf1 = LogisticRegression(solver='liblinear', random_state=123)
clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
clf3 = GaussianNB()
eclf = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='hard')
scores = cross_val_score(eclf, X, y, scoring='accuracy')
assert_almost_equal(scores.mean(), 0.95, decimal=2)
def test_tie_situation():
"""Check voting classifier selects smaller class label in tie situation."""
clf1 = LogisticRegression(random_state=123, solver='liblinear')
clf2 = RandomForestClassifier(random_state=123)
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)],
voting='hard')
assert clf1.fit(X, y).predict(X)[73] == 2
assert clf2.fit(X, y).predict(X)[73] == 1
assert eclf.fit(X, y).predict(X)[73] == 1
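# Why the smaller label wins the tie above: with voting='hard' the ensemble
# takes a per-sample majority via a bincount followed by argmax, and numpy's
# argmax returns the first (lowest) index when counts are tied.
# Tiny illustration of that tie-break (a sketch, not the estimator's own code):
tied_votes = np.array([2, 1])  # one vote for class 2, one for class 1
assert np.argmax(np.bincount(tied_votes)) == 1  # the smaller label wins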
def test_weights_iris():
"""Check classification by average probabilities on dataset iris."""
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
clf3 = GaussianNB()
eclf = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft',
weights=[1, 2, 10])
scores = cross_val_score(eclf, X, y, scoring='accuracy')
assert_almost_equal(scores.mean(), 0.93, decimal=2)
def test_weights_regressor():
"""Check weighted average regression prediction on diabetes dataset."""
reg1 = DummyRegressor(strategy='mean')
reg2 = DummyRegressor(strategy='median')
reg3 = DummyRegressor(strategy='quantile', quantile=.2)
ereg = VotingRegressor([('mean', reg1), ('median', reg2),
('quantile', reg3)], weights=[1, 2, 10])
X_r_train, X_r_test, y_r_train, y_r_test = \
train_test_split(X_r, y_r, test_size=.25)
reg1_pred = reg1.fit(X_r_train, y_r_train).predict(X_r_test)
reg2_pred = reg2.fit(X_r_train, y_r_train).predict(X_r_test)
reg3_pred = reg3.fit(X_r_train, y_r_train).predict(X_r_test)
ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test)
avg = np.average(np.asarray([reg1_pred, reg2_pred, reg3_pred]), axis=0,
weights=[1, 2, 10])
assert_almost_equal(ereg_pred, avg, decimal=2)
ereg_weights_none = VotingRegressor([('mean', reg1), ('median', reg2),
('quantile', reg3)], weights=None)
ereg_weights_equal = VotingRegressor([('mean', reg1), ('median', reg2),
('quantile', reg3)],
weights=[1, 1, 1])
ereg_weights_none.fit(X_r_train, y_r_train)
ereg_weights_equal.fit(X_r_train, y_r_train)
ereg_none_pred = ereg_weights_none.predict(X_r_test)
ereg_equal_pred = ereg_weights_equal.predict(X_r_test)
assert_almost_equal(ereg_none_pred, ereg_equal_pred, decimal=2)
def test_predict_on_toy_problem():
"""Manually check predicted class labels for toy dataset."""
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
clf3 = GaussianNB()
X = np.array([[-1.1, -1.5],
[-1.2, -1.4],
[-3.4, -2.2],
[1.1, 1.2],
[2.1, 1.4],
[3.1, 2.3]])
y = np.array([1, 1, 1, 2, 2, 2])
assert_array_equal(clf1.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
assert_array_equal(clf2.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
assert_array_equal(clf3.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
eclf = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='hard',
weights=[1, 1, 1])
assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
eclf = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft',
weights=[1, 1, 1])
assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
def test_predict_proba_on_toy_problem():
"""Calculate predicted probabilities on toy dataset."""
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
clf3 = GaussianNB()
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
clf1_res = np.array([[0.59790391, 0.40209609],
[0.57622162, 0.42377838],
[0.50728456, 0.49271544],
[0.40241774, 0.59758226]])
clf2_res = np.array([[0.8, 0.2],
[0.8, 0.2],
[0.2, 0.8],
[0.3, 0.7]])
clf3_res = np.array([[0.9985082, 0.0014918],
[0.99845843, 0.00154157],
[0., 1.],
[0., 1.]])
t00 = (2*clf1_res[0][0] + clf2_res[0][0] + clf3_res[0][0]) / 4
t11 = (2*clf1_res[1][1] + clf2_res[1][1] + clf3_res[1][1]) / 4
t21 = (2*clf1_res[2][1] + clf2_res[2][1] + clf3_res[2][1]) / 4
t31 = (2*clf1_res[3][1] + clf2_res[3][1] + clf3_res[3][1]) / 4
eclf = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft',
weights=[2, 1, 1])
eclf_res = eclf.fit(X, y).predict_proba(X)
assert_almost_equal(t00, eclf_res[0][0], decimal=1)
assert_almost_equal(t11, eclf_res[1][1], decimal=1)
assert_almost_equal(t21, eclf_res[2][1], decimal=1)
assert_almost_equal(t31, eclf_res[3][1], decimal=1)
with pytest.raises(
AttributeError,
match="predict_proba is not available when voting='hard'"):
eclf = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='hard')
eclf.fit(X, y).predict_proba(X)
def test_multilabel():
"""Check if error is raised for multilabel classification."""
X, y = make_multilabel_classification(n_classes=2, n_labels=1,
allow_unlabeled=False,
random_state=123)
clf = OneVsRestClassifier(SVC(kernel='linear'))
eclf = VotingClassifier(estimators=[('ovr', clf)], voting='hard')
try:
eclf.fit(X, y)
except NotImplementedError:
return
def test_gridsearch():
"""Check GridSearch support."""
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
eclf = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft')
params = {'lr__C': [1.0, 100.0],
'voting': ['soft', 'hard'],
'weights': [[0.5, 0.5, 0.5], [1.0, 0.5, 0.5]]}
grid = GridSearchCV(estimator=eclf, param_grid=params)
grid.fit(iris.data, iris.target)
def test_parallel_fit():
"""Check parallel backend of VotingClassifier on toy dataset."""
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
clf3 = GaussianNB()
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
eclf1 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft',
n_jobs=1).fit(X, y)
eclf2 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft',
n_jobs=2).fit(X, y)
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
def test_sample_weight():
"""Tests sample_weight parameter of VotingClassifier"""
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
clf3 = SVC(probability=True, random_state=123)
eclf1 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('svc', clf3)],
voting='soft').fit(X, y, sample_weight=np.ones((len(y),)))
eclf2 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('svc', clf3)],
voting='soft').fit(X, y)
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
sample_weight = np.random.RandomState(123).uniform(size=(len(y),))
eclf3 = VotingClassifier(estimators=[('lr', clf1)], voting='soft')
eclf3.fit(X, y, sample_weight)
clf1.fit(X, y, sample_weight)
assert_array_equal(eclf3.predict(X), clf1.predict(X))
assert_array_almost_equal(eclf3.predict_proba(X), clf1.predict_proba(X))
    # check that an informative error is raised if sample_weight is not
    # supported.
clf4 = KNeighborsClassifier()
eclf3 = VotingClassifier(estimators=[
('lr', clf1), ('svc', clf3), ('knn', clf4)],
voting='soft')
msg = ('Underlying estimator KNeighborsClassifier does not support '
'sample weights.')
with pytest.raises(TypeError, match=msg):
eclf3.fit(X, y, sample_weight)
    # check that _fit_single_estimator will raise the right error:
    # it should re-raise the original error when it is unrelated to sample_weight
class ClassifierErrorFit(ClassifierMixin, BaseEstimator):
def fit(self, X, y, sample_weight):
raise TypeError('Error unrelated to sample_weight.')
clf = ClassifierErrorFit()
with pytest.raises(TypeError, match='Error unrelated to sample_weight'):
clf.fit(X, y, sample_weight=sample_weight)
def test_sample_weight_kwargs():
"""Check that VotingClassifier passes sample_weight as kwargs"""
class MockClassifier(ClassifierMixin, BaseEstimator):
"""Mock Classifier to check that sample_weight is received as kwargs"""
def fit(self, X, y, *args, **sample_weight):
assert 'sample_weight' in sample_weight
clf = MockClassifier()
eclf = VotingClassifier(estimators=[('mock', clf)], voting='soft')
# Should not raise an error.
eclf.fit(X, y, sample_weight=np.ones((len(y),)))
def test_voting_classifier_set_params():
# check equivalence in the output when setting underlying estimators
clf1 = LogisticRegression(random_state=123, C=1.0)
clf2 = RandomForestClassifier(random_state=123, max_depth=None)
clf3 = GaussianNB()
eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)], voting='soft',
weights=[1, 2]).fit(X, y)
eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)], voting='soft',
weights=[1, 2])
eclf2.set_params(nb=clf2).fit(X, y)
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
assert eclf2.estimators[0][1].get_params() == clf1.get_params()
assert eclf2.estimators[1][1].get_params() == clf2.get_params()
# TODO: Remove parametrization in 0.24 when None is removed in Voting*
@pytest.mark.parametrize("drop", [None, 'drop'])
def test_set_estimator_none(drop):
"""VotingClassifier set_params should be able to set estimators as None or
drop"""
# Test predict
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
clf3 = GaussianNB()
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
('nb', clf3)],
voting='hard', weights=[1, 0, 0.5]).fit(X, y)
eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
('nb', clf3)],
voting='hard', weights=[1, 1, 0.5])
with pytest.warns(None) as record:
eclf2.set_params(rf=drop).fit(X, y)
assert record if drop is None else not record
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert dict(eclf2.estimators)["rf"] is drop
assert len(eclf2.estimators_) == 2
assert all(isinstance(est, (LogisticRegression, GaussianNB))
for est in eclf2.estimators_)
assert eclf2.get_params()["rf"] is drop
eclf1.set_params(voting='soft').fit(X, y)
with pytest.warns(None) as record:
eclf2.set_params(voting='soft').fit(X, y)
assert record if drop is None else not record
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
msg = 'All estimators are dropped. At least one is required'
with pytest.warns(None) as record:
with pytest.raises(ValueError, match=msg):
eclf2.set_params(lr=drop, rf=drop, nb=drop).fit(X, y)
assert record if drop is None else not record
# Test soft voting transform
X1 = np.array([[1], [2]])
y1 = np.array([1, 2])
eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
voting='soft', weights=[0, 0.5],
flatten_transform=False).fit(X1, y1)
eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
voting='soft', weights=[1, 0.5],
flatten_transform=False)
with pytest.warns(None) as record:
eclf2.set_params(rf=drop).fit(X1, y1)
assert record if drop is None else not record
assert_array_almost_equal(eclf1.transform(X1),
np.array([[[0.7, 0.3], [0.3, 0.7]],
[[1., 0.], [0., 1.]]]))
assert_array_almost_equal(eclf2.transform(X1),
np.array([[[1., 0.],
[0., 1.]]]))
eclf1.set_params(voting='hard')
eclf2.set_params(voting='hard')
assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
def test_estimator_weights_format():
# Test estimator weights inputs as list and array
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
eclf1 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2)],
weights=[1, 2],
voting='soft')
eclf2 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2)],
weights=np.array((1, 2)),
voting='soft')
eclf1.fit(X, y)
eclf2.fit(X, y)
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
def test_transform():
"""Check transform method of VotingClassifier on toy dataset."""
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
clf3 = GaussianNB()
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
eclf1 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft').fit(X, y)
eclf2 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft',
flatten_transform=True).fit(X, y)
eclf3 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft',
flatten_transform=False).fit(X, y)
assert_array_equal(eclf1.transform(X).shape, (4, 6))
assert_array_equal(eclf2.transform(X).shape, (4, 6))
assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
assert_array_almost_equal(eclf1.transform(X),
eclf2.transform(X))
assert_array_almost_equal(
eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)),
eclf2.transform(X)
)
# TODO: Remove drop=None in 0.24 when None is removed in Voting*
@pytest.mark.parametrize(
"X, y, voter",
[(X, y, VotingClassifier(
[('lr', LogisticRegression()),
('rf', RandomForestClassifier(n_estimators=5))])),
(X_r, y_r, VotingRegressor(
[('lr', LinearRegression()),
('rf', RandomForestRegressor(n_estimators=5))]))]
)
@pytest.mark.parametrize("drop", [None, 'drop'])
def test_none_estimator_with_weights(X, y, voter, drop):
# TODO: remove the parametrization on 'drop' when support for None is
# removed.
# check that an estimator can be set to 'drop' while sample weights are passed
# regression test for
# https://github.com/scikit-learn/scikit-learn/issues/13777
voter = clone(voter)
voter.fit(X, y, sample_weight=np.ones(y.shape))
voter.set_params(lr=drop)
with pytest.warns(None) as record:
voter.fit(X, y, sample_weight=np.ones(y.shape))
assert record if drop is None else not record
y_pred = voter.predict(X)
assert y_pred.shape == y.shape
@pytest.mark.parametrize(
"estimator",
[VotingRegressor(
estimators=[('lr', LinearRegression()),
('tree', DecisionTreeRegressor(random_state=0))]),
VotingClassifier(
estimators=[('lr', LogisticRegression(random_state=0)),
('tree', DecisionTreeClassifier(random_state=0))])],
ids=['VotingRegressor', 'VotingClassifier']
)
def test_check_estimators_voting_estimator(estimator):
# FIXME: to be removed when meta-estimators can specify their testing
# parameters themselves (for required parameters).
check_estimator(estimator)
check_no_attributes_set_in_init(estimator.__class__.__name__, estimator)
@pytest.mark.parametrize(
"est",
[VotingRegressor(
estimators=[('lr', LinearRegression()),
('tree', DecisionTreeRegressor(random_state=0))]),
VotingClassifier(
estimators=[('lr', LogisticRegression(random_state=0)),
('tree', DecisionTreeClassifier(random_state=0))])],
ids=['VotingRegressor', 'VotingClassifier']
)
def test_n_features_in(est):
X = [[1, 2], [3, 4], [5, 6]]
y = [0, 1, 2]
assert not hasattr(est, 'n_features_in_')
est.fit(X, y)
assert est.n_features_in_ == 2
@pytest.mark.parametrize(
"estimator",
[VotingRegressor(
estimators=[('lr', LinearRegression()),
('rf', RandomForestRegressor(random_state=123))],
verbose=True),
VotingClassifier(
estimators=[('lr', LogisticRegression(random_state=123)),
('rf', RandomForestClassifier(random_state=123))],
verbose=True)]
)
def test_voting_verbose(estimator, capsys):
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
pattern = (r'\[Voting\].*\(1 of 2\) Processing lr, total=.*\n'
r'\[Voting\].*\(2 of 2\) Processing rf, total=.*\n$')
estimator.fit(X, y)
assert re.match(pattern, capsys.readouterr()[0])
# TODO: Remove in 0.24 when None is removed in Voting*
@pytest.mark.parametrize(
"Voter, BaseEstimator",
[(VotingClassifier, DecisionTreeClassifier),
(VotingRegressor, DecisionTreeRegressor)]
)
def test_deprecate_none_transformer(Voter, BaseEstimator):
est = Voter(estimators=[('lr', None),
('tree', BaseEstimator(random_state=0))])
msg = ("Using 'None' to drop an estimator from the ensemble is "
"deprecated in 0.22 and support will be dropped in 0.24. "
"Use the string 'drop' instead.")
with pytest.warns(FutureWarning, match=msg):
est.fit(X, y)

View file

@@ -0,0 +1,582 @@
"""Testing for the boost module (sklearn.ensemble.boost)."""
import numpy as np
import pytest
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
from scipy.sparse import dok_matrix
from scipy.sparse import lil_matrix
from sklearn.utils._testing import assert_array_equal, assert_array_less
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_raises, assert_raises_regexp
from sklearn.utils._testing import ignore_warnings
from sklearn.base import BaseEstimator
from sklearn.base import clone
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble._weight_boosting import _samme_proba
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import shuffle
from sklearn.utils._mocking import NoSampleWeightWrapper
from sklearn import datasets
# Common random state
rng = np.random.RandomState(0)
# Toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y_class = ["foo", "foo", "foo", 1, 1, 1] # test string class labels
y_regr = [-1, -1, -1, 1, 1, 1]
T = [[-1, -1], [2, 2], [3, 2]]
y_t_class = ["foo", 1, 1]
y_t_regr = [-1, 1, 1]
# Load the iris dataset and randomly permute it
iris = datasets.load_iris()
perm = rng.permutation(iris.target.size)
iris.data, iris.target = shuffle(iris.data, iris.target, random_state=rng)
# Load the boston dataset and randomly permute it
boston = datasets.load_boston()
boston.data, boston.target = shuffle(boston.data, boston.target,
random_state=rng)
def test_samme_proba():
# Test the `_samme_proba` helper function.
# Define some example (bad) `predict_proba` output.
probs = np.array([[1, 1e-6, 0],
[0.19, 0.6, 0.2],
[-999, 0.51, 0.5],
[1e-6, 1, 1e-9]])
probs /= np.abs(probs.sum(axis=1))[:, np.newaxis]
# _samme_proba calls estimator.predict_proba.
# Make a mock object so I can control what gets returned.
class MockEstimator:
def predict_proba(self, X):
assert_array_equal(X.shape, probs.shape)
return probs
mock = MockEstimator()
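# _samme_proba applies the SAMME.R transform, roughly
# (n_classes - 1) * (log p - mean(log p)) per row, after clipping the
# probabilities away from zero, so the output is finite and order-preserving.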
samme_proba = _samme_proba(mock, 3, np.ones_like(probs))
assert_array_equal(samme_proba.shape, probs.shape)
assert np.isfinite(samme_proba).all()
# Make sure that the correct elements come out as smallest --
# `_samme_proba` should preserve the ordering in each example.
assert_array_equal(np.argmin(samme_proba, axis=1), [2, 0, 0, 2])
assert_array_equal(np.argmax(samme_proba, axis=1), [0, 1, 1, 1])
def test_oneclass_adaboost_proba():
# Test predict_proba robustness for one class label input.
# In response to issue #7501
# https://github.com/scikit-learn/scikit-learn/issues/7501
y_t = np.ones(len(X))
clf = AdaBoostClassifier().fit(X, y_t)
assert_array_almost_equal(clf.predict_proba(X), np.ones((len(X), 1)))
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_classification_toy(algorithm):
# Check classification on a toy dataset.
clf = AdaBoostClassifier(algorithm=algorithm, random_state=0)
clf.fit(X, y_class)
assert_array_equal(clf.predict(T), y_t_class)
assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_)
assert clf.predict_proba(T).shape == (len(T), 2)
assert clf.decision_function(T).shape == (len(T),)
def test_regression_toy():
# Check regression on a toy dataset.
clf = AdaBoostRegressor(random_state=0)
clf.fit(X, y_regr)
assert_array_equal(clf.predict(T), y_t_regr)
def test_iris():
# Check consistency on the iris dataset.
classes = np.unique(iris.target)
clf_samme = prob_samme = None
for alg in ['SAMME', 'SAMME.R']:
clf = AdaBoostClassifier(algorithm=alg)
clf.fit(iris.data, iris.target)
assert_array_equal(classes, clf.classes_)
proba = clf.predict_proba(iris.data)
if alg == "SAMME":
clf_samme = clf
prob_samme = proba
assert proba.shape[1] == len(classes)
assert clf.decision_function(iris.data).shape[1] == len(classes)
score = clf.score(iris.data, iris.target)
assert score > 0.9, "Failed with algorithm %s and score = %f" % \
(alg, score)
# Check we used multiple estimators
assert len(clf.estimators_) > 1
# Check for distinct random states (see issue #7408)
assert (len(set(est.random_state for est in clf.estimators_)) ==
len(clf.estimators_))
# Somewhat hacky regression test: prior to
# ae7adc880d624615a34bafdb1d75ef67051b8200,
# predict_proba returned SAMME.R values for SAMME.
clf_samme.algorithm = "SAMME.R"
assert_array_less(0,
np.abs(clf_samme.predict_proba(iris.data) - prob_samme))
@pytest.mark.parametrize('loss', ['linear', 'square', 'exponential'])
def test_boston(loss):
# Check consistency on the Boston house prices dataset.
reg = AdaBoostRegressor(loss=loss, random_state=0)
reg.fit(boston.data, boston.target)
score = reg.score(boston.data, boston.target)
assert score > 0.85
# Check we used multiple estimators
assert len(reg.estimators_) > 1
# Check for distinct random states (see issue #7408)
assert (len(set(est.random_state for est in reg.estimators_)) ==
len(reg.estimators_))
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_staged_predict(algorithm):
# Check staged predictions.
rng = np.random.RandomState(0)
iris_weights = rng.randint(10, size=iris.target.shape)
boston_weights = rng.randint(10, size=boston.target.shape)
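# The staged_* methods yield one result per boosting iteration, so each
# staged list below is expected to have n_estimators (here 10) entries.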
clf = AdaBoostClassifier(algorithm=algorithm, n_estimators=10)
clf.fit(iris.data, iris.target, sample_weight=iris_weights)
predictions = clf.predict(iris.data)
staged_predictions = [p for p in clf.staged_predict(iris.data)]
proba = clf.predict_proba(iris.data)
staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
score = clf.score(iris.data, iris.target, sample_weight=iris_weights)
staged_scores = [
s for s in clf.staged_score(
iris.data, iris.target, sample_weight=iris_weights)]
assert len(staged_predictions) == 10
assert_array_almost_equal(predictions, staged_predictions[-1])
assert len(staged_probas) == 10
assert_array_almost_equal(proba, staged_probas[-1])
assert len(staged_scores) == 10
assert_array_almost_equal(score, staged_scores[-1])
# AdaBoost regression
clf = AdaBoostRegressor(n_estimators=10, random_state=0)
clf.fit(boston.data, boston.target, sample_weight=boston_weights)
predictions = clf.predict(boston.data)
staged_predictions = [p for p in clf.staged_predict(boston.data)]
score = clf.score(boston.data, boston.target, sample_weight=boston_weights)
staged_scores = [
s for s in clf.staged_score(
boston.data, boston.target, sample_weight=boston_weights)]
assert len(staged_predictions) == 10
assert_array_almost_equal(predictions, staged_predictions[-1])
assert len(staged_scores) == 10
assert_array_almost_equal(score, staged_scores[-1])
def test_gridsearch():
# Check that base trees can be grid-searched.
# AdaBoost classification
boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
parameters = {'n_estimators': (1, 2),
'base_estimator__max_depth': (1, 2),
'algorithm': ('SAMME', 'SAMME.R')}
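# Parameters of the nested base estimator are addressed with the usual
# double-underscore convention (e.g. 'base_estimator__max_depth').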
clf = GridSearchCV(boost, parameters)
clf.fit(iris.data, iris.target)
# AdaBoost regression
boost = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
random_state=0)
parameters = {'n_estimators': (1, 2),
'base_estimator__max_depth': (1, 2)}
clf = GridSearchCV(boost, parameters)
clf.fit(boston.data, boston.target)
def test_pickle():
# Check picklability.
import pickle
# Adaboost classifier
for alg in ['SAMME', 'SAMME.R']:
obj = AdaBoostClassifier(algorithm=alg)
obj.fit(iris.data, iris.target)
score = obj.score(iris.data, iris.target)
s = pickle.dumps(obj)
obj2 = pickle.loads(s)
assert type(obj2) == obj.__class__
score2 = obj2.score(iris.data, iris.target)
assert score == score2
# Adaboost regressor
obj = AdaBoostRegressor(random_state=0)
obj.fit(boston.data, boston.target)
score = obj.score(boston.data, boston.target)
s = pickle.dumps(obj)
obj2 = pickle.loads(s)
assert type(obj2) == obj.__class__
score2 = obj2.score(boston.data, boston.target)
assert score == score2
def test_importances():
# Check variable importances.
X, y = datasets.make_classification(n_samples=2000,
n_features=10,
n_informative=3,
n_redundant=0,
n_repeated=0,
shuffle=False,
random_state=1)
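# With shuffle=False the 3 informative features occupy the first columns,
# so their importances are expected to dominate the remaining features.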
for alg in ['SAMME', 'SAMME.R']:
clf = AdaBoostClassifier(algorithm=alg)
clf.fit(X, y)
importances = clf.feature_importances_
assert importances.shape[0] == 10
assert (importances[:3, np.newaxis] >= importances[3:]).all()
def test_error():
# Test that a proper exception is raised on invalid input.
assert_raises(ValueError,
AdaBoostClassifier(learning_rate=-1).fit,
X, y_class)
assert_raises(ValueError,
AdaBoostClassifier(algorithm="foo").fit,
X, y_class)
assert_raises(ValueError,
AdaBoostClassifier().fit,
X, y_class, sample_weight=np.asarray([-1]))
def test_base_estimator():
# Test different base estimators.
from sklearn.ensemble import RandomForestClassifier
# XXX doesn't work with y_class because RF doesn't support classes_
# Shouldn't AdaBoost run a LabelBinarizer?
clf = AdaBoostClassifier(RandomForestClassifier())
clf.fit(X, y_regr)
clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
clf.fit(X, y_class)
from sklearn.ensemble import RandomForestRegressor
clf = AdaBoostRegressor(RandomForestRegressor(), random_state=0)
clf.fit(X, y_regr)
clf = AdaBoostRegressor(SVR(), random_state=0)
clf.fit(X, y_regr)
# Check that an empty discrete ensemble fails in fit, not predict.
X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]]
y_fail = ["foo", "bar", 1, 2]
clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
assert_raises_regexp(ValueError, "worse than random",
clf.fit, X_fail, y_fail)
def test_sparse_classification():
# Check classification with sparse input.
class CustomSVC(SVC):
"""SVC variant that records the nature of the training set."""
def fit(self, X, y, sample_weight=None):
"""Modification on fit caries data type for later verification."""
super().fit(X, y, sample_weight=sample_weight)
self.data_type_ = type(X)
return self
X, y = datasets.make_multilabel_classification(n_classes=1, n_samples=15,
n_features=5,
random_state=42)
# Flatten y to a 1d array
y = np.ravel(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
dok_matrix]:
X_train_sparse = sparse_format(X_train)
X_test_sparse = sparse_format(X_test)
# Trained on sparse format
sparse_classifier = AdaBoostClassifier(
base_estimator=CustomSVC(probability=True),
random_state=1,
algorithm="SAMME"
).fit(X_train_sparse, y_train)
# Trained on dense format
dense_classifier = AdaBoostClassifier(
base_estimator=CustomSVC(probability=True),
random_state=1,
algorithm="SAMME"
).fit(X_train, y_train)
# predict
sparse_results = sparse_classifier.predict(X_test_sparse)
dense_results = dense_classifier.predict(X_test)
assert_array_equal(sparse_results, dense_results)
# decision_function
sparse_results = sparse_classifier.decision_function(X_test_sparse)
dense_results = dense_classifier.decision_function(X_test)
assert_array_almost_equal(sparse_results, dense_results)
# predict_log_proba
sparse_results = sparse_classifier.predict_log_proba(X_test_sparse)
dense_results = dense_classifier.predict_log_proba(X_test)
assert_array_almost_equal(sparse_results, dense_results)
# predict_proba
sparse_results = sparse_classifier.predict_proba(X_test_sparse)
dense_results = dense_classifier.predict_proba(X_test)
assert_array_almost_equal(sparse_results, dense_results)
# score
sparse_results = sparse_classifier.score(X_test_sparse, y_test)
dense_results = dense_classifier.score(X_test, y_test)
assert_array_almost_equal(sparse_results, dense_results)
# staged_decision_function
sparse_results = sparse_classifier.staged_decision_function(
X_test_sparse)
dense_results = dense_classifier.staged_decision_function(X_test)
for sparse_res, dense_res in zip(sparse_results, dense_results):
assert_array_almost_equal(sparse_res, dense_res)
# staged_predict
sparse_results = sparse_classifier.staged_predict(X_test_sparse)
dense_results = dense_classifier.staged_predict(X_test)
for sparse_res, dense_res in zip(sparse_results, dense_results):
assert_array_equal(sparse_res, dense_res)
# staged_predict_proba
sparse_results = sparse_classifier.staged_predict_proba(X_test_sparse)
dense_results = dense_classifier.staged_predict_proba(X_test)
for sparse_res, dense_res in zip(sparse_results, dense_results):
assert_array_almost_equal(sparse_res, dense_res)
# staged_score
sparse_results = sparse_classifier.staged_score(X_test_sparse,
y_test)
dense_results = dense_classifier.staged_score(X_test, y_test)
for sparse_res, dense_res in zip(sparse_results, dense_results):
assert_array_equal(sparse_res, dense_res)
# Verify sparsity of data is maintained during training
types = [i.data_type_ for i in sparse_classifier.estimators_]
assert all([(t == csc_matrix or t == csr_matrix)
for t in types])
def test_sparse_regression():
# Check regression with sparse input.
class CustomSVR(SVR):
"""SVR variant that records the nature of the training set."""
def fit(self, X, y, sample_weight=None):
"""Modification on fit caries data type for later verification."""
super().fit(X, y, sample_weight=sample_weight)
self.data_type_ = type(X)
return self
X, y = datasets.make_regression(n_samples=15, n_features=50, n_targets=1,
random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
dok_matrix]:
X_train_sparse = sparse_format(X_train)
X_test_sparse = sparse_format(X_test)
# Trained on sparse format
sparse_classifier = AdaBoostRegressor(
base_estimator=CustomSVR(),
random_state=1
).fit(X_train_sparse, y_train)
# Trained on dense format
dense_classifier = AdaBoostRegressor(
base_estimator=CustomSVR(),
random_state=1
).fit(X_train, y_train)
# predict
sparse_results = sparse_classifier.predict(X_test_sparse)
dense_results = dense_classifier.predict(X_test)
assert_array_almost_equal(sparse_results, dense_results)
# staged_predict
sparse_results = sparse_classifier.staged_predict(X_test_sparse)
dense_results = dense_classifier.staged_predict(X_test)
for sparse_res, dense_res in zip(sparse_results, dense_results):
assert_array_almost_equal(sparse_res, dense_res)
types = [i.data_type_ for i in sparse_classifier.estimators_]
assert all([(t == csc_matrix or t == csr_matrix)
for t in types])
def test_sample_weight_adaboost_regressor():
"""
AdaBoostRegressor should work without sample_weight support in the base
estimator. The random weighted sampling is done internally in the _boost
method of AdaBoostRegressor.
"""
class DummyEstimator(BaseEstimator):
def fit(self, X, y):
pass
def predict(self, X):
return np.zeros(X.shape[0])
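# DummyEstimator.fit deliberately accepts no sample_weight, forcing
# AdaBoostRegressor to fall back on its internal weighted resampling.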
boost = AdaBoostRegressor(DummyEstimator(), n_estimators=3)
boost.fit(X, y_regr)
assert len(boost.estimator_weights_) == len(boost.estimator_errors_)
def test_multidimensional_X():
"""
Check that the AdaBoost estimators can work with an n-dimensional
data matrix.
"""
rng = np.random.RandomState(0)
X = rng.randn(50, 3, 3)
yc = rng.choice([0, 1], 50)
yr = rng.randn(50)
boost = AdaBoostClassifier(DummyClassifier(strategy='most_frequent'))
boost.fit(X, yc)
boost.predict(X)
boost.predict_proba(X)
boost = AdaBoostRegressor(DummyRegressor())
boost.fit(X, yr)
boost.predict(X)
# TODO: Remove in 0.24 when DummyClassifier's `strategy` default changes
@ignore_warnings
@pytest.mark.parametrize("algorithm", ['SAMME', 'SAMME.R'])
def test_adaboostclassifier_without_sample_weight(algorithm):
X, y = iris.data, iris.target
base_estimator = NoSampleWeightWrapper(DummyClassifier())
clf = AdaBoostClassifier(
base_estimator=base_estimator, algorithm=algorithm
)
err_msg = ("{} doesn't support sample_weight"
.format(base_estimator.__class__.__name__))
with pytest.raises(ValueError, match=err_msg):
clf.fit(X, y)
def test_adaboostregressor_sample_weight():
# check that sample weights have an influence on the error computed
# for a weak learner
rng = np.random.RandomState(42)
X = np.linspace(0, 100, num=1000)
y = (.8 * X + 0.2) + (rng.rand(X.shape[0]) * 0.0001)
X = X.reshape(-1, 1)
# add an arbitrary outlier
X[-1] *= 10
y[-1] = 10000
# random_state=0 ensures that the underlying bootstrap will use the outlier
regr_no_outlier = AdaBoostRegressor(
base_estimator=LinearRegression(), n_estimators=1, random_state=0
)
regr_with_weight = clone(regr_no_outlier)
regr_with_outlier = clone(regr_no_outlier)
# fit 3 models:
# - a model containing the outlier
# - a model without the outlier
# - a model containing the outlier but with a null sample-weight
regr_with_outlier.fit(X, y)
regr_no_outlier.fit(X[:-1], y[:-1])
sample_weight = np.ones_like(y)
sample_weight[-1] = 0
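# A zero weight should make the outlier effectively invisible, so the
# weighted fit is expected to score like the fit that never saw the outlier.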
regr_with_weight.fit(X, y, sample_weight=sample_weight)
score_with_outlier = regr_with_outlier.score(X[:-1], y[:-1])
score_no_outlier = regr_no_outlier.score(X[:-1], y[:-1])
score_with_weight = regr_with_weight.score(X[:-1], y[:-1])
assert score_with_outlier < score_no_outlier
assert score_with_outlier < score_with_weight
assert score_no_outlier == pytest.approx(score_with_weight)
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_adaboost_consistent_predict(algorithm):
# check that predict_proba and predict give consistent results
# regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/14084
X_train, X_test, y_train, y_test = train_test_split(
*datasets.load_digits(return_X_y=True), random_state=42
)
model = AdaBoostClassifier(algorithm=algorithm, random_state=42)
model.fit(X_train, y_train)
assert_array_equal(
np.argmax(model.predict_proba(X_test), axis=1),
model.predict(X_test)
)
@pytest.mark.parametrize(
'model, X, y',
[(AdaBoostClassifier(), iris.data, iris.target),
(AdaBoostRegressor(), boston.data, boston.target)]
)
def test_adaboost_negative_weight_error(model, X, y):
sample_weight = np.ones_like(y)
sample_weight[-1] = -10
err_msg = "sample_weight cannot contain negative weight"
with pytest.raises(ValueError, match=err_msg):
model.fit(X, y, sample_weight=sample_weight)

Some files were not shown because too many files have changed in this diff.