Uploaded Test files

Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions


@@ -0,0 +1,40 @@
"""
The :mod:`sklearn.ensemble` module includes ensemble-based methods for
classification, regression and anomaly detection.
"""
import typing
from ._base import BaseEnsemble
from ._forest import RandomForestClassifier
from ._forest import RandomForestRegressor
from ._forest import RandomTreesEmbedding
from ._forest import ExtraTreesClassifier
from ._forest import ExtraTreesRegressor
from ._bagging import BaggingClassifier
from ._bagging import BaggingRegressor
from ._iforest import IsolationForest
from ._weight_boosting import AdaBoostClassifier
from ._weight_boosting import AdaBoostRegressor
from ._gb import GradientBoostingClassifier
from ._gb import GradientBoostingRegressor
from ._voting import VotingClassifier
from ._voting import VotingRegressor
from ._stacking import StackingClassifier
from ._stacking import StackingRegressor
if typing.TYPE_CHECKING:
# Avoid errors in type checkers (e.g. mypy) for experimental estimators.
# TODO: remove this check once the estimator is no longer experimental.
from ._hist_gradient_boosting.gradient_boosting import ( # noqa
HistGradientBoostingRegressor, HistGradientBoostingClassifier
)
__all__ = ["BaseEnsemble",
"RandomForestClassifier", "RandomForestRegressor",
"RandomTreesEmbedding", "ExtraTreesClassifier",
"ExtraTreesRegressor", "BaggingClassifier",
"BaggingRegressor", "IsolationForest", "GradientBoostingClassifier",
"GradientBoostingRegressor", "AdaBoostClassifier",
"AdaBoostRegressor", "VotingClassifier", "VotingRegressor",
"StackingClassifier", "StackingRegressor",
]
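
Because the histogram-based estimators are still experimental here, they are only imported under ``typing.TYPE_CHECKING``; at run time user code has to enable them explicitly first. A hedged usage sketch (the enabling import becomes a warning-only no-op once the estimators are stable):

# Hedged sketch: enabling and using the experimental estimators at run
# time; the other ensembles listed in __all__ import directly as usual.
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import (
    HistGradientBoostingClassifier,
    RandomForestClassifier,
)

rng = np.random.RandomState(0)
X = rng.rand(100, 4)
y = (X[:, 0] + X[:, 1] > 1).astype(int)

hgb = HistGradientBoostingClassifier(max_iter=20, random_state=0).fit(X, y)
rf = RandomForestClassifier(n_estimators=20, random_state=0).fit(X, y)
print(hgb.score(X, y), rf.score(X, y))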

File diff suppressed because it is too large.


@@ -0,0 +1,287 @@
"""Base class for ensemble-based estimators."""
# Authors: Gilles Louppe
# License: BSD 3 clause
from abc import ABCMeta, abstractmethod
import numbers
import warnings
from typing import List
import numpy as np
from joblib import effective_n_jobs
from ..base import clone
from ..base import is_classifier, is_regressor
from ..base import BaseEstimator
from ..base import MetaEstimatorMixin
from ..utils import Bunch, _print_elapsed_time
from ..utils import check_random_state
from ..utils.metaestimators import _BaseComposition
def _fit_single_estimator(estimator, X, y, sample_weight=None,
message_clsname=None, message=None):
"""Private function used to fit an estimator within a job."""
if sample_weight is not None:
try:
with _print_elapsed_time(message_clsname, message):
estimator.fit(X, y, sample_weight=sample_weight)
except TypeError as exc:
if "unexpected keyword argument 'sample_weight'" in str(exc):
raise TypeError(
"Underlying estimator {} does not support sample weights."
.format(estimator.__class__.__name__)
) from exc
raise
else:
with _print_elapsed_time(message_clsname, message):
estimator.fit(X, y)
return estimator
def _set_random_states(estimator, random_state=None):
"""Set fixed random_state parameters for an estimator.
Finds all parameters ending ``random_state`` and sets them to integers
derived from ``random_state``.
Parameters
----------
estimator : estimator supporting get/set_params
Estimator with potential randomness managed by random_state
parameters.
random_state : int or RandomState, default=None
Pseudo-random number generator to control the generation of the random
integers. Pass an int for reproducible output across multiple function
calls.
See :term:`Glossary <random_state>`.
Notes
-----
This does not necessarily set *all* ``random_state`` attributes that
control an estimator's randomness, only those accessible through
``estimator.get_params()``. ``random_state``s not controlled include
those belonging to:
* cross-validation splitters
* ``scipy.stats`` rvs
"""
random_state = check_random_state(random_state)
to_set = {}
for key in sorted(estimator.get_params(deep=True)):
if key == 'random_state' or key.endswith('__random_state'):
to_set[key] = random_state.randint(np.iinfo(np.int32).max)
if to_set:
estimator.set_params(**to_set)
class BaseEnsemble(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta):
"""Base class for all ensemble classes.
Warning: This class should not be used directly. Use derived classes
instead.
Parameters
----------
base_estimator : object
The base estimator from which the ensemble is built.
n_estimators : int, default=10
The number of estimators in the ensemble.
estimator_params : list of str, default=tuple()
The list of attributes to use as parameters when instantiating a
new base estimator. If none are given, default parameters are used.
Attributes
----------
base_estimator_ : estimator
The base estimator from which the ensemble is grown.
estimators_ : list of estimators
The collection of fitted base estimators.
"""
# overwrite _required_parameters from MetaEstimatorMixin
_required_parameters: List[str] = []
@abstractmethod
def __init__(self, base_estimator, *, n_estimators=10,
estimator_params=tuple()):
# Set parameters
self.base_estimator = base_estimator
self.n_estimators = n_estimators
self.estimator_params = estimator_params
# Don't instantiate estimators now! Parameters of base_estimator might
# still change. Eg., when grid-searching with the nested object syntax.
# self.estimators_ needs to be filled by the derived classes in fit.
def _validate_estimator(self, default=None):
"""Check the estimator and the n_estimator attribute.
Sets the base_estimator_` attributes.
"""
if not isinstance(self.n_estimators, numbers.Integral):
raise ValueError("n_estimators must be an integer, "
"got {0}.".format(type(self.n_estimators)))
if self.n_estimators <= 0:
raise ValueError("n_estimators must be greater than zero, "
"got {0}.".format(self.n_estimators))
if self.base_estimator is not None:
self.base_estimator_ = self.base_estimator
else:
self.base_estimator_ = default
if self.base_estimator_ is None:
raise ValueError("base_estimator cannot be None")
def _make_estimator(self, append=True, random_state=None):
"""Make and configure a copy of the `base_estimator_` attribute.
Warning: This method should be used to properly instantiate new
sub-estimators.
"""
estimator = clone(self.base_estimator_)
estimator.set_params(**{p: getattr(self, p)
for p in self.estimator_params})
if random_state is not None:
_set_random_states(estimator, random_state)
if append:
self.estimators_.append(estimator)
return estimator
def __len__(self):
"""Return the number of estimators in the ensemble."""
return len(self.estimators_)
def __getitem__(self, index):
"""Return the index'th estimator in the ensemble."""
return self.estimators_[index]
def __iter__(self):
"""Return iterator over estimators in the ensemble."""
return iter(self.estimators_)
def _partition_estimators(n_estimators, n_jobs):
"""Private function used to partition estimators between jobs."""
# Compute the number of jobs
n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
# Partition estimators between jobs
n_estimators_per_job = np.full(n_jobs, n_estimators // n_jobs,
dtype=int)
n_estimators_per_job[:n_estimators % n_jobs] += 1
starts = np.cumsum(n_estimators_per_job)
return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist()
class _BaseHeterogeneousEnsemble(MetaEstimatorMixin, _BaseComposition,
metaclass=ABCMeta):
"""Base class for heterogeneous ensemble of learners.
Parameters
----------
estimators : list of (str, estimator) tuples
The ensemble of estimators to use in the ensemble. Each element of the
list is defined as a tuple of string (i.e. name of the estimator) and
an estimator instance. An estimator can be set to `'drop'` using
`set_params`.
Attributes
----------
estimators_ : list of estimators
The elements of the estimators parameter, having been fitted on the
training data. If an estimator has been set to `'drop'`, it will not
appear in `estimators_`.
"""
_required_parameters = ['estimators']
@property
def named_estimators(self):
return Bunch(**dict(self.estimators))
@abstractmethod
def __init__(self, estimators):
self.estimators = estimators
def _validate_estimators(self):
if self.estimators is None or len(self.estimators) == 0:
raise ValueError(
"Invalid 'estimators' attribute, 'estimators' should be a list"
" of (string, estimator) tuples."
)
names, estimators = zip(*self.estimators)
# defined by MetaEstimatorMixin
self._validate_names(names)
# FIXME: deprecate the usage of None to drop an estimator from the
# ensemble. Remove in 0.24
if any(est is None for est in estimators):
warnings.warn(
"Using 'None' to drop an estimator from the ensemble is "
"deprecated in 0.22 and support will be dropped in 0.24. "
"Use the string 'drop' instead.", FutureWarning
)
has_estimator = any(est not in (None, 'drop') for est in estimators)
if not has_estimator:
raise ValueError(
"All estimators are dropped. At least one is required "
"to be an estimator."
)
is_estimator_type = (is_classifier if is_classifier(self)
else is_regressor)
for est in estimators:
if est not in (None, 'drop') and not is_estimator_type(est):
raise ValueError(
"The estimator {} should be a {}.".format(
est.__class__.__name__, is_estimator_type.__name__[3:]
)
)
return names, estimators
def set_params(self, **params):
"""
Set the parameters of an estimator from the ensemble.
Valid parameter keys can be listed with `get_params()`.
Parameters
----------
**params : keyword arguments
Specific parameters using e.g.
`set_params(parameter_name=new_value)`. In addition to setting the
parameters of the ensemble estimator, the individual estimators of
the ensemble can also be set, or can be removed by
setting them to 'drop'.
"""
super()._set_params('estimators', **params)
return self
def get_params(self, deep=True):
"""
Get the parameters of an estimator from the ensemble.
Parameters
----------
deep : bool, default=True
Setting it to True gets the various classifiers and the parameters
of the classifiers as well.
"""
return super()._get_params('estimators', deep=deep)
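
A hedged sketch of the two private helpers defined above, assuming the import path sklearn.ensemble._base where this file lives (private API, signatures as in this file):

# Hedged sketch of _partition_estimators and _set_random_states.
import numpy as np
from sklearn.ensemble._base import _partition_estimators, _set_random_states
from sklearn.tree import DecisionTreeClassifier

# 10 estimators over 3 jobs: the jobs receive 4, 3 and 3 estimators and
# `starts` holds the cumulative boundaries [0, 4, 7, 10].
n_jobs, n_estimators_per_job, starts = _partition_estimators(10, 3)
print(n_jobs, n_estimators_per_job, starts)

# Every parameter named random_state (nested ones included) is set to a
# fixed integer drawn from the provided RNG.
tree = DecisionTreeClassifier()
_set_random_states(tree, random_state=np.random.RandomState(0))
print(tree.random_state)  # an int, reproducible for the same seed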

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -0,0 +1,881 @@
"""Losses and corresponding default initial estimators for gradient boosting
decision trees.
"""
from abc import ABCMeta
from abc import abstractmethod
import numpy as np
from scipy.special import expit, logsumexp
from ..tree._tree import TREE_LEAF
from ..utils.stats import _weighted_percentile
from ..dummy import DummyClassifier
from ..dummy import DummyRegressor
class LossFunction(metaclass=ABCMeta):
"""Abstract base class for various loss functions.
Parameters
----------
n_classes : int
Number of classes.
Attributes
----------
K : int
The number of regression trees to be induced;
1 for regression and binary classification;
``n_classes`` for multi-class classification.
"""
is_multi_class = False
def __init__(self, n_classes):
self.K = n_classes
def init_estimator(self):
"""Default ``init`` estimator for loss function. """
raise NotImplementedError()
@abstractmethod
def __call__(self, y, raw_predictions, sample_weight=None):
"""Compute the loss.
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves).
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
@abstractmethod
def negative_gradient(self, y, raw_predictions, **kargs):
"""Compute the negative gradient.
Parameters
----------
y : ndarray of shape (n_samples,)
The target labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
"""
def update_terminal_regions(self, tree, X, y, residual, raw_predictions,
sample_weight, sample_mask,
learning_rate=0.1, k=0):
"""Update the terminal regions (=leaves) of the given tree and
updates the current predictions of the model. Traverses tree
and invokes template method `_update_terminal_region`.
Parameters
----------
tree : tree.Tree
The tree object.
X : ndarray of shape (n_samples, n_features)
The data array.
y : ndarray of shape (n_samples,)
The target labels.
residual : ndarray of shape (n_samples,)
The residuals (usually the negative gradient).
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
sample_weight : ndarray of shape (n_samples,)
The weight of each sample.
sample_mask : ndarray of shape (n_samples,)
The sample mask to be used.
learning_rate : float, default=0.1
Learning rate shrinks the contribution of each tree by
``learning_rate``.
k : int, default=0
The index of the estimator being updated.
"""
# compute leaf for each sample in ``X``.
terminal_regions = tree.apply(X)
# mask all which are not in sample mask.
masked_terminal_regions = terminal_regions.copy()
masked_terminal_regions[~sample_mask] = -1
# update each leaf (= perform line search)
for leaf in np.where(tree.children_left == TREE_LEAF)[0]:
self._update_terminal_region(tree, masked_terminal_regions,
leaf, X, y, residual,
raw_predictions[:, k], sample_weight)
# update predictions (both in-bag and out-of-bag)
raw_predictions[:, k] += \
learning_rate * tree.value[:, 0, 0].take(terminal_regions, axis=0)
@abstractmethod
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
residual, raw_predictions, sample_weight):
"""Template method for updating terminal regions (i.e., leaves)."""
@abstractmethod
def get_init_raw_predictions(self, X, estimator):
"""Return the initial raw predictions.
Parameters
----------
X : ndarray of shape (n_samples, n_features)
The data array.
estimator : object
The estimator to use to compute the predictions.
Returns
-------
raw_predictions : ndarray of shape (n_samples, K)
The initial raw predictions. K is equal to 1 for binary
classification and regression, and equal to the number of classes
for multiclass classification. ``raw_predictions`` is cast
into float64.
"""
pass
class RegressionLossFunction(LossFunction, metaclass=ABCMeta):
"""Base class for regression loss functions.
Parameters
----------
n_classes : int
Number of classes.
"""
def __init__(self, n_classes):
if n_classes != 1:
raise ValueError("``n_classes`` must be 1 for regression but "
"was %r" % n_classes)
super().__init__(n_classes)
def check_init_estimator(self, estimator):
"""Make sure estimator has the required fit and predict methods.
Parameters
----------
estimator : object
The init estimator to check.
"""
if not (hasattr(estimator, 'fit') and hasattr(estimator, 'predict')):
raise ValueError(
"The init parameter must be a valid estimator and "
"support both fit and predict."
)
def get_init_raw_predictions(self, X, estimator):
predictions = estimator.predict(X)
return predictions.reshape(-1, 1).astype(np.float64)
class LeastSquaresError(RegressionLossFunction):
"""Loss function for least squares (LS) estimation.
Terminal regions do not need to be updated for least squares.
Parameters
----------
n_classes : int
Number of classes.
"""
def init_estimator(self):
return DummyRegressor(strategy='mean')
def __call__(self, y, raw_predictions, sample_weight=None):
"""Compute the least squares loss.
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves).
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
if sample_weight is None:
return np.mean((y - raw_predictions.ravel()) ** 2)
else:
return (1 / sample_weight.sum() * np.sum(
sample_weight * ((y - raw_predictions.ravel()) ** 2)))
def negative_gradient(self, y, raw_predictions, **kargs):
"""Compute the negative gradient.
Parameters
----------
y : ndarray of shape (n_samples,)
The target labels.
raw_predictions : ndarray of shape (n_samples,)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
"""
return y - raw_predictions.ravel()
def update_terminal_regions(self, tree, X, y, residual, raw_predictions,
sample_weight, sample_mask,
learning_rate=0.1, k=0):
"""Least squares does not need to update terminal regions.
But it has to update the predictions.
Parameters
----------
tree : tree.Tree
The tree object.
X : ndarray of shape (n_samples, n_features)
The data array.
y : ndarray of shape (n_samples,)
The target labels.
residual : ndarray of shape (n_samples,)
The residuals (usually the negative gradient).
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
sample_weight : ndarray of shape (n,)
The weight of each sample.
sample_mask : ndarray of shape (n,)
The sample mask to be used.
learning_rate : float, default=0.1
Learning rate shrinks the contribution of each tree by
``learning_rate``.
k : int, default=0
The index of the estimator being updated.
"""
# update predictions
raw_predictions[:, k] += learning_rate * tree.predict(X).ravel()
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
residual, raw_predictions, sample_weight):
pass
class LeastAbsoluteError(RegressionLossFunction):
"""Loss function for least absolute deviation (LAD) regression.
Parameters
----------
n_classes : int
Number of classes
"""
def init_estimator(self):
return DummyRegressor(strategy='quantile', quantile=.5)
def __call__(self, y, raw_predictions, sample_weight=None):
"""Compute the least absolute error.
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves).
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
if sample_weight is None:
return np.abs(y - raw_predictions.ravel()).mean()
else:
return (1 / sample_weight.sum() * np.sum(
sample_weight * np.abs(y - raw_predictions.ravel())))
def negative_gradient(self, y, raw_predictions, **kargs):
"""Compute the negative gradient.
1.0 if y - raw_predictions > 0.0 else -1.0
Parameters
----------
y : ndarray of shape (n_samples,)
The target labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
"""
raw_predictions = raw_predictions.ravel()
return 2 * (y - raw_predictions > 0) - 1
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
residual, raw_predictions, sample_weight):
"""LAD updates terminal regions to median estimates."""
terminal_region = np.where(terminal_regions == leaf)[0]
sample_weight = sample_weight.take(terminal_region, axis=0)
diff = (y.take(terminal_region, axis=0) -
raw_predictions.take(terminal_region, axis=0))
tree.value[leaf, 0, 0] = _weighted_percentile(diff, sample_weight,
percentile=50)
class HuberLossFunction(RegressionLossFunction):
"""Huber loss function for robust regression.
M-Regression proposed in Friedman 2001.
Parameters
----------
n_classes : int
Number of classes.
alpha : float, default=0.9
Percentile at which to extract score.
References
----------
J. Friedman, Greedy Function Approximation: A Gradient Boosting
Machine, The Annals of Statistics, Vol. 29, No. 5, 2001.
"""
def __init__(self, n_classes, alpha=0.9):
super().__init__(n_classes)
self.alpha = alpha
self.gamma = None
def init_estimator(self):
return DummyRegressor(strategy='quantile', quantile=.5)
def __call__(self, y, raw_predictions, sample_weight=None):
"""Compute the Huber loss.
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble.
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
raw_predictions = raw_predictions.ravel()
diff = y - raw_predictions
gamma = self.gamma
if gamma is None:
if sample_weight is None:
gamma = np.percentile(np.abs(diff), self.alpha * 100)
else:
gamma = _weighted_percentile(np.abs(diff), sample_weight,
self.alpha * 100)
gamma_mask = np.abs(diff) <= gamma
if sample_weight is None:
sq_loss = np.sum(0.5 * diff[gamma_mask] ** 2)
lin_loss = np.sum(gamma * (np.abs(diff[~gamma_mask]) -
gamma / 2))
loss = (sq_loss + lin_loss) / y.shape[0]
else:
sq_loss = np.sum(0.5 * sample_weight[gamma_mask] *
diff[gamma_mask] ** 2)
lin_loss = np.sum(gamma * sample_weight[~gamma_mask] *
(np.abs(diff[~gamma_mask]) - gamma / 2))
loss = (sq_loss + lin_loss) / sample_weight.sum()
return loss
def negative_gradient(self, y, raw_predictions, sample_weight=None,
**kargs):
"""Compute the negative gradient.
Parameters
----------
y : ndarray of shape (n_samples,)
The target labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
raw_predictions = raw_predictions.ravel()
diff = y - raw_predictions
if sample_weight is None:
gamma = np.percentile(np.abs(diff), self.alpha * 100)
else:
gamma = _weighted_percentile(np.abs(diff), sample_weight,
self.alpha * 100)
gamma_mask = np.abs(diff) <= gamma
residual = np.zeros((y.shape[0],), dtype=np.float64)
residual[gamma_mask] = diff[gamma_mask]
residual[~gamma_mask] = gamma * np.sign(diff[~gamma_mask])
self.gamma = gamma
return residual
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
residual, raw_predictions, sample_weight):
terminal_region = np.where(terminal_regions == leaf)[0]
sample_weight = sample_weight.take(terminal_region, axis=0)
gamma = self.gamma
diff = (y.take(terminal_region, axis=0)
- raw_predictions.take(terminal_region, axis=0))
median = _weighted_percentile(diff, sample_weight, percentile=50)
diff_minus_median = diff - median
tree.value[leaf, 0] = median + np.mean(
np.sign(diff_minus_median) *
np.minimum(np.abs(diff_minus_median), gamma))
class QuantileLossFunction(RegressionLossFunction):
"""Loss function for quantile regression.
Quantile regression allows estimating the percentiles
of the conditional distribution of the target.
Parameters
----------
n_classes : int
Number of classes.
alpha : float, default=0.9
The percentile.
"""
def __init__(self, n_classes, alpha=0.9):
super().__init__(n_classes)
self.alpha = alpha
self.percentile = alpha * 100
def init_estimator(self):
return DummyRegressor(strategy='quantile', quantile=self.alpha)
def __call__(self, y, raw_predictions, sample_weight=None):
"""Compute the Quantile loss.
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble.
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
raw_predictions = raw_predictions.ravel()
diff = y - raw_predictions
alpha = self.alpha
mask = y > raw_predictions
if sample_weight is None:
loss = (alpha * diff[mask].sum() -
(1 - alpha) * diff[~mask].sum()) / y.shape[0]
else:
loss = ((alpha * np.sum(sample_weight[mask] * diff[mask]) -
(1 - alpha) * np.sum(sample_weight[~mask] *
diff[~mask])) / sample_weight.sum())
return loss
def negative_gradient(self, y, raw_predictions, **kargs):
"""Compute the negative gradient.
Parameters
----------
y : ndarray of shape (n_samples,)
The target labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
"""
alpha = self.alpha
raw_predictions = raw_predictions.ravel()
mask = y > raw_predictions
return (alpha * mask) - ((1 - alpha) * ~mask)
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
residual, raw_predictions, sample_weight):
terminal_region = np.where(terminal_regions == leaf)[0]
diff = (y.take(terminal_region, axis=0)
- raw_predictions.take(terminal_region, axis=0))
sample_weight = sample_weight.take(terminal_region, axis=0)
val = _weighted_percentile(diff, sample_weight, self.percentile)
tree.value[leaf, 0] = val
class ClassificationLossFunction(LossFunction, metaclass=ABCMeta):
"""Base class for classification loss functions. """
def _raw_prediction_to_proba(self, raw_predictions):
"""Template method to convert raw predictions into probabilities.
Parameters
----------
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble.
Returns
-------
probas : ndarray of shape (n_samples, K)
The predicted probabilities.
"""
@abstractmethod
def _raw_prediction_to_decision(self, raw_predictions):
"""Template method to convert raw predictions to decisions.
Parameters
----------
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble.
Returns
-------
encoded_predictions : ndarray of shape (n_samples, K)
The predicted encoded labels.
"""
def check_init_estimator(self, estimator):
"""Make sure estimator has fit and predict_proba methods.
Parameters
----------
estimator : object
The init estimator to check.
"""
if not (hasattr(estimator, 'fit') and
hasattr(estimator, 'predict_proba')):
raise ValueError(
"The init parameter must be a valid estimator "
"and support both fit and predict_proba."
)
class BinomialDeviance(ClassificationLossFunction):
"""Binomial deviance loss function for binary classification.
Binary classification is a special case; here, we only need to
fit one tree instead of ``n_classes`` trees.
Parameters
----------
n_classes : int
Number of classes.
"""
def __init__(self, n_classes):
if n_classes != 2:
raise ValueError("{0:s} requires 2 classes; got {1:d} class(es)"
.format(self.__class__.__name__, n_classes))
# we only need to fit one tree for binary clf.
super().__init__(n_classes=1)
def init_estimator(self):
# return the most common class, taking into account the samples
# weights
return DummyClassifier(strategy='prior')
def __call__(self, y, raw_predictions, sample_weight=None):
"""Compute the deviance (= 2 * negative log-likelihood).
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble.
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
# logaddexp(0, v) == log(1.0 + exp(v))
raw_predictions = raw_predictions.ravel()
if sample_weight is None:
return -2 * np.mean((y * raw_predictions) -
np.logaddexp(0, raw_predictions))
else:
return (-2 / sample_weight.sum() * np.sum(
sample_weight * ((y * raw_predictions) -
np.logaddexp(0, raw_predictions))))
def negative_gradient(self, y, raw_predictions, **kargs):
"""Compute the residual (= negative gradient).
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
"""
return y - expit(raw_predictions.ravel())
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
residual, raw_predictions, sample_weight):
"""Make a single Newton-Raphson step.
our node estimate is given by:
sum(w * (y - prob)) / sum(w * prob * (1 - prob))
we take advantage that: y - prob = residual
"""
terminal_region = np.where(terminal_regions == leaf)[0]
residual = residual.take(terminal_region, axis=0)
y = y.take(terminal_region, axis=0)
sample_weight = sample_weight.take(terminal_region, axis=0)
numerator = np.sum(sample_weight * residual)
denominator = np.sum(sample_weight *
(y - residual) * (1 - y + residual))
# prevents overflow and division by zero
if abs(denominator) < 1e-150:
tree.value[leaf, 0, 0] = 0.0
else:
tree.value[leaf, 0, 0] = numerator / denominator
def _raw_prediction_to_proba(self, raw_predictions):
proba = np.ones((raw_predictions.shape[0], 2), dtype=np.float64)
proba[:, 1] = expit(raw_predictions.ravel())
proba[:, 0] -= proba[:, 1]
return proba
def _raw_prediction_to_decision(self, raw_predictions):
proba = self._raw_prediction_to_proba(raw_predictions)
return np.argmax(proba, axis=1)
def get_init_raw_predictions(self, X, estimator):
probas = estimator.predict_proba(X)
proba_pos_class = probas[:, 1]
eps = np.finfo(np.float32).eps
proba_pos_class = np.clip(proba_pos_class, eps, 1 - eps)
# log(x / (1 - x)) is the inverse of the sigmoid (expit) function
raw_predictions = np.log(proba_pos_class / (1 - proba_pos_class))
return raw_predictions.reshape(-1, 1).astype(np.float64)
class MultinomialDeviance(ClassificationLossFunction):
"""Multinomial deviance loss function for multi-class classification.
For multi-class classification we need to fit ``n_classes`` trees at
each stage.
Parameters
----------
n_classes : int
Number of classes.
"""
is_multi_class = True
def __init__(self, n_classes):
if n_classes < 3:
raise ValueError("{0:s} requires more than 2 classes.".format(
self.__class__.__name__))
super().__init__(n_classes)
def init_estimator(self):
return DummyClassifier(strategy='prior')
def __call__(self, y, raw_predictions, sample_weight=None):
"""Compute the Multinomial deviance.
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble.
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
# create one-hot label encoding
Y = np.zeros((y.shape[0], self.K), dtype=np.float64)
for k in range(self.K):
Y[:, k] = y == k
return np.average(
-1 * (Y * raw_predictions).sum(axis=1) +
logsumexp(raw_predictions, axis=1),
weights=sample_weight
)
def negative_gradient(self, y, raw_predictions, k=0, **kwargs):
"""Compute negative gradient for the ``k``-th class.
Parameters
----------
y : ndarray of shape (n_samples,)
The target labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
k : int, default=0
The index of the class.
"""
return y - np.nan_to_num(np.exp(raw_predictions[:, k] -
logsumexp(raw_predictions, axis=1)))
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
residual, raw_predictions, sample_weight):
"""Make a single Newton-Raphson step. """
terminal_region = np.where(terminal_regions == leaf)[0]
residual = residual.take(terminal_region, axis=0)
y = y.take(terminal_region, axis=0)
sample_weight = sample_weight.take(terminal_region, axis=0)
numerator = np.sum(sample_weight * residual)
numerator *= (self.K - 1) / self.K
denominator = np.sum(sample_weight * (y - residual) *
(1 - y + residual))
# prevents overflow and division by zero
if abs(denominator) < 1e-150:
tree.value[leaf, 0, 0] = 0.0
else:
tree.value[leaf, 0, 0] = numerator / denominator
def _raw_prediction_to_proba(self, raw_predictions):
return np.nan_to_num(
np.exp(raw_predictions -
(logsumexp(raw_predictions, axis=1)[:, np.newaxis])))
def _raw_prediction_to_decision(self, raw_predictions):
proba = self._raw_prediction_to_proba(raw_predictions)
return np.argmax(proba, axis=1)
def get_init_raw_predictions(self, X, estimator):
probas = estimator.predict_proba(X)
eps = np.finfo(np.float32).eps
probas = np.clip(probas, eps, 1 - eps)
raw_predictions = np.log(probas).astype(np.float64)
return raw_predictions
class ExponentialLoss(ClassificationLossFunction):
"""Exponential loss function for binary classification.
Same loss as AdaBoost.
Parameters
----------
n_classes : int
Number of classes.
References
----------
Greg Ridgeway, Generalized Boosted Models: A guide to the gbm package, 2007
"""
def __init__(self, n_classes):
if n_classes != 2:
raise ValueError("{0:s} requires 2 classes; got {1:d} class(es)"
.format(self.__class__.__name__, n_classes))
# we only need to fit one tree for binary clf.
super().__init__(n_classes=1)
def init_estimator(self):
return DummyClassifier(strategy='prior')
def __call__(self, y, raw_predictions, sample_weight=None):
"""Compute the exponential loss
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble.
sample_weight : ndarray of shape (n_samples,), default=None
Sample weights.
"""
raw_predictions = raw_predictions.ravel()
if sample_weight is None:
return np.mean(np.exp(-(2. * y - 1.) * raw_predictions))
else:
return (1.0 / sample_weight.sum() * np.sum(
sample_weight * np.exp(-(2 * y - 1) * raw_predictions)))
def negative_gradient(self, y, raw_predictions, **kargs):
"""Compute the residual (= negative gradient).
Parameters
----------
y : ndarray of shape (n_samples,)
True labels.
raw_predictions : ndarray of shape (n_samples, K)
The raw predictions (i.e. values from the tree leaves) of the
tree ensemble at iteration ``i - 1``.
"""
y_ = -(2. * y - 1.)
return y_ * np.exp(y_ * raw_predictions.ravel())
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
residual, raw_predictions, sample_weight):
terminal_region = np.where(terminal_regions == leaf)[0]
raw_predictions = raw_predictions.take(terminal_region, axis=0)
y = y.take(terminal_region, axis=0)
sample_weight = sample_weight.take(terminal_region, axis=0)
y_ = 2. * y - 1.
numerator = np.sum(y_ * sample_weight * np.exp(-y_ * raw_predictions))
denominator = np.sum(sample_weight * np.exp(-y_ * raw_predictions))
# prevents overflow and division by zero
if abs(denominator) < 1e-150:
tree.value[leaf, 0, 0] = 0.0
else:
tree.value[leaf, 0, 0] = numerator / denominator
def _raw_prediction_to_proba(self, raw_predictions):
proba = np.ones((raw_predictions.shape[0], 2), dtype=np.float64)
proba[:, 1] = expit(2.0 * raw_predictions.ravel())
proba[:, 0] -= proba[:, 1]
return proba
def _raw_prediction_to_decision(self, raw_predictions):
return (raw_predictions.ravel() >= 0).astype(int)
def get_init_raw_predictions(self, X, estimator):
probas = estimator.predict_proba(X)
proba_pos_class = probas[:, 1]
eps = np.finfo(np.float32).eps
proba_pos_class = np.clip(proba_pos_class, eps, 1 - eps)
# according to The Elements of Statistical Learning sec. 10.5, the
# minimizer of the exponential loss is .5 * log odds ratio. So this is
# the equivalent to .5 * binomial_deviance.get_init_raw_predictions()
raw_predictions = .5 * np.log(proba_pos_class / (1 - proba_pos_class))
return raw_predictions.reshape(-1, 1).astype(np.float64)
LOSS_FUNCTIONS = {
'ls': LeastSquaresError,
'lad': LeastAbsoluteError,
'huber': HuberLossFunction,
'quantile': QuantileLossFunction,
'deviance': None, # for both, multinomial and binomial
'exponential': ExponentialLoss,
}
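
A hedged sketch of the loss API above on toy data, written against the signatures in this file (all of these classes are private; the module path sklearn.ensemble._gb_losses is assumed):

# Hedged sketch of the loss objects above (private API, path assumed).
import numpy as np
from sklearn.ensemble._gb_losses import LOSS_FUNCTIONS, BinomialDeviance

y = np.array([3.0, 1.0, 2.0])
raw = np.zeros((3, 1))  # shape (n_samples, K); K = 1 for regression

ls = LOSS_FUNCTIONS['ls'](n_classes=1)   # LeastSquaresError
print(ls(y, raw))                        # mean squared error: ~4.667
print(ls.negative_gradient(y, raw))      # residuals y - raw: [3. 1. 2.]

y_bin = np.array([0.0, 1.0, 1.0])
bd = BinomialDeviance(n_classes=2)       # binary case fits a single tree
print(bd(y_bin, np.zeros((3, 1))))       # 2 * log(2) at raw predictions 0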


@@ -0,0 +1,5 @@
"""This module implements histogram-based gradient boosting estimators.
The implementation is a port from pygbm which is itself strongly inspired
from LightGBM.
"""


@@ -0,0 +1,204 @@
"""
This module contains the BinMapper class.
BinMapper is used for mapping a real-valued dataset into integer-valued bins.
Bin thresholds are computed with the quantiles so that each bin contains
approximately the same number of samples.
"""
# Author: Nicolas Hug
import numpy as np
from ...utils import check_random_state, check_array
from ...base import BaseEstimator, TransformerMixin
from ...utils.validation import check_is_fitted
from ._binning import _map_to_bins
from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF
def _find_binning_thresholds(data, max_bins, subsample, random_state):
"""Extract feature-wise quantiles from numerical data.
Missing values are ignored for finding the thresholds.
Parameters
----------
data : array-like, shape (n_samples, n_features)
The data to bin.
max_bins: int
The maximum number of bins to use for non-missing values. If for a
given feature the number of unique values is less than ``max_bins``,
then those unique values will be used to compute the bin thresholds,
instead of the quantiles.
subsample : int or None
If ``n_samples > subsample``, then ``subsample`` samples will be
randomly chosen to compute the quantiles. If ``None``, the whole data
is used.
random_state: int, RandomState instance or None
Pseudo-random number generator to control the random sub-sampling.
Pass an int for reproducible output across multiple
function calls.
See :term:`Glossary <random_state>`.
Returns
------
binning_thresholds: list of arrays
For each feature, stores the increasing numeric values that can
be used to separate the bins. Thus ``len(binning_thresholds) ==
n_features``.
"""
rng = check_random_state(random_state)
if subsample is not None and data.shape[0] > subsample:
subset = rng.choice(data.shape[0], subsample, replace=False)
data = data.take(subset, axis=0)
binning_thresholds = []
for f_idx in range(data.shape[1]):
col_data = data[:, f_idx]
# ignore missing values when computing bin thresholds
missing_mask = np.isnan(col_data)
if missing_mask.any():
col_data = col_data[~missing_mask]
col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)
distinct_values = np.unique(col_data)
if len(distinct_values) <= max_bins:
midpoints = distinct_values[:-1] + distinct_values[1:]
midpoints *= .5
else:
# We sort again the data in this case. We could compute
# approximate midpoint percentiles using the output of
# np.unique(col_data, return_counts) instead but this is more
# work and the performance benefit will be limited because we
# work on a fixed-size subsample of the full data.
percentiles = np.linspace(0, 100, num=max_bins + 1)
percentiles = percentiles[1:-1]
midpoints = np.percentile(col_data, percentiles,
interpolation='midpoint').astype(X_DTYPE)
assert midpoints.shape[0] == max_bins - 1
# We avoid having +inf thresholds: +inf thresholds are only allowed in
# a "split on nan" situation.
np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints)
binning_thresholds.append(midpoints)
return binning_thresholds
class _BinMapper(TransformerMixin, BaseEstimator):
"""Transformer that maps a dataset into integer-valued bins.
The bins are created in a feature-wise fashion, using quantiles so that
each bin contains approximately the same number of samples.
For large datasets, quantiles are computed on a subset of the data to
speed up the binning, but the quantiles should remain stable.
Features with a small number of values may be binned into fewer than
``n_bins`` bins. The last bin (at index ``n_bins - 1``) is always reserved
for missing values.
Parameters
----------
n_bins : int, optional (default=256)
The maximum number of bins to use (including the bin for missing
values). Non-missing values are binned on ``max_bins = n_bins - 1``
bins. The last bin is always reserved for missing values. If for a
given feature the number of unique values is less than ``max_bins``,
then those unique values will be used to compute the bin thresholds,
instead of the quantiles.
subsample : int or None, optional (default=2e5)
If ``n_samples > subsample``, then ``subsample`` samples will be
randomly chosen to compute the quantiles. If ``None``, the whole data
is used.
random_state: int, RandomState instance or None
Pseudo-random number generator to control the random sub-sampling.
Pass an int for reproducible output across multiple
function calls.
See :term:`Glossary <random_state>`.
Attributes
----------
bin_thresholds_ : list of arrays
For each feature, gives the real-valued bin thresholds. There are
``max_bins - 1`` thresholds, where ``max_bins = n_bins - 1`` is the
number of bins used for non-missing values.
n_bins_non_missing_ : array of uint32
For each feature, gives the number of bins actually used for
non-missing values. For features with a lot of unique values, this is
equal to ``n_bins - 1``.
missing_values_bin_idx_ : uint8
The index of the bin where missing values are mapped. This is a
constant across all features. This corresponds to the last bin, and
it is always equal to ``n_bins - 1``. Note that if ``n_bins_non_missing_``
is less than ``n_bins - 1`` for a given feature, then there are
empty (and unused) bins.
"""
def __init__(self, n_bins=256, subsample=int(2e5), random_state=None):
self.n_bins = n_bins
self.subsample = subsample
self.random_state = random_state
def fit(self, X, y=None):
"""Fit data X by computing the binning thresholds.
The last bin is reserved for missing values, whether missing values
are present in the data or not.
Parameters
----------
X : array-like, shape (n_samples, n_features)
The data to bin.
y: None
Ignored.
Returns
-------
self : object
"""
if not (3 <= self.n_bins <= 256):
# min is 3: at least 2 distinct bins and a missing values bin
raise ValueError('n_bins={} should be no smaller than 3 '
'and no larger than 256.'.format(self.n_bins))
X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
max_bins = self.n_bins - 1
self.bin_thresholds_ = _find_binning_thresholds(
X, max_bins, subsample=self.subsample,
random_state=self.random_state)
self.n_bins_non_missing_ = np.array(
[thresholds.shape[0] + 1 for thresholds in self.bin_thresholds_],
dtype=np.uint32)
self.missing_values_bin_idx_ = self.n_bins - 1
return self
def transform(self, X):
"""Bin data X.
Missing values will be mapped to the last bin.
Parameters
----------
X : array-like, shape (n_samples, n_features)
The data to bin.
Returns
-------
X_binned : array-like, shape (n_samples, n_features)
The binned data (fortran-aligned).
"""
X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
check_is_fitted(self)
if X.shape[1] != self.n_bins_non_missing_.shape[0]:
raise ValueError(
'This estimator was fitted with {} features but {} got passed '
'to transform()'.format(self.n_bins_non_missing_.shape[0],
X.shape[1])
)
binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F')
_map_to_bins(X, self.bin_thresholds_, self.missing_values_bin_idx_,
binned)
return binned
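
A hedged sketch of the mapper above on toy data, using the private import path this file lives at (sklearn.ensemble._hist_gradient_boosting.binning):

# Hedged sketch of _BinMapper (private class, path assumed).
import numpy as np
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper

rng = np.random.RandomState(0)
X = rng.randn(1000, 2)
X[::50, 0] = np.nan  # some missing values in the first feature

mapper = _BinMapper(n_bins=16).fit(X)
X_binned = mapper.transform(X)
print(X_binned.dtype)        # uint8
print(X_binned.max(axis=0))  # missing values go to the last bin (15)
print([t.shape[0] for t in mapper.bin_thresholds_])  # 14 thresholds each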


@@ -0,0 +1,40 @@
# cython: language_level=3
import numpy as np
cimport numpy as np
np.import_array()
ctypedef np.npy_float64 X_DTYPE_C
ctypedef np.npy_uint8 X_BINNED_DTYPE_C
ctypedef np.npy_float64 Y_DTYPE_C
ctypedef np.npy_float32 G_H_DTYPE_C
cdef packed struct hist_struct:
# Same as histogram dtype but we need a struct to declare views. It needs
# to be packed since by default numpy dtypes aren't aligned
Y_DTYPE_C sum_gradients
Y_DTYPE_C sum_hessians
unsigned int count
cdef packed struct node_struct:
# Equivalent struct to PREDICTOR_RECORD_DTYPE to use in memory views. It
# needs to be packed since by default numpy dtypes aren't aligned
Y_DTYPE_C value
unsigned int count
unsigned int feature_idx
X_DTYPE_C threshold
unsigned char missing_go_to_left
unsigned int left
unsigned int right
Y_DTYPE_C gain
unsigned int depth
unsigned char is_leaf
X_BINNED_DTYPE_C bin_threshold
cpdef enum MonotonicConstraint:
NO_CST = 0
POS = 1
NEG = -1
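
The structs above are declared ``packed`` because the numpy structured dtypes they mirror are unaligned by default; a small numpy sketch of the resulting size difference (field names follow ``hist_struct`` for illustration):

# Sketch: numpy structured dtypes are packed (unaligned) by default,
# so the matching Cython structs must be declared `packed` as well.
import numpy as np

HISTOGRAM_DTYPE = np.dtype([
    ('sum_gradients', np.float64),  # Y_DTYPE_C
    ('sum_hessians', np.float64),   # Y_DTYPE_C
    ('count', np.uint32),           # unsigned int
])
print(HISTOGRAM_DTYPE.itemsize)                              # 20, no padding
print(np.dtype(HISTOGRAM_DTYPE.descr, align=True).itemsize)  # 24, padded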


@@ -0,0 +1,571 @@
"""
This module contains the TreeGrower class.
TreeGrower builds a regression tree fitting a Newton-Raphson step, based on
the gradients and hessians of the training data.
"""
# Author: Nicolas Hug
from heapq import heappush, heappop
import numpy as np
from timeit import default_timer as time
import numbers
from .splitting import Splitter
from .histogram import HistogramBuilder
from .predictor import TreePredictor
from .utils import sum_parallel
from .common import PREDICTOR_RECORD_DTYPE
from .common import Y_DTYPE
from .common import MonotonicConstraint
EPS = np.finfo(Y_DTYPE).eps # to avoid zero division errors
class TreeNode:
"""Tree Node class used in TreeGrower.
This isn't used for prediction purposes, only for training (see
TreePredictor).
Parameters
----------
depth : int
The depth of the node, i.e. its distance from the root.
sample_indices : ndarray of unsigned int, shape (n_samples_at_node,)
The indices of the samples at the node.
sum_gradients : float
The sum of the gradients of the samples at the node.
sum_hessians : float
The sum of the hessians of the samples at the node.
parent : TreeNode or None, optional (default=None)
The parent of the node. None for root.
Attributes
----------
depth : int
The depth of the node, i.e. its distance from the root.
sample_indices : ndarray of unsigned int, shape (n_samples_at_node,)
The indices of the samples at the node.
sum_gradients : float
The sum of the gradients of the samples at the node.
sum_hessians : float
The sum of the hessians of the samples at the node.
parent : TreeNode or None
The parent of the node. None for root.
split_info : SplitInfo or None
The result of the split evaluation.
left_child : TreeNode or None
The left child of the node. None for leaves.
right_child : TreeNode or None
The right child of the node. None for leaves.
value : float or None
The value of the leaf, as computed in finalize_leaf(). None for
non-leaf nodes.
partition_start : int
start position of the node's sample_indices in splitter.partition.
partition_stop : int
stop position of the node's sample_indices in splitter.partition.
"""
split_info = None
left_child = None
right_child = None
histograms = None
sibling = None
parent = None
# start and stop indices of the node in the splitter.partition
# array. Concretely,
# self.sample_indices = view(self.splitter.partition[start:stop])
# Please see the comments about splitter.partition and
# splitter.split_indices for more info about this design.
# These 2 attributes are only used in _update_raw_prediction, because we
# need to iterate over the leaves and I don't know how to efficiently
# store the sample_indices views because they're all of different sizes.
partition_start = 0
partition_stop = 0
def __init__(self, depth, sample_indices, sum_gradients,
sum_hessians, parent=None, value=None):
self.depth = depth
self.sample_indices = sample_indices
self.n_samples = sample_indices.shape[0]
self.sum_gradients = sum_gradients
self.sum_hessians = sum_hessians
self.parent = parent
self.value = value
self.is_leaf = False
self.set_children_bounds(float('-inf'), float('+inf'))
def set_children_bounds(self, lower, upper):
"""Set children values bounds to respect monotonic constraints."""
# These are bounds for the node's *children* values, not the node's
# value. The bounds are used in the splitter when considering potential
# left and right child.
self.children_lower_bound = lower
self.children_upper_bound = upper
def __lt__(self, other_node):
"""Comparison for priority queue.
Nodes with high gain are higher priority than nodes with low gain.
heapq.heappush only needs the '<' operator.
heapq.heappop takes the smallest item first (smaller is higher
priority).
Parameters
----------
other_node : TreeNode
The node to compare with.
"""
return self.split_info.gain > other_node.split_info.gain
class TreeGrower:
"""Tree grower class used to build a tree.
The tree is fitted to predict the values of a Newton-Raphson step. The
splits are considered in a best-first fashion, and the quality of a
split is defined in splitting._split_gain.
Parameters
----------
X_binned : ndarray of int, shape (n_samples, n_features)
The binned input samples. Must be Fortran-aligned.
gradients : ndarray, shape (n_samples,)
The gradients of each training sample. Those are the gradients of the
loss w.r.t the predictions, evaluated at iteration ``i - 1``.
hessians : ndarray, shape (n_samples,)
The hessians of each training sample. Those are the hessians of the
loss w.r.t the predictions, evaluated at iteration ``i - 1``.
max_leaf_nodes : int or None, optional (default=None)
The maximum number of leaves for each tree. If None, there is no
maximum limit.
max_depth : int or None, optional (default=None)
The maximum depth of each tree. The depth of a tree is the number of
edges to go from the root to the deepest leaf.
Depth isn't constrained by default.
min_samples_leaf : int, optional (default=20)
The minimum number of samples per leaf.
min_gain_to_split : float, optional (default=0.)
The minimum gain needed to split a node. Splits with lower gain will
be ignored.
n_bins : int, optional (default=256)
The total number of bins, including the bin for missing values. Used
to define the shape of the histograms.
n_bins_non_missing : ndarray of uint32, int or None, optional (default=None)
For each feature, gives the number of bins actually used for
non-missing values. For features with a lot of unique values, this
is equal to ``n_bins - 1``. If it's an int, all features are
considered to have the same number of bins. If None, all features
are considered to have ``n_bins - 1`` bins.
has_missing_values : ndarray of bool or bool, optional (default=False)
Whether each feature contains missing values (in the training data).
If it's a bool, the same value is used for all features.
l2_regularization : float, optional (default=0)
The L2 regularization parameter.
min_hessian_to_split : float, optional (default=1e-3)
The minimum sum of hessians needed in each node. Splits that result in
at least one child having a sum of hessians less than
``min_hessian_to_split`` are discarded.
shrinkage : float, optional (default=1)
The shrinkage parameter to apply to the leaves values, also known as
learning rate.
"""
def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None,
max_depth=None, min_samples_leaf=20, min_gain_to_split=0.,
n_bins=256, n_bins_non_missing=None, has_missing_values=False,
monotonic_cst=None, l2_regularization=0.,
min_hessian_to_split=1e-3, shrinkage=1.):
self._validate_parameters(X_binned, max_leaf_nodes, max_depth,
min_samples_leaf, min_gain_to_split,
l2_regularization, min_hessian_to_split)
if n_bins_non_missing is None:
n_bins_non_missing = n_bins - 1
if isinstance(n_bins_non_missing, numbers.Integral):
n_bins_non_missing = np.array(
[n_bins_non_missing] * X_binned.shape[1],
dtype=np.uint32)
else:
n_bins_non_missing = np.asarray(n_bins_non_missing,
dtype=np.uint32)
if isinstance(has_missing_values, bool):
has_missing_values = [has_missing_values] * X_binned.shape[1]
has_missing_values = np.asarray(has_missing_values, dtype=np.uint8)
if monotonic_cst is None:
self.with_monotonic_cst = False
monotonic_cst = np.full(shape=X_binned.shape[1],
fill_value=MonotonicConstraint.NO_CST,
dtype=np.int8)
else:
self.with_monotonic_cst = True
monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8)
if monotonic_cst.shape[0] != X_binned.shape[1]:
raise ValueError(
"monotonic_cst has shape {} but the input data "
"X has {} features.".format(
monotonic_cst.shape[0], X_binned.shape[1]
)
)
if np.any(monotonic_cst < -1) or np.any(monotonic_cst > 1):
raise ValueError(
"monotonic_cst must be None or an array-like of "
"-1, 0 or 1."
)
hessians_are_constant = hessians.shape[0] == 1
self.histogram_builder = HistogramBuilder(
X_binned, n_bins, gradients, hessians, hessians_are_constant)
missing_values_bin_idx = n_bins - 1
self.splitter = Splitter(
X_binned, n_bins_non_missing, missing_values_bin_idx,
has_missing_values, monotonic_cst,
l2_regularization, min_hessian_to_split,
min_samples_leaf, min_gain_to_split, hessians_are_constant)
self.n_bins_non_missing = n_bins_non_missing
self.max_leaf_nodes = max_leaf_nodes
self.has_missing_values = has_missing_values
self.monotonic_cst = monotonic_cst
self.l2_regularization = l2_regularization
self.n_features = X_binned.shape[1]
self.max_depth = max_depth
self.min_samples_leaf = min_samples_leaf
self.X_binned = X_binned
self.min_gain_to_split = min_gain_to_split
self.shrinkage = shrinkage
self.splittable_nodes = []
self.finalized_leaves = []
self.total_find_split_time = 0. # time spent finding the best splits
self.total_compute_hist_time = 0. # time spent computing histograms
self.total_apply_split_time = 0. # time spent splitting nodes
self._initialize_root(gradients, hessians, hessians_are_constant)
self.n_nodes = 1
def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth,
min_samples_leaf, min_gain_to_split,
l2_regularization, min_hessian_to_split):
"""Validate parameters passed to __init__.
Also validate parameters passed to splitter.
"""
if X_binned.dtype != np.uint8:
raise NotImplementedError(
"X_binned must be of type uint8.")
if not X_binned.flags.f_contiguous:
raise ValueError(
"X_binned should be passed as Fortran contiguous "
"array for maximum efficiency.")
if max_leaf_nodes is not None and max_leaf_nodes <= 1:
raise ValueError('max_leaf_nodes={} should not be'
' smaller than 2'.format(max_leaf_nodes))
if max_depth is not None and max_depth < 1:
raise ValueError('max_depth={} should not be'
' smaller than 1'.format(max_depth))
if min_samples_leaf < 1:
raise ValueError('min_samples_leaf={} should '
'not be smaller than 1'.format(min_samples_leaf))
if min_gain_to_split < 0:
raise ValueError('min_gain_to_split={} '
'must be positive.'.format(min_gain_to_split))
if l2_regularization < 0:
raise ValueError('l2_regularization={} must be '
'positive.'.format(l2_regularization))
if min_hessian_to_split < 0:
raise ValueError('min_hessian_to_split={} '
'must be positive.'.format(min_hessian_to_split))
def grow(self):
"""Grow the tree, from root to leaves."""
while self.splittable_nodes:
self.split_next()
self._apply_shrinkage()
def _apply_shrinkage(self):
"""Multiply leaves values by shrinkage parameter.
This must be done at the very end of the growing process. If this were
done during the growing process e.g. in finalize_leaf(), then a leaf
would be shrunk but its sibling would potentially not be (if it's a
non-leaf), which would lead to a wrong computation of the 'middle'
value needed to enforce the monotonic constraints.
"""
for leaf in self.finalized_leaves:
leaf.value *= self.shrinkage
def _initialize_root(self, gradients, hessians, hessians_are_constant):
"""Initialize root node and finalize it if needed."""
n_samples = self.X_binned.shape[0]
depth = 0
sum_gradients = sum_parallel(gradients)
if self.histogram_builder.hessians_are_constant:
sum_hessians = hessians[0] * n_samples
else:
sum_hessians = sum_parallel(hessians)
self.root = TreeNode(
depth=depth,
sample_indices=self.splitter.partition,
sum_gradients=sum_gradients,
sum_hessians=sum_hessians,
value=0
)
self.root.partition_start = 0
self.root.partition_stop = n_samples
if self.root.n_samples < 2 * self.min_samples_leaf:
# Do not even bother computing any splitting statistics.
self._finalize_leaf(self.root)
return
if sum_hessians < self.splitter.min_hessian_to_split:
self._finalize_leaf(self.root)
return
self.root.histograms = self.histogram_builder.compute_histograms_brute(
self.root.sample_indices)
self._compute_best_split_and_push(self.root)
def _compute_best_split_and_push(self, node):
"""Compute the best possible split (SplitInfo) of a given node.
Also push it in the heap of splittable nodes if gain isn't zero.
The gain of a node is 0 if either all the leaves are pure
(best gain = 0), or if no split would satisfy the constraints,
(min_hessians_to_split, min_gain_to_split, min_samples_leaf)
"""
node.split_info = self.splitter.find_node_split(
node.n_samples, node.histograms, node.sum_gradients,
node.sum_hessians, node.value, node.children_lower_bound,
node.children_upper_bound)
if node.split_info.gain <= 0: # no valid split
self._finalize_leaf(node)
else:
heappush(self.splittable_nodes, node)
def split_next(self):
"""Split the node with highest potential gain.
Returns
-------
left : TreeNode
The resulting left child.
right : TreeNode
The resulting right child.
"""
# Consider the node with the highest loss reduction (a.k.a. gain)
node = heappop(self.splittable_nodes)
tic = time()
(sample_indices_left,
sample_indices_right,
right_child_pos) = self.splitter.split_indices(node.split_info,
node.sample_indices)
self.total_apply_split_time += time() - tic
depth = node.depth + 1
n_leaf_nodes = len(self.finalized_leaves) + len(self.splittable_nodes)
n_leaf_nodes += 2
left_child_node = TreeNode(depth,
sample_indices_left,
node.split_info.sum_gradient_left,
node.split_info.sum_hessian_left,
parent=node,
value=node.split_info.value_left,
)
right_child_node = TreeNode(depth,
sample_indices_right,
node.split_info.sum_gradient_right,
node.split_info.sum_hessian_right,
parent=node,
value=node.split_info.value_right,
)
left_child_node.sibling = right_child_node
right_child_node.sibling = left_child_node
node.right_child = right_child_node
node.left_child = left_child_node
# set start and stop indices
left_child_node.partition_start = node.partition_start
left_child_node.partition_stop = node.partition_start + right_child_pos
right_child_node.partition_start = left_child_node.partition_stop
right_child_node.partition_stop = node.partition_stop
if not self.has_missing_values[node.split_info.feature_idx]:
# If no missing values are encountered at fit time, then samples
# with missing values during predict() will go to whichever child
# has the most samples.
node.split_info.missing_go_to_left = (
left_child_node.n_samples > right_child_node.n_samples)
self.n_nodes += 2
if (self.max_leaf_nodes is not None
and n_leaf_nodes == self.max_leaf_nodes):
self._finalize_leaf(left_child_node)
self._finalize_leaf(right_child_node)
self._finalize_splittable_nodes()
return left_child_node, right_child_node
if self.max_depth is not None and depth == self.max_depth:
self._finalize_leaf(left_child_node)
self._finalize_leaf(right_child_node)
return left_child_node, right_child_node
if left_child_node.n_samples < self.min_samples_leaf * 2:
self._finalize_leaf(left_child_node)
if right_child_node.n_samples < self.min_samples_leaf * 2:
self._finalize_leaf(right_child_node)
if self.with_monotonic_cst:
# Set value bounds for respecting monotonic constraints
# See test_nodes_values() for details
if (self.monotonic_cst[node.split_info.feature_idx] ==
MonotonicConstraint.NO_CST):
lower_left = lower_right = node.children_lower_bound
upper_left = upper_right = node.children_upper_bound
else:
mid = (left_child_node.value + right_child_node.value) / 2
if (self.monotonic_cst[node.split_info.feature_idx] ==
MonotonicConstraint.POS):
lower_left, upper_left = node.children_lower_bound, mid
lower_right, upper_right = mid, node.children_upper_bound
else: # NEG
lower_left, upper_left = mid, node.children_upper_bound
lower_right, upper_right = node.children_lower_bound, mid
left_child_node.set_children_bounds(lower_left, upper_left)
right_child_node.set_children_bounds(lower_right, upper_right)
# Compute histograms of children, and compute their best possible split
# (if needed)
should_split_left = not left_child_node.is_leaf
should_split_right = not right_child_node.is_leaf
if should_split_left or should_split_right:
# We will compute the histograms of both nodes even if one of them
# is a leaf, since computing the second histogram is very cheap
# (using histogram subtraction).
n_samples_left = left_child_node.sample_indices.shape[0]
n_samples_right = right_child_node.sample_indices.shape[0]
if n_samples_left < n_samples_right:
smallest_child = left_child_node
largest_child = right_child_node
else:
smallest_child = right_child_node
largest_child = left_child_node
# We use the brute O(n_samples) method on the child that has the
# smallest number of samples, and the subtraction trick O(n_bins)
# on the other one.
tic = time()
smallest_child.histograms = \
self.histogram_builder.compute_histograms_brute(
smallest_child.sample_indices)
largest_child.histograms = \
self.histogram_builder.compute_histograms_subtraction(
node.histograms, smallest_child.histograms)
self.total_compute_hist_time += time() - tic
tic = time()
if should_split_left:
self._compute_best_split_and_push(left_child_node)
if should_split_right:
self._compute_best_split_and_push(right_child_node)
self.total_find_split_time += time() - tic
return left_child_node, right_child_node
def _finalize_leaf(self, node):
"""Make node a leaf of the tree being grown."""
node.is_leaf = True
self.finalized_leaves.append(node)
def _finalize_splittable_nodes(self):
"""Transform all splittable nodes into leaves.
Used when some constraint is met e.g. maximum number of leaves or
maximum depth."""
while len(self.splittable_nodes) > 0:
node = self.splittable_nodes.pop()
self._finalize_leaf(node)
def make_predictor(self, bin_thresholds=None):
"""Make a TreePredictor object out of the current tree.
Parameters
----------
bin_thresholds : array-like of floats, optional (default=None)
The actual threshold values of each bin.
Returns
-------
A TreePredictor object.
"""
predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE)
_fill_predictor_node_array(predictor_nodes, self.root,
bin_thresholds, self.n_bins_non_missing)
return TreePredictor(predictor_nodes)
def _fill_predictor_node_array(predictor_nodes, grower_node,
bin_thresholds, n_bins_non_missing,
next_free_idx=0):
"""Helper used in make_predictor to set the TreePredictor fields."""
node = predictor_nodes[next_free_idx]
node['count'] = grower_node.n_samples
node['depth'] = grower_node.depth
if grower_node.split_info is not None:
node['gain'] = grower_node.split_info.gain
else:
node['gain'] = -1
node['value'] = grower_node.value
if grower_node.is_leaf:
# Leaf node
node['is_leaf'] = True
return next_free_idx + 1
else:
# Decision node
split_info = grower_node.split_info
feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx
node['feature_idx'] = feature_idx
node['bin_threshold'] = bin_idx
node['missing_go_to_left'] = split_info.missing_go_to_left
if split_info.bin_idx == n_bins_non_missing[feature_idx] - 1:
# Split is on the last non-missing bin: it's a "split on nans". All
# nans go to the right, the rest go to the left.
node['threshold'] = np.inf
elif bin_thresholds is not None:
node['threshold'] = bin_thresholds[feature_idx][bin_idx]
next_free_idx += 1
node['left'] = next_free_idx
next_free_idx = _fill_predictor_node_array(
predictor_nodes, grower_node.left_child,
bin_thresholds=bin_thresholds,
n_bins_non_missing=n_bins_non_missing,
next_free_idx=next_free_idx)
node['right'] = next_free_idx
return _fill_predictor_node_array(
predictor_nodes, grower_node.right_child,
bin_thresholds=bin_thresholds,
n_bins_non_missing=n_bins_non_missing,
next_free_idx=next_free_idx)
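
The ``split_next`` method above relies on the identity that a parent's histogram is the element-wise sum of its two children's histograms, which is why only the smaller child needs a brute O(n_samples) pass while the sibling's histogram is recovered by subtraction in O(n_bins). A minimal, self-contained numpy sketch of that identity (toy arrays only, not the actual HistogramBuilder):

import numpy as np

# 8 samples of one feature, already mapped to 4 bins, with their gradients.
binned_feature = np.array([0, 1, 1, 2, 3, 0, 2, 2], dtype=np.uint8)
gradients = np.array([0.5, -1.0, 0.2, 0.3, -0.7, 0.1, 0.4, -0.2])

def brute_histogram(sample_indices, n_bins=4):
    """Sum the gradients per bin with one pass over the given samples."""
    hist = np.zeros(n_bins)
    for i in sample_indices:
        hist[binned_feature[i]] += gradients[i]
    return hist

parent_indices = np.arange(8)
left_indices = np.array([0, 1, 5])         # smaller child: brute pass
right_indices = np.array([2, 3, 4, 6, 7])  # larger child: subtraction trick

hist_right = brute_histogram(parent_indices) - brute_histogram(left_indices)
assert np.allclose(hist_right, brute_histogram(right_indices))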

View file

@@ -0,0 +1,426 @@
"""
This module contains the loss classes.
Specific losses are used for regression, binary classification or multiclass
classification.
"""
# Author: Nicolas Hug
from abc import ABC, abstractmethod
import numpy as np
from scipy.special import expit, logsumexp, xlogy
from .common import Y_DTYPE
from .common import G_H_DTYPE
from ._loss import _update_gradients_least_squares
from ._loss import _update_gradients_hessians_least_squares
from ._loss import _update_gradients_least_absolute_deviation
from ._loss import _update_gradients_hessians_least_absolute_deviation
from ._loss import _update_gradients_hessians_binary_crossentropy
from ._loss import _update_gradients_hessians_categorical_crossentropy
from ._loss import _update_gradients_hessians_poisson
from ...utils.stats import _weighted_percentile
class BaseLoss(ABC):
"""Base class for a loss."""
def __init__(self, hessians_are_constant):
self.hessians_are_constant = hessians_are_constant
def __call__(self, y_true, raw_predictions, sample_weight):
"""Return the weighted average loss"""
return np.average(self.pointwise_loss(y_true, raw_predictions),
weights=sample_weight)
@abstractmethod
def pointwise_loss(self, y_true, raw_predictions):
"""Return loss value for each input"""
# This variable indicates whether the loss requires the leaves values to
# be updated once the tree has been trained. The trees are trained to
# predict a Newton-Raphson step (see grower._finalize_leaf()). But for
# some losses (e.g. least absolute deviation) we need to adjust the tree
# values to account for the "line search" of the gradient descent
# procedure. See the original paper Greedy Function Approximation: A
# Gradient Boosting Machine by Friedman
# (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory.
need_update_leaves_values = False
def init_gradients_and_hessians(self, n_samples, prediction_dim,
sample_weight):
"""Return initial gradients and hessians.
Unless hessians are constant, arrays are initialized with undefined
values.
Parameters
----------
n_samples : int
The number of samples passed to `fit()`.
prediction_dim : int
The dimension of a raw prediction, i.e. the number of trees
built at each iteration. Equals 1 for regression and binary
classification, or K where K is the number of classes for
multiclass classification.
sample_weight : array-like of shape (n_samples,), default=None
Weights of training data.
Returns
-------
gradients : ndarray, shape (prediction_dim, n_samples)
The initial gradients. The array is not initialized.
hessians : ndarray, shape (prediction_dim, n_samples)
If hessians are constant (e.g. for `LeastSquares` loss, the
array is initialized to ``1``. Otherwise, the array is allocated
without being initialized.
"""
shape = (prediction_dim, n_samples)
gradients = np.empty(shape=shape, dtype=G_H_DTYPE)
if self.hessians_are_constant:
# If the hessians are constant, we consider they are equal to 1.
# - This is correct for the half LS loss
# - For LAD loss, hessians are actually 0, but they are always
# ignored anyway.
hessians = np.ones(shape=(1, 1), dtype=G_H_DTYPE)
else:
hessians = np.empty(shape=shape, dtype=G_H_DTYPE)
return gradients, hessians
@abstractmethod
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
"""Return initial predictions (before the first iteration).
Parameters
----------
y_train : ndarray, shape (n_samples,)
The target training values.
sample_weight : array-like of shape (n_samples,), default=None
Weights of training data.
prediction_dim : int
The dimension of one prediction: 1 for binary classification and
regression, n_classes for multiclass classification.
Returns
-------
baseline_prediction : float or ndarray, shape (1, prediction_dim)
The baseline prediction.
"""
@abstractmethod
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
"""Update gradients and hessians arrays, inplace.
The gradients (resp. hessians) are the first (resp. second) order
derivatives of the loss for each sample with respect to the
predictions of model, evaluated at iteration ``i - 1``.
Parameters
----------
gradients : ndarray, shape (prediction_dim, n_samples)
The gradients (treated as OUT array).
hessians : ndarray, shape (prediction_dim, n_samples) or \
(1,)
The hessians (treated as OUT array).
y_true : ndarray, shape (n_samples,)
The true target values of each training sample.
raw_predictions : ndarray, shape (prediction_dim, n_samples)
The raw_predictions (i.e. values from the trees) of the tree
ensemble at iteration ``i - 1``.
sample_weight : array-like of shape (n_samples,), default=None
Weights of training data.
"""
class LeastSquares(BaseLoss):
"""Least squares loss, for regression.
For a given sample x_i, least squares loss is defined as::
loss(x_i) = 0.5 * (y_true_i - raw_pred_i)**2
This actually computes the half least squares loss to simplify
the computation of the gradients and get a unit hessian (and be consistent
with what is done in LightGBM).
"""
def __init__(self, sample_weight):
# If sample weights are provided, the hessians and gradients
# are multiplied by sample_weight, which means the hessians are
# equal to sample weights.
super().__init__(hessians_are_constant=sample_weight is None)
def pointwise_loss(self, y_true, raw_predictions):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
loss = 0.5 * np.power(y_true - raw_predictions, 2)
return loss
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
return np.average(y_train, weights=sample_weight)
@staticmethod
def inverse_link_function(raw_predictions):
return raw_predictions
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
gradients = gradients.reshape(-1)
if sample_weight is None:
_update_gradients_least_squares(gradients, y_true, raw_predictions)
else:
hessians = hessians.reshape(-1)
_update_gradients_hessians_least_squares(gradients, hessians,
y_true, raw_predictions,
sample_weight)
class LeastAbsoluteDeviation(BaseLoss):
"""Least absolute deviation, for regression.
For a given sample x_i, the loss is defined as::
loss(x_i) = |y_true_i - raw_pred_i|
"""
def __init__(self, sample_weight):
# If sample weights are provided, the hessians and gradients
# are multiplied by sample_weight, which means the hessians are
# equal to sample weights.
super().__init__(hessians_are_constant=sample_weight is None)
# This variable indicates whether the loss requires the leaves values to
# be updated once the tree has been trained. The trees are trained to
# predict a Newton-Raphson step (see grower._finalize_leaf()). But for
# some losses (e.g. least absolute deviation) we need to adjust the tree
# values to account for the "line search" of the gradient descent
# procedure. See the original paper Greedy Function Approximation: A
# Gradient Boosting Machine by Friedman
# (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory.
need_update_leaves_values = True
def pointwise_loss(self, y_true, raw_predictions):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
loss = np.abs(y_true - raw_predictions)
return loss
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
if sample_weight is None:
return np.median(y_train)
else:
return _weighted_percentile(y_train, sample_weight, 50)
@staticmethod
def inverse_link_function(raw_predictions):
return raw_predictions
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
gradients = gradients.reshape(-1)
if sample_weight is None:
_update_gradients_least_absolute_deviation(gradients, y_true,
raw_predictions)
else:
hessians = hessians.reshape(-1)
_update_gradients_hessians_least_absolute_deviation(
gradients, hessians, y_true, raw_predictions, sample_weight)
def update_leaves_values(self, grower, y_true, raw_predictions,
sample_weight):
# Update the values predicted by the tree with
# median(y_true - raw_predictions).
# See note about need_update_leaves_values in BaseLoss.
# TODO: ideally this should be computed in parallel over the leaves
# using something similar to _update_raw_predictions(), but this
# requires a cython version of median()
for leaf in grower.finalized_leaves:
indices = leaf.sample_indices
if sample_weight is None:
median_res = np.median(y_true[indices]
- raw_predictions[indices])
else:
median_res = _weighted_percentile(y_true[indices]
- raw_predictions[indices],
sample_weight=sample_weight,
percentile=50)
leaf.value = grower.shrinkage * median_res
# Note that the regularization is ignored here
class Poisson(BaseLoss):
"""Poisson deviance loss with log-link, for regression.
For a given sample x_i, Poisson deviance loss is defined as::
loss(x_i) = y_true_i * log(y_true_i/exp(raw_pred_i))
- y_true_i + exp(raw_pred_i)
This actually computes half the Poisson deviance to simplify
the computation of the gradients.
"""
def __init__(self, sample_weight):
super().__init__(hessians_are_constant=False)
inverse_link_function = staticmethod(np.exp)
def pointwise_loss(self, y_true, raw_predictions):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
# TODO: For speed, we could remove the constant xlogy(y_true, y_true)
# Advantage of this form: minimum of zero at raw_predictions = y_true.
loss = (xlogy(y_true, y_true) - y_true * (raw_predictions + 1)
+ np.exp(raw_predictions))
return loss
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
y_pred = np.average(y_train, weights=sample_weight)
eps = np.finfo(y_train.dtype).eps
y_pred = np.clip(y_pred, eps, None)
return np.log(y_pred)
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
gradients = gradients.reshape(-1)
hessians = hessians.reshape(-1)
_update_gradients_hessians_poisson(gradients, hessians,
y_true, raw_predictions,
sample_weight)
class BinaryCrossEntropy(BaseLoss):
"""Binary cross-entropy loss, for binary classification.
For a given sample x_i, the binary cross-entropy loss is defined as the
negative log-likelihood of the model which can be expressed as::
loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i
See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,
section 4.4.1 (about logistic regression).
"""
def __init__(self, sample_weight):
super().__init__(hessians_are_constant=False)
inverse_link_function = staticmethod(expit)
def pointwise_loss(self, y_true, raw_predictions):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
# logaddexp(0, x) = log(1 + exp(x))
loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions
return loss
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
if prediction_dim > 2:
raise ValueError(
"loss='binary_crossentropy' is not defined for multiclass"
" classification with n_classes=%d, use"
" loss='categorical_crossentropy' instead" % prediction_dim)
proba_positive_class = np.average(y_train, weights=sample_weight)
eps = np.finfo(y_train.dtype).eps
proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps)
# log(x / (1 - x)) is the inverse of the sigmoid (the logit), i.e. the link
# function of the Binomial model.
return np.log(proba_positive_class / (1 - proba_positive_class))
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
gradients = gradients.reshape(-1)
hessians = hessians.reshape(-1)
_update_gradients_hessians_binary_crossentropy(
gradients, hessians, y_true, raw_predictions, sample_weight)
def predict_proba(self, raw_predictions):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
proba = np.empty((raw_predictions.shape[0], 2), dtype=Y_DTYPE)
proba[:, 1] = expit(raw_predictions)
proba[:, 0] = 1 - proba[:, 1]
return proba
class CategoricalCrossEntropy(BaseLoss):
"""Categorical cross-entropy loss, for multiclass classification.
For a given sample x_i, the categorical cross-entropy loss is defined as
the negative log-likelihood of the model and generalizes the binary
cross-entropy to more than 2 classes.
"""
def __init__(self, sample_weight):
super().__init__(hessians_are_constant=False)
def pointwise_loss(self, y_true, raw_predictions):
one_hot_true = np.zeros_like(raw_predictions)
prediction_dim = raw_predictions.shape[0]
for k in range(prediction_dim):
one_hot_true[k, :] = (y_true == k)
loss = (logsumexp(raw_predictions, axis=0) -
(one_hot_true * raw_predictions).sum(axis=0))
return loss
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
init_value = np.zeros(shape=(prediction_dim, 1), dtype=Y_DTYPE)
eps = np.finfo(y_train.dtype).eps
for k in range(prediction_dim):
proba_kth_class = np.average(y_train == k,
weights=sample_weight)
proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps)
init_value[k, :] += np.log(proba_kth_class)
return init_value
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
_update_gradients_hessians_categorical_crossentropy(
gradients, hessians, y_true, raw_predictions, sample_weight)
def predict_proba(self, raw_predictions):
# TODO: This could be done in parallel
# compute softmax (using exp(log(softmax)))
proba = np.exp(raw_predictions -
logsumexp(raw_predictions, axis=0)[np.newaxis, :])
return proba.T
_LOSSES = {
'least_squares': LeastSquares,
'least_absolute_deviation': LeastAbsoluteDeviation,
'binary_crossentropy': BinaryCrossEntropy,
'categorical_crossentropy': CategoricalCrossEntropy,
'poisson': Poisson,
}
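
A short usage sketch of the loss API defined above, showing one gradient update for the half least-squares loss. The private import paths are the same ones used by the tests further below; this is illustrative only and not part of the module:

import numpy as np
from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE
from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES

y_train = np.array([0.0, 1.0, 2.0, 3.0], dtype=Y_DTYPE)
loss = _LOSSES['least_squares'](sample_weight=None)

# Baseline prediction (the model before the first tree): the target mean.
baseline = loss.get_baseline_prediction(y_train, None, prediction_dim=1)
raw_predictions = np.full((1, y_train.shape[0]), baseline, dtype=Y_DTYPE)

# Allocate the gradient/hessian buffers and fill them in-place.
gradients, hessians = loss.init_gradients_and_hessians(
    n_samples=y_train.shape[0], prediction_dim=1, sample_weight=None)
loss.update_gradients_and_hessians(gradients, hessians, y_train,
                                   raw_predictions, None)

# For the half least-squares loss the gradient is raw_pred - y_true.
assert np.allclose(gradients[0], raw_predictions[0] - y_train)
print(loss(y_train, raw_predictions, None))  # weighted average loss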

View file

@@ -0,0 +1,86 @@
"""
This module contains the TreePredictor class which is used for prediction.
"""
# Author: Nicolas Hug
import numpy as np
from .common import Y_DTYPE
from ._predictor import _predict_from_numeric_data
from ._predictor import _predict_from_binned_data
from ._predictor import _compute_partial_dependence
class TreePredictor:
"""Tree class used for predictions.
Parameters
----------
nodes : ndarray of PREDICTOR_RECORD_DTYPE
The nodes of the tree.
"""
def __init__(self, nodes):
self.nodes = nodes
def get_n_leaf_nodes(self):
"""Return number of leaves."""
return int(self.nodes['is_leaf'].sum())
def get_max_depth(self):
"""Return maximum depth among all leaves."""
return int(self.nodes['depth'].max())
def predict(self, X):
"""Predict raw values for non-binned data.
Parameters
----------
X : ndarray, shape (n_samples, n_features)
The input samples.
Returns
-------
y : ndarray, shape (n_samples,)
The raw predicted values.
"""
out = np.empty(X.shape[0], dtype=Y_DTYPE)
_predict_from_numeric_data(self.nodes, X, out)
return out
def predict_binned(self, X, missing_values_bin_idx):
"""Predict raw values for binned data.
Parameters
----------
X : ndarray, shape (n_samples, n_features)
The input samples.
missing_values_bin_idx : uint8
Index of the bin that is used for missing values. This is the
index of the last bin and is always equal to max_bins (as passed
to the GBDT classes), or equivalently to n_bins - 1.
Returns
-------
y : ndarray, shape (n_samples,)
The raw predicted values.
"""
out = np.empty(X.shape[0], dtype=Y_DTYPE)
_predict_from_binned_data(self.nodes, X, missing_values_bin_idx, out)
return out
def compute_partial_dependence(self, grid, target_features, out):
"""Fast partial dependence computation.
Parameters
----------
grid : ndarray, shape (n_samples, n_target_features)
The grid points on which the partial dependence should be
evaluated.
target_features : ndarray, shape (n_target_features)
The set of target features for which the partial dependence
should be evaluated.
out : ndarray, shape (n_samples)
The value of the partial dependence function on each grid
point.
"""
_compute_partial_dependence(self.nodes, grid, target_features, out)
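
As a concrete illustration, a hand-built stump can be fed to TreePredictor directly. The field names below are the ones filled in by _fill_predictor_node_array in the grower module above; the module path for TreePredictor and the assumption that PREDICTOR_RECORD_DTYPE is exposed by the common module are inferred from the other private imports in this commit, so treat this as a sketch rather than documented API:

import numpy as np
from sklearn.ensemble._hist_gradient_boosting.common import (
    PREDICTOR_RECORD_DTYPE, X_DTYPE)  # assumed to live in .common
from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor

# Root at index 0 splits feature 0 at threshold 0.5; its children are the
# leaves at indices 1 (left, value -1) and 2 (right, value +1).
nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE)
nodes[0]['feature_idx'] = 0
nodes[0]['threshold'] = 0.5
nodes[0]['left'], nodes[0]['right'] = 1, 2
nodes[1]['is_leaf'], nodes[1]['value'] = True, -1.0
nodes[2]['is_leaf'], nodes[2]['value'] = True, 1.0

predictor = TreePredictor(nodes)
X = np.array([[0.0], [1.0]], dtype=X_DTYPE)
print(predictor.predict(X))          # expected: [-1.  1.]
print(predictor.get_n_leaf_nodes())  # 2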

View file

@@ -0,0 +1,314 @@
import numpy as np
from numpy.testing import assert_array_equal, assert_allclose
import pytest
from sklearn.ensemble._hist_gradient_boosting.binning import (
_BinMapper,
_find_binning_thresholds as _find_binning_thresholds_orig,
_map_to_bins
)
from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import ALMOST_INF
DATA = np.random.RandomState(42).normal(
loc=[0, 10], scale=[1, 0.01], size=(int(1e6), 2)
).astype(X_DTYPE)
def _find_binning_thresholds(data, max_bins=255, subsample=int(2e5),
random_state=None):
# Just a redef to avoid having to pass arguments all the time (as the
# function is private we don't use default values for parameters)
return _find_binning_thresholds_orig(data, max_bins, subsample,
random_state)
def test_find_binning_thresholds_regular_data():
data = np.linspace(0, 10, 1001).reshape(-1, 1)
bin_thresholds = _find_binning_thresholds(data, max_bins=10)
assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9])
assert len(bin_thresholds) == 1
bin_thresholds = _find_binning_thresholds(data, max_bins=5)
assert_allclose(bin_thresholds[0], [2, 4, 6, 8])
assert len(bin_thresholds) == 1
def test_find_binning_thresholds_small_regular_data():
data = np.linspace(0, 10, 11).reshape(-1, 1)
bin_thresholds = _find_binning_thresholds(data, max_bins=5)
assert_allclose(bin_thresholds[0], [2, 4, 6, 8])
bin_thresholds = _find_binning_thresholds(data, max_bins=10)
assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9])
bin_thresholds = _find_binning_thresholds(data, max_bins=11)
assert_allclose(bin_thresholds[0], np.arange(10) + .5)
bin_thresholds = _find_binning_thresholds(data, max_bins=255)
assert_allclose(bin_thresholds[0], np.arange(10) + .5)
def test_find_binning_thresholds_random_data():
bin_thresholds = _find_binning_thresholds(DATA, max_bins=255,
random_state=0)
assert len(bin_thresholds) == 2
for i in range(len(bin_thresholds)):
assert bin_thresholds[i].shape == (254,) # 255 - 1
assert bin_thresholds[i].dtype == DATA.dtype
assert_allclose(bin_thresholds[0][[64, 128, 192]],
np.array([-0.7, 0.0, 0.7]), atol=1e-1)
assert_allclose(bin_thresholds[1][[64, 128, 192]],
np.array([9.99, 10.00, 10.01]), atol=1e-2)
def test_find_binning_thresholds_low_n_bins():
bin_thresholds = _find_binning_thresholds(DATA, max_bins=128,
random_state=0)
assert len(bin_thresholds) == 2
for i in range(len(bin_thresholds)):
assert bin_thresholds[i].shape == (127,) # 128 - 1
assert bin_thresholds[i].dtype == DATA.dtype
@pytest.mark.parametrize('n_bins', (2, 257))
def test_invalid_n_bins(n_bins):
err_msg = (
'n_bins={} should be no smaller than 3 and no larger than 256'
.format(n_bins))
with pytest.raises(ValueError, match=err_msg):
_BinMapper(n_bins=n_bins).fit(DATA)
def test_bin_mapper_n_features_transform():
mapper = _BinMapper(n_bins=42, random_state=42).fit(DATA)
err_msg = 'This estimator was fitted with 2 features but 4 got passed'
with pytest.raises(ValueError, match=err_msg):
mapper.transform(np.repeat(DATA, 2, axis=1))
@pytest.mark.parametrize('max_bins', [16, 128, 255])
def test_map_to_bins(max_bins):
bin_thresholds = _find_binning_thresholds(DATA, max_bins=max_bins,
random_state=0)
binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order='F')
last_bin_idx = max_bins
_map_to_bins(DATA, bin_thresholds, last_bin_idx, binned)
assert binned.shape == DATA.shape
assert binned.dtype == np.uint8
assert binned.flags.f_contiguous
min_indices = DATA.argmin(axis=0)
max_indices = DATA.argmax(axis=0)
for feature_idx, min_idx in enumerate(min_indices):
assert binned[min_idx, feature_idx] == 0
for feature_idx, max_idx in enumerate(max_indices):
assert binned[max_idx, feature_idx] == max_bins - 1
@pytest.mark.parametrize("max_bins", [5, 10, 42])
def test_bin_mapper_random_data(max_bins):
n_samples, n_features = DATA.shape
expected_count_per_bin = n_samples // max_bins
tol = int(0.05 * expected_count_per_bin)
# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
mapper = _BinMapper(n_bins=n_bins, random_state=42).fit(DATA)
binned = mapper.transform(DATA)
assert binned.shape == (n_samples, n_features)
assert binned.dtype == np.uint8
assert_array_equal(binned.min(axis=0), np.array([0, 0]))
assert_array_equal(binned.max(axis=0),
np.array([max_bins - 1, max_bins - 1]))
assert len(mapper.bin_thresholds_) == n_features
for bin_thresholds_feature in mapper.bin_thresholds_:
assert bin_thresholds_feature.shape == (max_bins - 1,)
assert bin_thresholds_feature.dtype == DATA.dtype
assert np.all(mapper.n_bins_non_missing_ == max_bins)
# Check that the binned data is approximately balanced across bins.
for feature_idx in range(n_features):
for bin_idx in range(max_bins):
count = (binned[:, feature_idx] == bin_idx).sum()
assert abs(count - expected_count_per_bin) < tol
@pytest.mark.parametrize("n_samples, max_bins", [
(5, 5),
(5, 10),
(5, 11),
(42, 255)
])
def test_bin_mapper_small_random_data(n_samples, max_bins):
data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1)
assert len(np.unique(data)) == n_samples
# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
mapper = _BinMapper(n_bins=n_bins, random_state=42)
binned = mapper.fit_transform(data)
assert binned.shape == data.shape
assert binned.dtype == np.uint8
assert_array_equal(binned.ravel()[np.argsort(data.ravel())],
np.arange(n_samples))
@pytest.mark.parametrize("max_bins, n_distinct, multiplier", [
(5, 5, 1),
(5, 5, 3),
(255, 12, 42),
])
def test_bin_mapper_identity_repeated_values(max_bins, n_distinct, multiplier):
data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1)
# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
binned = _BinMapper(n_bins=n_bins).fit_transform(data)
assert_array_equal(data, binned)
@pytest.mark.parametrize('n_distinct', [2, 7, 42])
def test_bin_mapper_repeated_values_invariance(n_distinct):
rng = np.random.RandomState(42)
distinct_values = rng.normal(size=n_distinct)
assert len(np.unique(distinct_values)) == n_distinct
repeated_indices = rng.randint(low=0, high=n_distinct, size=1000)
data = distinct_values[repeated_indices]
rng.shuffle(data)
assert_array_equal(np.unique(data), np.sort(distinct_values))
data = data.reshape(-1, 1)
mapper_1 = _BinMapper(n_bins=n_distinct + 1)
binned_1 = mapper_1.fit_transform(data)
assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct))
# Adding more bins to the mapper yields the same results (same thresholds)
mapper_2 = _BinMapper(n_bins=min(256, n_distinct * 3) + 1)
binned_2 = mapper_2.fit_transform(data)
assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0])
assert_array_equal(binned_1, binned_2)
@pytest.mark.parametrize("max_bins, scale, offset", [
(3, 2, -1),
(42, 1, 0),
(255, 0.3, 42),
])
def test_bin_mapper_identity_small(max_bins, scale, offset):
data = np.arange(max_bins).reshape(-1, 1) * scale + offset
# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
binned = _BinMapper(n_bins=n_bins).fit_transform(data)
assert_array_equal(binned, np.arange(max_bins).reshape(-1, 1))
@pytest.mark.parametrize('max_bins_small, max_bins_large', [
(2, 2),
(3, 3),
(4, 4),
(42, 42),
(255, 255),
(5, 17),
(42, 255),
])
def test_bin_mapper_idempotence(max_bins_small, max_bins_large):
assert max_bins_large >= max_bins_small
data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1)
mapper_small = _BinMapper(n_bins=max_bins_small + 1)
mapper_large = _BinMapper(n_bins=max_bins_large + 1)
binned_small = mapper_small.fit_transform(data)
binned_large = mapper_large.fit_transform(binned_small)
assert_array_equal(binned_small, binned_large)
@pytest.mark.parametrize('n_bins', [10, 100, 256])
@pytest.mark.parametrize('diff', [-5, 0, 5])
def test_n_bins_non_missing(n_bins, diff):
# Check that n_bins_non_missing is n_unique_values when
# there are not a lot of unique values, else n_bins - 1.
n_unique_values = n_bins + diff
X = list(range(n_unique_values)) * 2
X = np.array(X).reshape(-1, 1)
mapper = _BinMapper(n_bins=n_bins).fit(X)
assert np.all(mapper.n_bins_non_missing_ == min(
n_bins - 1, n_unique_values))
def test_subsample():
# Make sure bin thresholds are different when applying subsampling
mapper_no_subsample = _BinMapper(subsample=None, random_state=0).fit(DATA)
mapper_subsample = _BinMapper(subsample=256, random_state=0).fit(DATA)
for feature in range(DATA.shape[1]):
assert not np.allclose(mapper_no_subsample.bin_thresholds_[feature],
mapper_subsample.bin_thresholds_[feature],
rtol=1e-4)
@pytest.mark.parametrize(
'n_bins, n_bins_non_missing, X_trans_expected', [
(256, [4, 2, 2], [[0, 0, 0], # 255 <=> missing value
[255, 255, 0],
[1, 0, 0],
[255, 1, 1],
[2, 1, 1],
[3, 0, 0]]),
(3, [2, 2, 2], [[0, 0, 0], # 2 <=> missing value
[2, 2, 0],
[0, 0, 0],
[2, 1, 1],
[1, 1, 1],
[1, 0, 0]])])
def test_missing_values_support(n_bins, n_bins_non_missing, X_trans_expected):
# check for missing values: make sure nans are mapped to the last bin
# and that the _BinMapper attributes are correct
X = [[1, 1, 0],
[np.nan, np.nan, 0],
[2, 1, 0],
[np.nan, 2, 1],
[3, 2, 1],
[4, 1, 0]]
X = np.array(X)
mapper = _BinMapper(n_bins=n_bins)
mapper.fit(X)
assert_array_equal(mapper.n_bins_non_missing_, n_bins_non_missing)
for feature_idx in range(X.shape[1]):
assert len(mapper.bin_thresholds_[feature_idx]) == \
n_bins_non_missing[feature_idx] - 1
assert mapper.missing_values_bin_idx_ == n_bins - 1
X_trans = mapper.transform(X)
assert_array_equal(X_trans, X_trans_expected)
def test_infinite_values():
# Make sure infinite values are properly handled.
bin_mapper = _BinMapper()
X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1)
bin_mapper.fit(X)
assert_allclose(bin_mapper.bin_thresholds_[0], [-np.inf, .5, ALMOST_INF])
assert bin_mapper.n_bins_non_missing_ == [4]
expected_binned_X = np.array([0, 1, 2, 3]).reshape(-1, 1)
assert_array_equal(bin_mapper.transform(X), expected_binned_X)
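
To summarise the convention exercised by these tests: n_bins always includes one bin reserved for missing values, so with n_bins=5 at most 4 bins hold non-missing values and NaNs are mapped to bin n_bins - 1. A small sketch using the same private _BinMapper imported above (illustrative only):

import numpy as np
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper

X = np.array([[1.0], [2.0], [np.nan], [3.0], [4.0]])
mapper = _BinMapper(n_bins=5).fit(X)

print(mapper.n_bins_non_missing_)      # [4]
print(mapper.missing_values_bin_idx_)  # 4, i.e. n_bins - 1
print(mapper.transform(X).ravel())     # [0 1 4 2 3]: nan goes to the last bin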

View file

@@ -0,0 +1,223 @@
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification, make_regression
import numpy as np
import pytest
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.utils import (
get_equivalent_estimator)
@pytest.mark.parametrize('seed', range(5))
@pytest.mark.parametrize('min_samples_leaf', (1, 20))
@pytest.mark.parametrize('n_samples, max_leaf_nodes', [
(255, 4096),
(1000, 8),
])
def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
max_leaf_nodes):
# Make sure sklearn has the same predictions as lightgbm for easy targets.
#
# In particular when the size of the trees are bound and the number of
# samples is large enough, the structure of the prediction trees found by
# LightGBM and sklearn should be exactly identical.
#
# Notes:
# - Several candidate splits may have equal gains when the number of
# samples in a node is low (and because of float errors). Therefore the
# predictions on the test set might differ if the structure of the tree
# is not exactly the same. To avoid this issue we only compare the
# predictions on the test set when the number of samples is large enough
# and max_leaf_nodes is low enough.
# - To ignore discrepancies caused by small differences in the binning
#   strategy, data is pre-binned if n_samples > 255.
# - We don't check the least_absolute_deviation loss here. This is because
# LightGBM's computation of the median (used for the initial value of
# raw_prediction) is a bit off (they'll e.g. return midpoints when there
# is no need to). Since these tests only run 1 iteration, the
# discrepancy between the initial values leads to biggish differences in
# the predictions. These differences are much smaller with more
# iterations.
pytest.importorskip("lightgbm")
rng = np.random.RandomState(seed=seed)
n_samples = n_samples
max_iter = 1
max_bins = 255
X, y = make_regression(n_samples=n_samples, n_features=5,
n_informative=5, random_state=0)
if n_samples > 255:
# bin data and convert it to float32 so that the estimator doesn't
# treat it as pre-binned
X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
est_sklearn = HistGradientBoostingRegressor(
max_iter=max_iter,
max_bins=max_bins,
learning_rate=1,
early_stopping=False,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes)
est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')
est_lightgbm.fit(X_train, y_train)
est_sklearn.fit(X_train, y_train)
# We need X to be treated as numerical data, not pre-binned data.
X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)
pred_lightgbm = est_lightgbm.predict(X_train)
pred_sklearn = est_sklearn.predict(X_train)
# less than 1% of the predictions are different up to the 3rd decimal
assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-3) < .011
if max_leaf_nodes < 10 and n_samples >= 1000:
pred_lightgbm = est_lightgbm.predict(X_test)
pred_sklearn = est_sklearn.predict(X_test)
# less than 1% of the predictions are different up to the 4th decimal
assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-4) < .01
@pytest.mark.parametrize('seed', range(5))
@pytest.mark.parametrize('min_samples_leaf', (1, 20))
@pytest.mark.parametrize('n_samples, max_leaf_nodes', [
(255, 4096),
(1000, 8),
])
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
max_leaf_nodes):
# Same as test_same_predictions_regression but for classification
pytest.importorskip("lightgbm")
rng = np.random.RandomState(seed=seed)
n_samples = n_samples
max_iter = 1
max_bins = 255
X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
n_informative=5, n_redundant=0, random_state=0)
if n_samples > 255:
# bin data and convert it to float32 so that the estimator doesn't
# treat it as pre-binned
X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
est_sklearn = HistGradientBoostingClassifier(
loss='binary_crossentropy',
max_iter=max_iter,
max_bins=max_bins,
learning_rate=1,
early_stopping=False,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes)
est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')
est_lightgbm.fit(X_train, y_train)
est_sklearn.fit(X_train, y_train)
# We need X to be treated as numerical data, not pre-binned data.
X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)
pred_lightgbm = est_lightgbm.predict(X_train)
pred_sklearn = est_sklearn.predict(X_train)
assert np.mean(pred_sklearn == pred_lightgbm) > .89
acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
acc_sklearn = accuracy_score(y_train, pred_sklearn)
np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)
if max_leaf_nodes < 10 and n_samples >= 1000:
pred_lightgbm = est_lightgbm.predict(X_test)
pred_sklearn = est_sklearn.predict(X_test)
assert np.mean(pred_sklearn == pred_lightgbm) > .89
acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
acc_sklearn = accuracy_score(y_test, pred_sklearn)
np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
@pytest.mark.parametrize('seed', range(5))
@pytest.mark.parametrize('min_samples_leaf', (1, 20))
@pytest.mark.parametrize('n_samples, max_leaf_nodes', [
(255, 4096),
(10000, 8),
])
def test_same_predictions_multiclass_classification(
seed, min_samples_leaf, n_samples, max_leaf_nodes):
# Same as test_same_predictions_regression but for multiclass classification
pytest.importorskip("lightgbm")
rng = np.random.RandomState(seed=seed)
n_samples = n_samples
max_iter = 1
max_bins = 255
lr = 1
X, y = make_classification(n_samples=n_samples, n_classes=3, n_features=5,
n_informative=5, n_redundant=0,
n_clusters_per_class=1, random_state=0)
if n_samples > 255:
# bin data and convert it to float32 so that the estimator doesn't
# treat it as pre-binned
X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
est_sklearn = HistGradientBoostingClassifier(
loss='categorical_crossentropy',
max_iter=max_iter,
max_bins=max_bins,
learning_rate=lr,
early_stopping=False,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes)
est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')
est_lightgbm.fit(X_train, y_train)
est_sklearn.fit(X_train, y_train)
# We need X to be treated as numerical data, not pre-binned data.
X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)
pred_lightgbm = est_lightgbm.predict(X_train)
pred_sklearn = est_sklearn.predict(X_train)
assert np.mean(pred_sklearn == pred_lightgbm) > .89
proba_lightgbm = est_lightgbm.predict_proba(X_train)
proba_sklearn = est_sklearn.predict_proba(X_train)
# assert more than 75% of the predicted probabilities are the same up to
# the second decimal
assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75
acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
acc_sklearn = accuracy_score(y_train, pred_sklearn)
np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
if max_leaf_nodes < 10 and n_samples >= 1000:
pred_lightgbm = est_lightgbm.predict(X_test)
pred_sklearn = est_sklearn.predict(X_test)
assert np.mean(pred_sklearn == pred_lightgbm) > .89
proba_lightgbm = est_lightgbm.predict_proba(X_train)
proba_sklearn = est_sklearn.predict_proba(X_train)
# assert more than 75% of the predicted probabilities are the same up
# to the second decimal
assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75
acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
acc_sklearn = accuracy_score(y_test, pred_sklearn)
np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
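
For reference, a minimal fit/predict sketch of the estimators exercised above and in the test file below, using only parameters that appear in these tests (explicit experimental import, built-in early stopping on a small validation split). This is illustrative usage, not part of the test suite:

import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=500, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

gb = HistGradientBoostingRegressor(max_iter=100,
                                   early_stopping=True,
                                   validation_fraction=0.1,
                                   n_iter_no_change=5,
                                   random_state=0)
gb.fit(X_train, y_train)
# n_iter_ may be smaller than max_iter if early stopping kicked in.
print(gb.n_iter_, gb.score(X_test, y_test))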

View file

@@ -0,0 +1,746 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from sklearn.datasets import make_classification, make_regression
from sklearn.datasets import make_low_rank_matrix
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.base import clone, BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_poisson_deviance
from sklearn.dummy import DummyRegressor
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES
from sklearn.ensemble._hist_gradient_boosting.loss import LeastSquares
from sklearn.ensemble._hist_gradient_boosting.loss import BinaryCrossEntropy
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.utils import shuffle
X_classification, y_classification = make_classification(random_state=0)
X_regression, y_regression = make_regression(random_state=0)
def _make_dumb_dataset(n_samples):
"""Make a dumb dataset to test early stopping."""
rng = np.random.RandomState(42)
X_dumb = rng.randn(n_samples, 1)
y_dumb = (X_dumb[:, 0] > 0).astype('int64')
return X_dumb, y_dumb
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
@pytest.mark.parametrize(
'params, err_msg',
[({'loss': 'blah'}, 'Loss blah is not supported for'),
({'learning_rate': 0}, 'learning_rate=0 must be strictly positive'),
({'learning_rate': -1}, 'learning_rate=-1 must be strictly positive'),
({'max_iter': 0}, 'max_iter=0 must not be smaller than 1'),
({'max_leaf_nodes': 0}, 'max_leaf_nodes=0 should not be smaller than 2'),
({'max_leaf_nodes': 1}, 'max_leaf_nodes=1 should not be smaller than 2'),
({'max_depth': 0}, 'max_depth=0 should not be smaller than 1'),
({'min_samples_leaf': 0}, 'min_samples_leaf=0 should not be smaller'),
({'l2_regularization': -1}, 'l2_regularization=-1 must be positive'),
({'max_bins': 1}, 'max_bins=1 should be no smaller than 2 and no larger'),
({'max_bins': 256}, 'max_bins=256 should be no smaller than 2 and no'),
({'n_iter_no_change': -1}, 'n_iter_no_change=-1 must be positive'),
({'validation_fraction': -1}, 'validation_fraction=-1 must be strictly'),
({'validation_fraction': 0}, 'validation_fraction=0 must be strictly'),
({'tol': -1}, 'tol=-1 must not be smaller than 0')]
)
def test_init_parameters_validation(GradientBoosting, X, y, params, err_msg):
with pytest.raises(ValueError, match=err_msg):
GradientBoosting(**params).fit(X, y)
def test_invalid_classification_loss():
binary_clf = HistGradientBoostingClassifier(loss="binary_crossentropy")
err_msg = ("loss='binary_crossentropy' is not defined for multiclass "
"classification with n_classes=3, use "
"loss='categorical_crossentropy' instead")
with pytest.raises(ValueError, match=err_msg):
binary_clf.fit(np.zeros(shape=(3, 2)), np.arange(3))
@pytest.mark.parametrize(
'scoring, validation_fraction, early_stopping, n_iter_no_change, tol', [
('neg_mean_squared_error', .1, True, 5, 1e-7), # use scorer
('neg_mean_squared_error', None, True, 5, 1e-1), # use scorer on train
(None, .1, True, 5, 1e-7), # same with default scorer
(None, None, True, 5, 1e-1),
('loss', .1, True, 5, 1e-7), # use loss
('loss', None, True, 5, 1e-1), # use loss on training data
(None, None, False, 5, None), # no early stopping
])
def test_early_stopping_regression(scoring, validation_fraction,
early_stopping, n_iter_no_change, tol):
max_iter = 200
X, y = make_regression(n_samples=50, random_state=0)
gb = HistGradientBoostingRegressor(
verbose=1, # just for coverage
min_samples_leaf=5, # easier to overfit fast
scoring=scoring,
tol=tol,
early_stopping=early_stopping,
validation_fraction=validation_fraction,
max_iter=max_iter,
n_iter_no_change=n_iter_no_change,
random_state=0
)
gb.fit(X, y)
if early_stopping:
assert n_iter_no_change <= gb.n_iter_ < max_iter
else:
assert gb.n_iter_ == max_iter
@pytest.mark.parametrize('data', (
make_classification(n_samples=30, random_state=0),
make_classification(n_samples=30, n_classes=3, n_clusters_per_class=1,
random_state=0)
))
@pytest.mark.parametrize(
'scoring, validation_fraction, early_stopping, n_iter_no_change, tol', [
('accuracy', .1, True, 5, 1e-7), # use scorer
('accuracy', None, True, 5, 1e-1), # use scorer on training data
(None, .1, True, 5, 1e-7), # same with default scorer
(None, None, True, 5, 1e-1),
('loss', .1, True, 5, 1e-7), # use loss
('loss', None, True, 5, 1e-1), # use loss on training data
(None, None, False, 5, None), # no early stopping
])
def test_early_stopping_classification(data, scoring, validation_fraction,
early_stopping, n_iter_no_change, tol):
max_iter = 50
X, y = data
gb = HistGradientBoostingClassifier(
verbose=1, # just for coverage
min_samples_leaf=5, # easier to overfit fast
scoring=scoring,
tol=tol,
early_stopping=early_stopping,
validation_fraction=validation_fraction,
max_iter=max_iter,
n_iter_no_change=n_iter_no_change,
random_state=0
)
gb.fit(X, y)
if early_stopping is True:
assert n_iter_no_change <= gb.n_iter_ < max_iter
else:
assert gb.n_iter_ == max_iter
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, *_make_dumb_dataset(10000)),
(HistGradientBoostingClassifier, *_make_dumb_dataset(10001)),
(HistGradientBoostingRegressor, *_make_dumb_dataset(10000)),
(HistGradientBoostingRegressor, *_make_dumb_dataset(10001))
])
def test_early_stopping_default(GradientBoosting, X, y):
# Test that early stopping is enabled by default if and only if there
# are more than 10000 samples
gb = GradientBoosting(max_iter=10, n_iter_no_change=2, tol=1e-1)
gb.fit(X, y)
if X.shape[0] > 10000:
assert gb.n_iter_ < gb.max_iter
else:
assert gb.n_iter_ == gb.max_iter
@pytest.mark.parametrize(
'scores, n_iter_no_change, tol, stopping',
[
([], 1, 0.001, False), # not enough iterations
([1, 1, 1], 5, 0.001, False), # not enough iterations
([1, 1, 1, 1, 1], 5, 0.001, False), # not enough iterations
([1, 2, 3, 4, 5, 6], 5, 0.001, False), # significant improvement
([1, 2, 3, 4, 5, 6], 5, 0., False), # significant improvement
([1, 2, 3, 4, 5, 6], 5, 0.999, False), # significant improvement
([1, 2, 3, 4, 5, 6], 5, 5 - 1e-5, False), # significant improvement
([1] * 6, 5, 0., True), # no significant improvement
([1] * 6, 5, 0.001, True), # no significant improvement
([1] * 6, 5, 5, True), # no significant improvement
]
)
def test_should_stop(scores, n_iter_no_change, tol, stopping):
gbdt = HistGradientBoostingClassifier(
n_iter_no_change=n_iter_no_change, tol=tol
)
assert gbdt._should_stop(scores) == stopping
def test_least_absolute_deviation():
# For coverage only.
X, y = make_regression(n_samples=500, random_state=0)
gbdt = HistGradientBoostingRegressor(loss='least_absolute_deviation',
random_state=0)
gbdt.fit(X, y)
assert gbdt.score(X, y) > .9
@pytest.mark.parametrize('y', [([1., -2., 0.]), ([0., 0., 0.])])
def test_poisson_y_positive(y):
# Test that ValueError is raised if either one y_i < 0 or sum(y_i) <= 0.
err_msg = r"loss='poisson' requires non-negative y and sum\(y\) > 0."
gbdt = HistGradientBoostingRegressor(loss='poisson', random_state=0)
with pytest.raises(ValueError, match=err_msg):
gbdt.fit(np.zeros(shape=(len(y), 1)), y)
def test_poisson():
# For Poisson distributed target, Poisson loss should give better results
# than least squares measured in Poisson deviance as metric.
rng = np.random.RandomState(42)
n_train, n_test, n_features = 500, 100, 100
X = make_low_rank_matrix(n_samples=n_train+n_test, n_features=n_features,
random_state=rng)
# We create a log-linear Poisson model and downscale coef as it will get
# exponentiated.
coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0)
y = rng.poisson(lam=np.exp(X @ coef))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test,
random_state=rng)
gbdt_pois = HistGradientBoostingRegressor(loss='poisson', random_state=rng)
gbdt_ls = HistGradientBoostingRegressor(loss='least_squares',
random_state=rng)
gbdt_pois.fit(X_train, y_train)
gbdt_ls.fit(X_train, y_train)
dummy = DummyRegressor(strategy="mean").fit(X_train, y_train)
for X, y in [(X_train, y_train), (X_test, y_test)]:
metric_pois = mean_poisson_deviance(y, gbdt_pois.predict(X))
# least_squares might produce non-positive predictions => clip
metric_ls = mean_poisson_deviance(y, np.clip(gbdt_ls.predict(X), 1e-15,
None))
metric_dummy = mean_poisson_deviance(y, dummy.predict(X))
assert metric_pois < metric_ls
assert metric_pois < metric_dummy
def test_binning_train_validation_are_separated():
# Make sure training and validation data are binned separately.
# See issue 13926
rng = np.random.RandomState(0)
validation_fraction = .2
gb = HistGradientBoostingClassifier(
early_stopping=True,
validation_fraction=validation_fraction,
random_state=rng
)
gb.fit(X_classification, y_classification)
mapper_training_data = gb.bin_mapper_
# Note that since the data is small there is no subsampling and the
# random_state doesn't matter
mapper_whole_data = _BinMapper(random_state=0)
mapper_whole_data.fit(X_classification)
n_samples = X_classification.shape[0]
assert np.all(mapper_training_data.n_bins_non_missing_ ==
int((1 - validation_fraction) * n_samples))
assert np.all(mapper_training_data.n_bins_non_missing_ !=
mapper_whole_data.n_bins_non_missing_)
def test_missing_values_trivial():
# sanity check for missing values support. With only one feature and
# y == isnan(X), the gbdt is supposed to reach perfect accuracy on the
# training set.
n_samples = 100
n_features = 1
rng = np.random.RandomState(0)
X = rng.normal(size=(n_samples, n_features))
mask = rng.binomial(1, .5, size=X.shape).astype(bool)
X[mask] = np.nan
y = mask.ravel()
gb = HistGradientBoostingClassifier()
gb.fit(X, y)
assert gb.score(X, y) == pytest.approx(1)
@pytest.mark.parametrize('problem', ('classification', 'regression'))
@pytest.mark.parametrize(
'missing_proportion, expected_min_score_classification, '
'expected_min_score_regression', [
(.1, .97, .89),
(.2, .93, .81),
(.5, .79, .52)])
def test_missing_values_resilience(problem, missing_proportion,
expected_min_score_classification,
expected_min_score_regression):
# Make sure the estimators can deal with missing values and still yield
# decent predictions
rng = np.random.RandomState(0)
n_samples = 1000
n_features = 2
if problem == 'regression':
X, y = make_regression(n_samples=n_samples, n_features=n_features,
n_informative=n_features, random_state=rng)
gb = HistGradientBoostingRegressor()
expected_min_score = expected_min_score_regression
else:
X, y = make_classification(n_samples=n_samples, n_features=n_features,
n_informative=n_features, n_redundant=0,
n_repeated=0, random_state=rng)
gb = HistGradientBoostingClassifier()
expected_min_score = expected_min_score_classification
mask = rng.binomial(1, missing_proportion, size=X.shape).astype(bool)
X[mask] = np.nan
gb.fit(X, y)
assert gb.score(X, y) > expected_min_score
@pytest.mark.parametrize('data', [
make_classification(random_state=0, n_classes=2),
make_classification(random_state=0, n_classes=3, n_informative=3)
], ids=['binary_crossentropy', 'categorical_crossentropy'])
def test_zero_division_hessians(data):
# non regression test for issue #14018
# make sure we avoid zero division errors when computing the leaves values.
# If the learning rate is too high, the raw predictions are bad and will
# saturate the softmax (or sigmoid in binary classif). This leads to
# probabilities being exactly 0 or 1, gradients being constant, and
# hessians being zero.
X, y = data
gb = HistGradientBoostingClassifier(learning_rate=100, max_iter=10)
gb.fit(X, y)
def test_small_trainset():
# Make sure that the small trainset is stratified and has the expected
# length (10k samples)
n_samples = 20000
original_distrib = {0: 0.1, 1: 0.2, 2: 0.3, 3: 0.4}
rng = np.random.RandomState(42)
X = rng.randn(n_samples).reshape(n_samples, 1)
y = [[class_] * int(prop * n_samples) for (class_, prop)
in original_distrib.items()]
y = shuffle(np.concatenate(y))
gb = HistGradientBoostingClassifier()
# Compute the small training set
X_small, y_small, _ = gb._get_small_trainset(X, y, seed=42,
sample_weight_train=None)
# Compute the class distribution in the small training set
unique, counts = np.unique(y_small, return_counts=True)
small_distrib = {class_: count / 10000 for (class_, count)
in zip(unique, counts)}
# Test that the small training set has the expected length
assert X_small.shape[0] == 10000
assert y_small.shape[0] == 10000
# Test that the class distributions in the whole dataset and in the small
# training set are identical
assert small_distrib == pytest.approx(original_distrib)
def test_missing_values_minmax_imputation():
# Compare the built-in missing value handling of Histogram GBC with an
# a-priori missing value imputation strategy that should yield the same
# results in terms of decision function.
#
# Each feature (containing NaNs) is replaced by 2 features:
# - one where the nans are replaced by min(feature) - 1
# - one where the nans are replaced by max(feature) + 1
# A split where nans go to the left has an equivalent split in the
# first (min) feature, and a split where nans go to the right has an
# equivalent split in the second (max) feature.
#
# Assuming the data is such that there is never a tie to select the best
# feature to split on during training, the learned decision trees should be
# strictly equivalent (learn a sequence of splits that encode the same
# decision function).
#
# The MinMaxImputer transformer is meant to be a toy implementation of the
# "Missing In Attributes" (MIA) missing value handling for decision trees
# https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305
# The implementation of MIA as an imputation transformer was suggested by
# "Remark 3" in https://arxiv.org/abs/1902.06931
class MinMaxImputer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
mm = MinMaxScaler().fit(X)
self.data_min_ = mm.data_min_
self.data_max_ = mm.data_max_
return self
def transform(self, X):
X_min, X_max = X.copy(), X.copy()
for feature_idx in range(X.shape[1]):
nan_mask = np.isnan(X[:, feature_idx])
X_min[nan_mask, feature_idx] = self.data_min_[feature_idx] - 1
X_max[nan_mask, feature_idx] = self.data_max_[feature_idx] + 1
return np.concatenate([X_min, X_max], axis=1)
def make_missing_value_data(n_samples=int(1e4), seed=0):
rng = np.random.RandomState(seed)
X, y = make_regression(n_samples=n_samples, n_features=4,
random_state=rng)
# Pre-bin the data to ensure a deterministic handling by the 2
# strategies and also make it easier to insert np.nan in a structured
# way:
X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X)
# First feature has missing values completely at random:
rnd_mask = rng.rand(X.shape[0]) > 0.9
X[rnd_mask, 0] = np.nan
# Second and third features have missing values for extreme values
# (censoring missingness):
low_mask = X[:, 1] == 0
X[low_mask, 1] = np.nan
high_mask = X[:, 2] == X[:, 2].max()
X[high_mask, 2] = np.nan
# Make the last feature nan pattern very informative:
y_max = np.percentile(y, 70)
y_max_mask = y >= y_max
y[y_max_mask] = y_max
X[y_max_mask, 3] = np.nan
# Check that there is at least one missing value in each feature:
for feature_idx in range(X.shape[1]):
assert any(np.isnan(X[:, feature_idx]))
# Let's use a test set to check that the learned decision function is
# the same as evaluated on unseen data. Otherwise it could just be the
# case that we find two independent ways to overfit the training set.
return train_test_split(X, y, random_state=rng)
# n_samples needs to be large enough to minimize the likelihood of having
# several candidate splits with the same gain value in a given tree.
X_train, X_test, y_train, y_test = make_missing_value_data(
n_samples=int(1e4), seed=0)
# Use a small number of leaf nodes and iterations so as to keep the models
# under-fitting, which minimizes the likelihood of ties when training them.
gbm1 = HistGradientBoostingRegressor(max_iter=100,
max_leaf_nodes=5,
random_state=0)
gbm1.fit(X_train, y_train)
gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1))
gbm2.fit(X_train, y_train)
# Check that the models reach the same score:
assert gbm1.score(X_train, y_train) == \
pytest.approx(gbm2.score(X_train, y_train))
assert gbm1.score(X_test, y_test) == \
pytest.approx(gbm2.score(X_test, y_test))
# Check the individual prediction match as a finer grained
# decision function check.
assert_allclose(gbm1.predict(X_train), gbm2.predict(X_train))
assert_allclose(gbm1.predict(X_test), gbm2.predict(X_test))
def test_infinite_values():
# Basic test for infinite values
X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1)
y = np.array([0, 0, 1, 1])
gbdt = HistGradientBoostingRegressor(min_samples_leaf=1)
gbdt.fit(X, y)
np.testing.assert_allclose(gbdt.predict(X), y, atol=1e-4)
def test_consistent_lengths():
X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1)
y = np.array([0, 0, 1, 1])
sample_weight = np.array([.1, .3, .1])
gbdt = HistGradientBoostingRegressor()
with pytest.raises(ValueError,
match=r"sample_weight.shape == \(3,\), expected"):
gbdt.fit(X, y, sample_weight)
with pytest.raises(ValueError,
match="Found input variables with inconsistent number"):
gbdt.fit(X, y[1:])
def test_infinite_values_missing_values():
# High level test making sure that inf and nan values are properly handled
# when both are present. This is similar to
# test_split_on_nan_with_infinite_values() in test_grower.py, though we
# cannot check the predictions for binned values here.
X = np.asarray([-np.inf, 0, 1, np.inf, np.nan]).reshape(-1, 1)
y_isnan = np.isnan(X.ravel())
y_isinf = X.ravel() == np.inf
stump_clf = HistGradientBoostingClassifier(min_samples_leaf=1, max_iter=1,
learning_rate=1, max_depth=2)
assert stump_clf.fit(X, y_isinf).score(X, y_isinf) == 1
assert stump_clf.fit(X, y_isnan).score(X, y_isnan) == 1
def test_crossentropy_binary_problem():
# categorical_crossentropy should only be used if there are more than two
# classes present. PR #14869
X = [[1], [0]]
y = [0, 1]
gbrt = HistGradientBoostingClassifier(loss='categorical_crossentropy')
with pytest.raises(ValueError,
match="'categorical_crossentropy' is not suitable for"):
gbrt.fit(X, y)
@pytest.mark.parametrize("scoring", [None, 'loss'])
def test_string_target_early_stopping(scoring):
# Regression test for #14709 where the targets need to be encoded before
# computing the score
rng = np.random.RandomState(42)
X = rng.randn(100, 10)
y = np.array(['x'] * 50 + ['y'] * 50, dtype=object)
gbrt = HistGradientBoostingClassifier(n_iter_no_change=10, scoring=scoring)
gbrt.fit(X, y)
def test_zero_sample_weights_regression():
# Make sure setting a sample weight to zero amounts to ignoring the
# corresponding sample
X = [[1, 0],
[1, 0],
[1, 0],
[0, 1]]
y = [0, 0, 1, 0]
# ignore the first 2 training samples by setting their weight to 0
sample_weight = [0, 0, 1, 1]
gb = HistGradientBoostingRegressor(min_samples_leaf=1)
gb.fit(X, y, sample_weight=sample_weight)
assert gb.predict([[1, 0]])[0] > 0.5
def test_zero_sample_weights_classification():
# Make sure setting a sample weight to zero amounts to ignoring the
# corresponding sample
X = [[1, 0],
[1, 0],
[1, 0],
[0, 1]]
y = [0, 0, 1, 0]
# ignore the first 2 training samples by setting their weight to 0
sample_weight = [0, 0, 1, 1]
gb = HistGradientBoostingClassifier(loss='binary_crossentropy',
min_samples_leaf=1)
gb.fit(X, y, sample_weight=sample_weight)
assert_array_equal(gb.predict([[1, 0]]), [1])
X = [[1, 0],
[1, 0],
[1, 0],
[0, 1],
[1, 1]]
y = [0, 0, 1, 0, 2]
# ignore the first 2 training samples by setting their weight to 0
sample_weight = [0, 0, 1, 1, 1]
gb = HistGradientBoostingClassifier(loss='categorical_crossentropy',
min_samples_leaf=1)
gb.fit(X, y, sample_weight=sample_weight)
assert_array_equal(gb.predict([[1, 0]]), [1])
@pytest.mark.parametrize('problem', (
'regression',
'binary_classification',
'multiclass_classification'
))
@pytest.mark.parametrize('duplication', ('half', 'all'))
def test_sample_weight_effect(problem, duplication):
# High level test to make sure that duplicating a sample is equivalent to
# giving it a weight of 2.
# The test would fail for n_samples > 255 because binning does not take
# sample weights into account. Keeping n_samples <= 255 makes sure only
# unique values are used, so sample weights have no effect on binning.
n_samples = 255
n_features = 2
if problem == 'regression':
X, y = make_regression(n_samples=n_samples, n_features=n_features,
n_informative=n_features, random_state=0)
Klass = HistGradientBoostingRegressor
else:
n_classes = 2 if problem == 'binary_classification' else 3
X, y = make_classification(n_samples=n_samples, n_features=n_features,
n_informative=n_features, n_redundant=0,
n_clusters_per_class=1,
n_classes=n_classes, random_state=0)
Klass = HistGradientBoostingClassifier
# This test can't pass if min_samples_leaf > 1 because that would force 2
# samples to be in the same node in est_sw, while these samples would be
# free to be separate in est_dup: est_dup would just group together the
# duplicated samples.
est = Klass(min_samples_leaf=1)
# Create dataset with duplicate and corresponding sample weights
if duplication == 'half':
lim = n_samples // 2
else:
lim = n_samples
X_dup = np.r_[X, X[:lim]]
y_dup = np.r_[y, y[:lim]]
sample_weight = np.ones(shape=(n_samples))
sample_weight[:lim] = 2
est_sw = clone(est).fit(X, y, sample_weight=sample_weight)
est_dup = clone(est).fit(X_dup, y_dup)
# checking raw_predict is stricter than just predict for classification
assert np.allclose(est_sw._raw_predict(X_dup),
est_dup._raw_predict(X_dup))
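# A minimal sketch of the equivalence checked above on a tiny dataset, under
# the same assumptions (unique feature values so binning is unaffected,
# min_samples_leaf=1): duplicating the first sample should yield the same raw
# predictions as giving it a sample weight of 2.
def _duplication_vs_weight_sketch():
    X = np.array([[0.], [1.], [2.], [3.]])
    y = np.array([0., 0., 1., 1.])
    est_dup = HistGradientBoostingRegressor(min_samples_leaf=1).fit(
        np.r_[X, X[:1]], np.r_[y, y[:1]])
    est_sw = HistGradientBoostingRegressor(min_samples_leaf=1).fit(
        X, y, sample_weight=[2, 1, 1, 1])
    return np.allclose(est_sw._raw_predict(X), est_dup._raw_predict(X))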
@pytest.mark.parametrize('loss_name', ('least_squares',
'least_absolute_deviation'))
def test_sum_hessians_are_sample_weight(loss_name):
# For losses with constant hessians, the sum_hessians field of the
# histograms must be equal to the sum of the sample weights of the samples
# in the corresponding bin.
rng = np.random.RandomState(0)
n_samples = 1000
n_features = 2
X, y = make_regression(n_samples=n_samples, n_features=n_features,
random_state=rng)
bin_mapper = _BinMapper()
X_binned = bin_mapper.fit_transform(X)
sample_weight = rng.normal(size=n_samples)
loss = _LOSSES[loss_name](sample_weight=sample_weight)
gradients, hessians = loss.init_gradients_and_hessians(
n_samples=n_samples, prediction_dim=1, sample_weight=sample_weight)
raw_predictions = rng.normal(size=(1, n_samples))
loss.update_gradients_and_hessians(gradients, hessians, y,
raw_predictions, sample_weight)
# build sum_sample_weight which contains the sum of the sample weights at
# each bin (for each feature). This must be equal to the sum_hessians
# field of the corresponding histogram
sum_sw = np.zeros(shape=(n_features, bin_mapper.n_bins))
for feature_idx in range(n_features):
for sample_idx in range(n_samples):
sum_sw[feature_idx, X_binned[sample_idx, feature_idx]] += (
sample_weight[sample_idx])
# Build histogram
grower = TreeGrower(X_binned, gradients[0], hessians[0],
n_bins=bin_mapper.n_bins)
histograms = grower.histogram_builder.compute_histograms_brute(
grower.root.sample_indices)
for feature_idx in range(n_features):
for bin_idx in range(bin_mapper.n_bins):
assert histograms[feature_idx, bin_idx]['sum_hessians'] == (
pytest.approx(sum_sw[feature_idx, bin_idx], rel=1e-5))
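# A minimal numpy-only sketch of the property above: with a unit hessian per
# sample (constant-hessian losses), the weighted hessian accumulated in a bin
# is simply the sum of the sample weights of the samples falling in that bin.
# The bin assignment here is assumed, not computed by a _BinMapper.
def _sum_hessians_per_bin_sketch():
    bins = np.array([0, 0, 1, 1, 1])
    sample_weight = np.array([.5, 1.5, 1., 2., .5])
    unit_hessians = np.ones_like(sample_weight)
    # -> array([2. , 3.5]), i.e. the per-bin sums of the sample weights
    return np.bincount(bins, weights=unit_hessians * sample_weight)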
def test_max_depth_max_leaf_nodes():
# Non-regression test for
# https://github.com/scikit-learn/scikit-learn/issues/16179
# there was a bug when the max_depth and the max_leaf_nodes criteria were
# met at the same time, which would lead to max_leaf_nodes not being
# respected.
X, y = make_classification(random_state=0)
est = HistGradientBoostingClassifier(max_depth=2, max_leaf_nodes=3,
max_iter=1).fit(X, y)
tree = est._predictors[0][0]
assert tree.get_max_depth() == 2
assert tree.get_n_leaf_nodes() == 3 # would be 4 prior to bug fix
def test_early_stopping_on_test_set_with_warm_start():
# Non-regression test for #16661 where the second fit used to fail with
# warm_start=True, early_stopping on, and no validation set
X, y = make_classification(random_state=0)
gb = HistGradientBoostingClassifier(
max_iter=1, scoring='loss', warm_start=True, early_stopping=True,
n_iter_no_change=1, validation_fraction=None)
gb.fit(X, y)
# does not raise on second call
gb.set_params(max_iter=2)
gb.fit(X, y)
@pytest.mark.parametrize('Est', (HistGradientBoostingClassifier,
HistGradientBoostingRegressor))
def test_single_node_trees(Est):
# Make sure it's still possible to build single-node trees. In that case
# the value of the root is set to 0. That's a correct value: if the tree is
# single-node that's because min_gain_to_split is not respected right from
# the root, so we don't want the tree to have any impact on the
# predictions.
X, y = make_classification(random_state=0)
y[:] = 1 # constant target will lead to a single root node
est = Est(max_iter=20)
est.fit(X, y)
assert all(len(predictor[0].nodes) == 1 for predictor in est._predictors)
assert all(predictor[0].nodes[0]['value'] == 0
for predictor in est._predictors)
# Still gives correct predictions thanks to the baseline prediction
assert_allclose(est.predict(X), y)
@pytest.mark.parametrize('Est, loss, X, y', [
(
HistGradientBoostingClassifier,
BinaryCrossEntropy(sample_weight=None),
X_classification,
y_classification
),
(
HistGradientBoostingRegressor,
LeastSquares(sample_weight=None),
X_regression,
y_regression
)
])
def test_custom_loss(Est, loss, X, y):
est = Est(loss=loss, max_iter=20)
est.fit(X, y)

View file

@ -0,0 +1,399 @@
import numpy as np
import pytest
from pytest import approx
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
def _make_training_data(n_bins=256, constant_hessian=True):
rng = np.random.RandomState(42)
n_samples = 10000
# Generate some test data directly binned so as to test the grower code
# independently of the binning logic.
X_binned = rng.randint(0, n_bins - 1, size=(n_samples, 2),
dtype=X_BINNED_DTYPE)
X_binned = np.asfortranarray(X_binned)
def true_decision_function(input_features):
"""Ground truth decision function
This is a very simple yet asymmetric decision tree. Therefore the
grower code should have no trouble recovering the decision function
from 10000 training samples.
"""
if input_features[0] <= n_bins // 2:
return -1
else:
return -1 if input_features[1] <= n_bins // 3 else 1
target = np.array([true_decision_function(x) for x in X_binned],
dtype=Y_DTYPE)
# Assume a square loss applied to an initial model that always predicts 0
# (hardcoded for this test):
all_gradients = target.astype(G_H_DTYPE)
shape_hessians = 1 if constant_hessian else all_gradients.shape
all_hessians = np.ones(shape=shape_hessians, dtype=G_H_DTYPE)
return X_binned, all_gradients, all_hessians
def _check_children_consistency(parent, left, right):
# Make sure the samples are correctly dispatched from a parent to its
# children
assert parent.left_child is left
assert parent.right_child is right
# each sample from the parent is propagated to one of the two children
assert (len(left.sample_indices) + len(right.sample_indices)
== len(parent.sample_indices))
assert (set(left.sample_indices).union(set(right.sample_indices))
== set(parent.sample_indices))
# samples are sent either to the left or the right node, never to both
assert (set(left.sample_indices).intersection(set(right.sample_indices))
== set())
@pytest.mark.parametrize(
'n_bins, constant_hessian, stopping_param, shrinkage',
[
(11, True, "min_gain_to_split", 0.5),
(11, False, "min_gain_to_split", 1.),
(11, True, "max_leaf_nodes", 1.),
(11, False, "max_leaf_nodes", 0.1),
(42, True, "max_leaf_nodes", 0.01),
(42, False, "max_leaf_nodes", 1.),
(256, True, "min_gain_to_split", 1.),
(256, True, "max_leaf_nodes", 0.1),
]
)
def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage):
X_binned, all_gradients, all_hessians = _make_training_data(
n_bins=n_bins, constant_hessian=constant_hessian)
n_samples = X_binned.shape[0]
if stopping_param == "max_leaf_nodes":
stopping_param = {"max_leaf_nodes": 3}
else:
stopping_param = {"min_gain_to_split": 0.01}
grower = TreeGrower(X_binned, all_gradients, all_hessians,
n_bins=n_bins, shrinkage=shrinkage,
min_samples_leaf=1, **stopping_param)
# The root node is not yet split, but the best possible split has
# already been evaluated:
assert grower.root.left_child is None
assert grower.root.right_child is None
root_split = grower.root.split_info
assert root_split.feature_idx == 0
assert root_split.bin_idx == n_bins // 2
assert len(grower.splittable_nodes) == 1
# Calling split_next() applies the next split and computes the best split
# for each of the two newly introduced child nodes.
left_node, right_node = grower.split_next()
# All training samples have been split between the two nodes, approximately
# 50%/50%
_check_children_consistency(grower.root, left_node, right_node)
assert len(left_node.sample_indices) > 0.4 * n_samples
assert len(left_node.sample_indices) < 0.6 * n_samples
if grower.min_gain_to_split > 0:
# The left node is too pure: there is no gain to split it further.
assert left_node.split_info.gain < grower.min_gain_to_split
assert left_node in grower.finalized_leaves
# The right node can still be split further, this time on feature #1
split_info = right_node.split_info
assert split_info.gain > 1.
assert split_info.feature_idx == 1
assert split_info.bin_idx == n_bins // 3
assert right_node.left_child is None
assert right_node.right_child is None
# The right split has not been applied yet. Let's do it now:
assert len(grower.splittable_nodes) == 1
right_left_node, right_right_node = grower.split_next()
_check_children_consistency(right_node, right_left_node, right_right_node)
assert len(right_left_node.sample_indices) > 0.1 * n_samples
assert len(right_left_node.sample_indices) < 0.2 * n_samples
assert len(right_right_node.sample_indices) > 0.2 * n_samples
assert len(right_right_node.sample_indices) < 0.4 * n_samples
# All the leaves are pure; it is not possible to split any further:
assert not grower.splittable_nodes
grower._apply_shrinkage()
# Check the values of the leaves:
assert grower.root.left_child.value == approx(shrinkage)
assert grower.root.right_child.left_child.value == approx(shrinkage)
assert grower.root.right_child.right_child.value == approx(-shrinkage,
rel=1e-3)
def test_predictor_from_grower():
# Build a tree on the toy 3-leaf dataset to extract the predictor.
n_bins = 256
X_binned, all_gradients, all_hessians = _make_training_data(
n_bins=n_bins)
grower = TreeGrower(X_binned, all_gradients, all_hessians,
n_bins=n_bins, shrinkage=1.,
max_leaf_nodes=3, min_samples_leaf=5)
grower.grow()
assert grower.n_nodes == 5 # (2 decision nodes + 3 leaves)
# Check that the node structure can be converted into a predictor
# object to perform predictions at scale
predictor = grower.make_predictor()
assert predictor.nodes.shape[0] == 5
assert predictor.nodes['is_leaf'].sum() == 3
# Probe some predictions for each leaf of the tree
# each group of 3 samples corresponds to a condition in _make_training_data
input_data = np.array([
[0, 0],
[42, 99],
[128, 254],
[129, 0],
[129, 85],
[254, 85],
[129, 86],
[129, 254],
[242, 100],
], dtype=np.uint8)
missing_values_bin_idx = n_bins - 1
predictions = predictor.predict_binned(input_data, missing_values_bin_idx)
expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1]
assert np.allclose(predictions, expected_targets)
# Check that training set can be recovered exactly:
predictions = predictor.predict_binned(X_binned, missing_values_bin_idx)
assert np.allclose(predictions, -all_gradients)
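# A minimal sketch of why the exact recovery check above holds: with the half
# squared error loss and an initial prediction of 0, the gradient of each
# sample is (prediction - target) = -target, so a tree that isolates every
# target value in a pure leaf (shrinkage=1) predicts -gradient, i.e. the target.
def _gradient_is_minus_target_sketch(target):
    initial_prediction = np.zeros_like(target)
    gradients = initial_prediction - target  # half squared error gradient
    return np.allclose(-gradients, target)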
@pytest.mark.parametrize(
'n_samples, min_samples_leaf, n_bins, constant_hessian, noise',
[
(11, 10, 7, True, 0),
(13, 10, 42, False, 0),
(56, 10, 255, True, 0.1),
(101, 3, 7, True, 0),
(200, 42, 42, False, 0),
(300, 55, 255, True, 0.1),
(300, 301, 255, True, 0.1),
]
)
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins,
constant_hessian, noise):
rng = np.random.RandomState(seed=0)
# data = linear target, 3 features, 1 irrelevant.
X = rng.normal(size=(n_samples, 3))
y = X[:, 0] - X[:, 1]
if noise:
y_scale = y.std()
y += rng.normal(scale=noise, size=n_samples) * y_scale
mapper = _BinMapper(n_bins=n_bins)
X = mapper.fit_transform(X)
all_gradients = y.astype(G_H_DTYPE)
shape_hessian = 1 if constant_hessian else all_gradients.shape
all_hessians = np.ones(shape=shape_hessian, dtype=G_H_DTYPE)
grower = TreeGrower(X, all_gradients, all_hessians,
n_bins=n_bins, shrinkage=1.,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=n_samples)
grower.grow()
predictor = grower.make_predictor(
bin_thresholds=mapper.bin_thresholds_)
if n_samples >= min_samples_leaf:
for node in predictor.nodes:
if node['is_leaf']:
assert node['count'] >= min_samples_leaf
else:
assert predictor.nodes.shape[0] == 1
assert predictor.nodes[0]['is_leaf']
assert predictor.nodes[0]['count'] == n_samples
@pytest.mark.parametrize('n_samples, min_samples_leaf', [
(99, 50),
(100, 50)])
def test_min_samples_leaf_root(n_samples, min_samples_leaf):
# Make sure root node isn't split if n_samples is not at least twice
# min_samples_leaf
rng = np.random.RandomState(seed=0)
n_bins = 256
# data = linear target, 3 features, 1 irrelevant.
X = rng.normal(size=(n_samples, 3))
y = X[:, 0] - X[:, 1]
mapper = _BinMapper(n_bins=n_bins)
X = mapper.fit_transform(X)
all_gradients = y.astype(G_H_DTYPE)
all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
grower = TreeGrower(X, all_gradients, all_hessians,
n_bins=n_bins, shrinkage=1.,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=n_samples)
grower.grow()
if n_samples >= min_samples_leaf * 2:
assert len(grower.finalized_leaves) >= 2
else:
assert len(grower.finalized_leaves) == 1
def assert_is_stump(grower):
# To assert that stumps are created when max_depth=1
for leaf in (grower.root.left_child, grower.root.right_child):
assert leaf.left_child is None
assert leaf.right_child is None
@pytest.mark.parametrize('max_depth', [1, 2, 3])
def test_max_depth(max_depth):
# Make sure max_depth parameter works as expected
rng = np.random.RandomState(seed=0)
n_bins = 256
n_samples = 1000
# data = linear target, 3 features, 1 irrelevant.
X = rng.normal(size=(n_samples, 3))
y = X[:, 0] - X[:, 1]
mapper = _BinMapper(n_bins=n_bins)
X = mapper.fit_transform(X)
all_gradients = y.astype(G_H_DTYPE)
all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
grower = TreeGrower(X, all_gradients, all_hessians, max_depth=max_depth)
grower.grow()
depth = max(leaf.depth for leaf in grower.finalized_leaves)
assert depth == max_depth
if max_depth == 1:
assert_is_stump(grower)
def test_input_validation():
X_binned, all_gradients, all_hessians = _make_training_data()
X_binned_float = X_binned.astype(np.float32)
with pytest.raises(NotImplementedError,
match="X_binned must be of type uint8"):
TreeGrower(X_binned_float, all_gradients, all_hessians)
X_binned_C_array = np.ascontiguousarray(X_binned)
with pytest.raises(
ValueError,
match="X_binned should be passed as Fortran contiguous array"):
TreeGrower(X_binned_C_array, all_gradients, all_hessians)
def test_init_parameters_validation():
X_binned, all_gradients, all_hessians = _make_training_data()
with pytest.raises(ValueError,
match="min_gain_to_split=-1 must be positive"):
TreeGrower(X_binned, all_gradients, all_hessians,
min_gain_to_split=-1)
with pytest.raises(ValueError,
match="min_hessian_to_split=-1 must be positive"):
TreeGrower(X_binned, all_gradients, all_hessians,
min_hessian_to_split=-1)
def test_missing_value_predict_only():
# Make sure that missing values are supported at predict time even if they
# were not encountered in the training data: the missing values are
# assigned to whichever child has the most samples.
rng = np.random.RandomState(0)
n_samples = 100
X_binned = rng.randint(0, 256, size=(n_samples, 1), dtype=np.uint8)
X_binned = np.asfortranarray(X_binned)
gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
hessians = np.ones(shape=1, dtype=G_H_DTYPE)
grower = TreeGrower(X_binned, gradients, hessians, min_samples_leaf=5,
has_missing_values=False)
grower.grow()
predictor = grower.make_predictor()
# go from root to a leaf, always following node with the most samples.
# That's the path nans are supposed to take
node = predictor.nodes[0]
while not node['is_leaf']:
left = predictor.nodes[node['left']]
right = predictor.nodes[node['right']]
node = left if left['count'] > right['count'] else right
prediction_main_path = node['value']
# now build X_test with only nans, and make sure all predictions are equal
# to prediction_main_path
all_nans = np.full(shape=(n_samples, 1), fill_value=np.nan)
assert np.all(predictor.predict(all_nans) == prediction_main_path)
def test_split_on_nan_with_infinite_values():
# Make sure the split on nan situations are respected even when there are
# samples with +inf values (we set the threshold to +inf when we have a
# split on nan so this test makes sure this does not introduce edge-case
# bugs). We need to use the private API so that we can also test
# predict_binned().
X = np.array([0, 1, np.inf, np.nan, np.nan]).reshape(-1, 1)
# the gradient values will force a split on nan situation
gradients = np.array([0, 0, 0, 100, 100], dtype=G_H_DTYPE)
hessians = np.ones(shape=1, dtype=G_H_DTYPE)
bin_mapper = _BinMapper()
X_binned = bin_mapper.fit_transform(X)
n_bins_non_missing = 3
has_missing_values = True
grower = TreeGrower(X_binned, gradients, hessians,
n_bins_non_missing=n_bins_non_missing,
has_missing_values=has_missing_values,
min_samples_leaf=1)
grower.grow()
predictor = grower.make_predictor(
bin_thresholds=bin_mapper.bin_thresholds_
)
# sanity check: this was a split on nan
assert predictor.nodes[0]['threshold'] == np.inf
assert predictor.nodes[0]['bin_threshold'] == n_bins_non_missing - 1
# Make sure in particular that the +inf sample is mapped to the left child
# Note that lightgbm "fails" here and will assign the inf sample to the
# right child, even though it's a "split on nan" situation.
predictions = predictor.predict(X)
predictions_binned = predictor.predict_binned(
X_binned, missing_values_bin_idx=bin_mapper.missing_values_bin_idx_)
np.testing.assert_allclose(predictions, -gradients)
np.testing.assert_allclose(predictions_binned, -gradients)

View file

@ -0,0 +1,202 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose
from numpy.testing import assert_array_equal
from sklearn.ensemble._hist_gradient_boosting.histogram import (
_build_histogram_naive,
_build_histogram,
_build_histogram_no_hessian,
_build_histogram_root_no_hessian,
_build_histogram_root,
_subtract_histograms
)
from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
@pytest.mark.parametrize(
'build_func', [_build_histogram_naive, _build_histogram])
def test_build_histogram(build_func):
binned_feature = np.array([0, 2, 0, 1, 2, 0, 2, 1], dtype=X_BINNED_DTYPE)
# Small sample_indices (below unrolling threshold)
ordered_gradients = np.array([0, 1, 3], dtype=G_H_DTYPE)
ordered_hessians = np.array([1, 1, 2], dtype=G_H_DTYPE)
sample_indices = np.array([0, 2, 3], dtype=np.uint32)
hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE)
build_func(0, sample_indices, binned_feature, ordered_gradients,
ordered_hessians, hist)
hist = hist[0]
assert_array_equal(hist['count'], [2, 1, 0])
assert_allclose(hist['sum_gradients'], [1, 3, 0])
assert_allclose(hist['sum_hessians'], [2, 2, 0])
# Larger sample_indices (above unrolling threshold)
sample_indices = np.array([0, 2, 3, 6, 7], dtype=np.uint32)
ordered_gradients = np.array([0, 1, 3, 0, 1], dtype=G_H_DTYPE)
ordered_hessians = np.array([1, 1, 2, 1, 0], dtype=G_H_DTYPE)
hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE)
build_func(0, sample_indices, binned_feature, ordered_gradients,
ordered_hessians, hist)
hist = hist[0]
assert_array_equal(hist['count'], [2, 2, 1])
assert_allclose(hist['sum_gradients'], [1, 4, 0])
assert_allclose(hist['sum_hessians'], [2, 2, 1])
def test_histogram_sample_order_independence():
# Make sure the order of the samples has no impact on the histogram
# computations
rng = np.random.RandomState(42)
n_sub_samples = 100
n_samples = 1000
n_bins = 256
binned_feature = rng.randint(0, n_bins - 1, size=n_samples,
dtype=X_BINNED_DTYPE)
sample_indices = rng.choice(np.arange(n_samples, dtype=np.uint32),
n_sub_samples, replace=False)
ordered_gradients = rng.randn(n_sub_samples).astype(G_H_DTYPE)
hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_build_histogram_no_hessian(0, sample_indices, binned_feature,
ordered_gradients, hist_gc)
ordered_hessians = rng.exponential(size=n_sub_samples).astype(G_H_DTYPE)
hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_build_histogram(0, sample_indices, binned_feature,
ordered_gradients, ordered_hessians, hist_ghc)
permutation = rng.permutation(n_sub_samples)
hist_gc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_build_histogram_no_hessian(0, sample_indices[permutation],
binned_feature, ordered_gradients[permutation],
hist_gc_perm)
hist_ghc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_build_histogram(0, sample_indices[permutation], binned_feature,
ordered_gradients[permutation],
ordered_hessians[permutation], hist_ghc_perm)
hist_gc = hist_gc[0]
hist_ghc = hist_ghc[0]
hist_gc_perm = hist_gc_perm[0]
hist_ghc_perm = hist_ghc_perm[0]
assert_allclose(hist_gc['sum_gradients'], hist_gc_perm['sum_gradients'])
assert_array_equal(hist_gc['count'], hist_gc_perm['count'])
assert_allclose(hist_ghc['sum_gradients'], hist_ghc_perm['sum_gradients'])
assert_allclose(hist_ghc['sum_hessians'], hist_ghc_perm['sum_hessians'])
assert_array_equal(hist_ghc['count'], hist_ghc_perm['count'])
@pytest.mark.parametrize("constant_hessian", [True, False])
def test_unrolled_equivalent_to_naive(constant_hessian):
# Make sure the different unrolled histogram computations give the same
# results as the naive one.
rng = np.random.RandomState(42)
n_samples = 10
n_bins = 5
sample_indices = np.arange(n_samples).astype(np.uint32)
binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8)
ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
if constant_hessian:
ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
else:
ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
hist_gc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
hist_ghc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
hist_naive = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_build_histogram_root_no_hessian(0, binned_feature,
ordered_gradients, hist_gc_root)
_build_histogram_root(0, binned_feature, ordered_gradients,
ordered_hessians, hist_ghc_root)
_build_histogram_no_hessian(0, sample_indices, binned_feature,
ordered_gradients, hist_gc)
_build_histogram(0, sample_indices, binned_feature,
ordered_gradients, ordered_hessians, hist_ghc)
_build_histogram_naive(0, sample_indices, binned_feature,
ordered_gradients, ordered_hessians, hist_naive)
hist_naive = hist_naive[0]
hist_gc_root = hist_gc_root[0]
hist_ghc_root = hist_ghc_root[0]
hist_gc = hist_gc[0]
hist_ghc = hist_ghc[0]
for hist in (hist_gc_root, hist_ghc_root, hist_gc, hist_ghc):
assert_array_equal(hist['count'], hist_naive['count'])
assert_allclose(hist['sum_gradients'], hist_naive['sum_gradients'])
for hist in (hist_ghc_root, hist_ghc):
assert_allclose(hist['sum_hessians'], hist_naive['sum_hessians'])
for hist in (hist_gc_root, hist_gc):
assert_array_equal(hist['sum_hessians'], np.zeros(n_bins))
@pytest.mark.parametrize("constant_hessian", [True, False])
def test_hist_subtraction(constant_hessian):
# Make sure the histogram subtraction trick gives the same result as the
# classical method.
rng = np.random.RandomState(42)
n_samples = 10
n_bins = 5
sample_indices = np.arange(n_samples).astype(np.uint32)
binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8)
ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
if constant_hessian:
ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
else:
ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
hist_parent = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
if constant_hessian:
_build_histogram_no_hessian(0, sample_indices, binned_feature,
ordered_gradients, hist_parent)
else:
_build_histogram(0, sample_indices, binned_feature,
ordered_gradients, ordered_hessians, hist_parent)
mask = rng.randint(0, 2, n_samples).astype(bool)
sample_indices_left = sample_indices[mask]
ordered_gradients_left = ordered_gradients[mask]
ordered_hessians_left = ordered_hessians[mask]
hist_left = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
if constant_hessian:
_build_histogram_no_hessian(0, sample_indices_left,
binned_feature, ordered_gradients_left,
hist_left)
else:
_build_histogram(0, sample_indices_left, binned_feature,
ordered_gradients_left, ordered_hessians_left,
hist_left)
sample_indices_right = sample_indices[~mask]
ordered_gradients_right = ordered_gradients[~mask]
ordered_hessians_right = ordered_hessians[~mask]
hist_right = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
if constant_hessian:
_build_histogram_no_hessian(0, sample_indices_right,
binned_feature, ordered_gradients_right,
hist_right)
else:
_build_histogram(0, sample_indices_right, binned_feature,
ordered_gradients_right, ordered_hessians_right,
hist_right)
hist_left_sub = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
hist_right_sub = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_subtract_histograms(0, n_bins, hist_parent, hist_right, hist_left_sub)
_subtract_histograms(0, n_bins, hist_parent, hist_left, hist_right_sub)
for key in ('count', 'sum_hessians', 'sum_gradients'):
assert_allclose(hist_left[key], hist_left_sub[key], rtol=1e-6)
assert_allclose(hist_right[key], hist_right_sub[key], rtol=1e-6)
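# A minimal numpy sketch of the subtraction trick checked above: since every
# parent sample goes to exactly one child, a child's per-bin statistics can be
# recovered by subtracting the sibling's histogram from the parent's, without
# scanning the child's samples. Only sum_gradients is shown here.
def _histogram_subtraction_sketch():
    binned_feature = np.array([0, 1, 1, 2, 2, 2])
    gradients = np.array([1., -1., 2., .5, -.5, 1.])
    in_left = np.array([True, True, False, True, False, False])
    def hist(mask):
        return np.bincount(binned_feature[mask], weights=gradients[mask],
                           minlength=3)
    hist_right_sub = hist(np.ones_like(in_left)) - hist(in_left)
    return np.allclose(hist_right_sub, hist(~in_left))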

View file

@ -0,0 +1,318 @@
import numpy as np
from numpy.testing import assert_almost_equal
from numpy.testing import assert_allclose
from scipy.optimize import newton
from sklearn.utils import assert_all_finite
from sklearn.utils.fixes import sp_version, parse_version
import pytest
from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES
from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.utils._testing import skip_if_32bit
def get_derivatives_helper(loss):
"""Return get_gradients() and get_hessians() functions for a given loss.
"""
def get_gradients(y_true, raw_predictions):
# create gradients and hessians array, update inplace, and return
gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE)
hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE)
loss.update_gradients_and_hessians(gradients, hessians, y_true,
raw_predictions, None)
return gradients
def get_hessians(y_true, raw_predictions):
# create gradients and hessians array, update inplace, and return
gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE)
hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE)
loss.update_gradients_and_hessians(gradients, hessians, y_true,
raw_predictions, None)
if loss.__class__.__name__ == 'LeastSquares':
# hessians aren't updated because they're constant:
# the value is 1 (and not 2) because the loss is actually a half
# least squares loss.
hessians = np.full_like(raw_predictions, fill_value=1)
elif loss.__class__.__name__ == 'LeastAbsoluteDeviation':
# hessians aren't updated because they're constant
hessians = np.full_like(raw_predictions, fill_value=0)
return hessians
return get_gradients, get_hessians
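# A minimal sketch of why the constant hessian is 1 for 'least_squares': the
# loss implemented is the *half* least squares loss l(y, p) = 0.5 * (y - p)**2,
# whose derivatives w.r.t. the raw prediction p are dl/dp = p - y and
# d2l/dp2 = 1 (it would be 2 for the plain squared error).
def _half_least_squares_sketch(y_true, raw_prediction):
    loss = 0.5 * (y_true - raw_prediction) ** 2
    gradient = raw_prediction - y_true
    hessian = np.ones_like(raw_prediction)
    return loss, gradient, hessian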
@pytest.mark.parametrize('loss, x0, y_true', [
('least_squares', -2., 42),
('least_squares', 117., 1.05),
('least_squares', 0., 0.),
# I don't understand why but y_true == 0 fails :/
# ('binary_crossentropy', 0.3, 0),
('binary_crossentropy', -12, 1),
('binary_crossentropy', 30, 1),
('poisson', 12., 1.),
('poisson', 0., 2.),
('poisson', -22., 10.),
])
@pytest.mark.skipif(sp_version == parse_version('1.2.0'),
reason='bug in scipy 1.2.0, see scipy issue #9608')
@skip_if_32bit
def test_derivatives(loss, x0, y_true):
# Check that gradients are zero when the loss is minimized on a 1D array
# using Halley's method with the first and second order derivatives
# computed by the Loss instance.
loss = _LOSSES[loss](sample_weight=None)
y_true = np.array([y_true], dtype=Y_DTYPE)
x0 = np.array([x0], dtype=Y_DTYPE).reshape(1, 1)
get_gradients, get_hessians = get_derivatives_helper(loss)
def func(x):
return loss.pointwise_loss(y_true, x)
def fprime(x):
return get_gradients(y_true, x)
def fprime2(x):
return get_hessians(y_true, x)
optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2,
maxiter=70, tol=2e-8)
assert np.allclose(loss.inverse_link_function(optimum), y_true)
assert np.allclose(loss.pointwise_loss(y_true, optimum), 0)
assert np.allclose(get_gradients(y_true, optimum), 0, atol=1e-7)
@pytest.mark.parametrize('loss, n_classes, prediction_dim', [
('least_squares', 0, 1),
('least_absolute_deviation', 0, 1),
('binary_crossentropy', 2, 1),
('categorical_crossentropy', 3, 3),
('poisson', 0, 1),
])
@pytest.mark.skipif(Y_DTYPE != np.float64,
reason='Need 64 bits float precision for numerical checks')
def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0):
# Make sure gradients and hessians computed in the loss are correct, by
# comparing with their approximations computed with finite central
# differences.
# See https://en.wikipedia.org/wiki/Finite_difference.
rng = np.random.RandomState(seed)
n_samples = 100
if loss in ('least_squares', 'least_absolute_deviation'):
y_true = rng.normal(size=n_samples).astype(Y_DTYPE)
elif loss == 'poisson':
y_true = rng.poisson(size=n_samples).astype(Y_DTYPE)
else:
y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE)
raw_predictions = rng.normal(
size=(prediction_dim, n_samples)
).astype(Y_DTYPE)
loss = _LOSSES[loss](sample_weight=None)
get_gradients, get_hessians = get_derivatives_helper(loss)
# only take gradients and hessians of first tree / class.
gradients = get_gradients(y_true, raw_predictions)[0, :].ravel()
hessians = get_hessians(y_true, raw_predictions)[0, :].ravel()
# Approximate gradients
# For multiclass loss, we should only change the predictions of one tree
# (here the first), hence the use of offset[0, :] = eps
# As a softmax is computed, offsetting the whole array by a constant would
# have no effect on the probabilities, and thus on the loss
eps = 1e-9
offset = np.zeros_like(raw_predictions)
offset[0, :] = eps
f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset / 2)
f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset / 2)
numerical_gradients = (f_plus_eps - f_minus_eps) / eps
# Approximate hessians
eps = 1e-4 # need big enough eps as we divide by its square
offset[0, :] = eps
f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset)
f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset)
f = loss.pointwise_loss(y_true, raw_predictions)
numerical_hessians = (f_plus_eps + f_minus_eps - 2 * f) / eps**2
assert_allclose(numerical_gradients, gradients, rtol=1e-4, atol=1e-7)
assert_allclose(numerical_hessians, hessians, rtol=1e-4, atol=1e-7)
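# A minimal sketch of the central finite difference formulas used above,
# applied to the half squared error f(x) = 0.5 * (x - y)**2, whose exact
# derivatives are f'(x) = x - y and f''(x) = 1. eps is kept relatively large
# for the hessian because we divide by its square.
def _finite_differences_sketch(x=.3, y=1.2, eps=1e-4):
    def f(x):
        return .5 * (x - y) ** 2
    numerical_gradient = (f(x + eps / 2) - f(x - eps / 2)) / eps
    numerical_hessian = (f(x + eps) + f(x - eps) - 2 * f(x)) / eps ** 2
    return np.allclose([numerical_gradient, numerical_hessian], [x - y, 1.])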
def test_baseline_least_squares():
rng = np.random.RandomState(0)
loss = _LOSSES['least_squares'](sample_weight=None)
y_train = rng.normal(size=100)
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert baseline_prediction.shape == tuple() # scalar
assert baseline_prediction.dtype == y_train.dtype
# Make sure baseline prediction is the mean of all targets
assert_almost_equal(baseline_prediction, y_train.mean())
assert np.allclose(loss.inverse_link_function(baseline_prediction),
baseline_prediction)
def test_baseline_least_absolute_deviation():
rng = np.random.RandomState(0)
loss = _LOSSES['least_absolute_deviation'](sample_weight=None)
y_train = rng.normal(size=100)
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert baseline_prediction.shape == tuple() # scalar
assert baseline_prediction.dtype == y_train.dtype
# Make sure baseline prediction is the median of all targets
assert np.allclose(loss.inverse_link_function(baseline_prediction),
baseline_prediction)
assert baseline_prediction == pytest.approx(np.median(y_train))
def test_baseline_poisson():
rng = np.random.RandomState(0)
loss = _LOSSES['poisson'](sample_weight=None)
y_train = rng.poisson(size=100).astype(np.float64)
# Sanity check, make sure at least one sample is non-zero so we don't take
# log(0)
assert y_train.sum() > 0
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert np.isscalar(baseline_prediction)
assert baseline_prediction.dtype == y_train.dtype
assert_all_finite(baseline_prediction)
# Make sure the baseline prediction is the log of the mean of all targets
assert_almost_equal(np.log(y_train.mean()), baseline_prediction)
# Test baseline for y_true = 0
y_train.fill(0.)
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert_all_finite(baseline_prediction)
def test_baseline_binary_crossentropy():
rng = np.random.RandomState(0)
loss = _LOSSES['binary_crossentropy'](sample_weight=None)
for y_train in (np.zeros(shape=100), np.ones(shape=100)):
y_train = y_train.astype(np.float64)
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert_all_finite(baseline_prediction)
assert np.allclose(loss.inverse_link_function(baseline_prediction),
y_train[0])
# Make sure baseline prediction is equal to link_function(p), where p
# is the proba of the positive class. We want predict_proba() to return p,
# and by definition
# p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction)
# So we want raw_prediction = link_function(p) = log(p / (1 - p))
y_train = rng.randint(0, 2, size=100).astype(np.float64)
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert baseline_prediction.shape == tuple() # scalar
assert baseline_prediction.dtype == y_train.dtype
p = y_train.mean()
assert np.allclose(baseline_prediction, np.log(p / (1 - p)))
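# A minimal numpy sketch of the baseline computation checked above for the
# binary cross-entropy loss (assuming y_train contains both classes): the
# baseline raw prediction is the logit of the positive class frequency, so
# that applying the inverse link (sigmoid) recovers that frequency.
def _binary_baseline_sketch(y_train):
    p = y_train.mean()                    # frequency of the positive class
    baseline = np.log(p / (1 - p))        # link function (logit)
    proba = 1 / (1 + np.exp(-baseline))   # inverse link, equals p
    return baseline, proba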
def test_baseline_categorical_crossentropy():
rng = np.random.RandomState(0)
prediction_dim = 4
loss = _LOSSES['categorical_crossentropy'](sample_weight=None)
for y_train in (np.zeros(shape=100), np.ones(shape=100)):
y_train = y_train.astype(np.float64)
baseline_prediction = loss.get_baseline_prediction(y_train, None,
prediction_dim)
assert baseline_prediction.dtype == y_train.dtype
assert_all_finite(baseline_prediction)
# Same logic as for the above test. Here inverse_link_function = softmax and
# link_function = log
y_train = rng.randint(0, prediction_dim + 1, size=100).astype(np.float32)
baseline_prediction = loss.get_baseline_prediction(y_train, None,
prediction_dim)
assert baseline_prediction.shape == (prediction_dim, 1)
for k in range(prediction_dim):
p = (y_train == k).mean()
assert np.allclose(baseline_prediction[k, :], np.log(p))
@pytest.mark.parametrize('loss, problem', [
('least_squares', 'regression'),
('least_absolute_deviation', 'regression'),
('binary_crossentropy', 'classification'),
('categorical_crossentropy', 'classification'),
('poisson', 'poisson_regression'),
])
@pytest.mark.parametrize('sample_weight', ['ones', 'random'])
def test_sample_weight_multiplies_gradients(loss, problem, sample_weight):
# Make sure that passing sample weights to the gradient and hessians
# computation methods is equivalent to multiplying by the weights.
rng = np.random.RandomState(42)
n_samples = 1000
if loss == 'categorical_crossentropy':
n_classes = prediction_dim = 3
else:
n_classes = prediction_dim = 1
if problem == 'regression':
y_true = rng.normal(size=n_samples).astype(Y_DTYPE)
elif problem == 'poisson_regression':
y_true = rng.poisson(size=n_samples).astype(Y_DTYPE)
else:
y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE)
if sample_weight == 'ones':
sample_weight = np.ones(shape=n_samples, dtype=Y_DTYPE)
else:
sample_weight = rng.normal(size=n_samples).astype(Y_DTYPE)
loss_ = _LOSSES[loss](sample_weight=sample_weight)
baseline_prediction = loss_.get_baseline_prediction(
y_true, None, prediction_dim
)
raw_predictions = np.zeros(shape=(prediction_dim, n_samples),
dtype=baseline_prediction.dtype)
raw_predictions += baseline_prediction
gradients = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE)
hessians = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE)
loss_.update_gradients_and_hessians(gradients, hessians, y_true,
raw_predictions, None)
gradients_sw = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE)
hessians_sw = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE)
loss_.update_gradients_and_hessians(gradients_sw, hessians_sw, y_true,
raw_predictions, sample_weight)
assert np.allclose(gradients * sample_weight, gradients_sw)
assert np.allclose(hessians * sample_weight, hessians_sw)
def test_init_gradient_and_hessians_sample_weight():
# Make sure that passing sample_weight to a loss correctly influences the
# hessians_are_constant attribute, and consequently the shape of the
# hessians array.
prediction_dim = 2
n_samples = 5
sample_weight = None
loss = _LOSSES['least_squares'](sample_weight=sample_weight)
_, hessians = loss.init_gradients_and_hessians(
n_samples=n_samples, prediction_dim=prediction_dim,
sample_weight=None)
assert loss.hessians_are_constant
assert hessians.shape == (1, 1)
sample_weight = np.ones(n_samples)
loss = _LOSSES['least_squares'](sample_weight=sample_weight)
_, hessians = loss.init_gradients_and_hessians(
n_samples=n_samples, prediction_dim=prediction_dim,
sample_weight=sample_weight)
assert not loss.hessians_are_constant
assert hessians.shape == (prediction_dim, n_samples)

View file

@ -0,0 +1,341 @@
import numpy as np
import pytest
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint
from sklearn.ensemble._hist_gradient_boosting.splitting import (
Splitter,
compute_node_value
)
from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
def is_increasing(a):
return (np.diff(a) >= 0.0).all()
def is_decreasing(a):
return (np.diff(a) <= 0.0).all()
def assert_leaves_values_monotonic(predictor, monotonic_cst):
# Make sure leaf values (from left to right) are either all increasing
# or all decreasing (or neither), depending on the monotonic constraint.
nodes = predictor.nodes
def get_leaves_values():
"""get leaves values from left to right"""
values = []
def depth_first_collect_leaf_values(node_idx):
node = nodes[node_idx]
if node['is_leaf']:
values.append(node['value'])
return
depth_first_collect_leaf_values(node['left'])
depth_first_collect_leaf_values(node['right'])
depth_first_collect_leaf_values(0) # start at root (0)
return values
values = get_leaves_values()
if monotonic_cst == MonotonicConstraint.NO_CST:
# some increasing, some decreasing
assert not is_increasing(values) and not is_decreasing(values)
elif monotonic_cst == MonotonicConstraint.POS:
# all increasing
assert is_increasing(values)
else: # NEG
# all decreasing
assert is_decreasing(values)
def assert_children_values_monotonic(predictor, monotonic_cst):
# Make sure siblings values respect the monotonic constraints. Left should
# be lower (resp greater) than right child if constraint is POS (resp.
# NEG).
# Note that this property alone isn't enough to ensure full monotonicity,
# since we also need to guarantee that all the descendants of the left
# child won't be greater (resp. lower) than the right child, or its
# descendants. That's why we need to bound the predicted values (this is
# tested in assert_children_values_bounded)
nodes = predictor.nodes
left_lower = []
left_greater = []
for node in nodes:
if node['is_leaf']:
continue
left_idx = node['left']
right_idx = node['right']
if nodes[left_idx]['value'] < nodes[right_idx]['value']:
left_lower.append(node)
elif nodes[left_idx]['value'] > nodes[right_idx]['value']:
left_greater.append(node)
if monotonic_cst == MonotonicConstraint.NO_CST:
assert left_lower and left_greater
elif monotonic_cst == MonotonicConstraint.POS:
assert left_lower and not left_greater
else: # NEG
assert not left_lower and left_greater
def assert_children_values_bounded(grower, monotonic_cst):
# Make sure that the values of the children of a node are bounded by the
# middle value between that node and its sibling (if there is a monotonic
# constraint).
# As a bonus, we also check that the siblings values are properly ordered
# which is slightly redundant with assert_children_values_monotonic (but
# this check is done on the grower nodes whereas
# assert_children_values_monotonic is done on the predictor nodes)
if monotonic_cst == MonotonicConstraint.NO_CST:
return
def recursively_check_children_node_values(node):
if node.is_leaf:
return
if node is not grower.root and node is node.parent.left_child:
sibling = node.sibling # on the right
middle = (node.value + sibling.value) / 2
if monotonic_cst == MonotonicConstraint.POS:
assert (node.left_child.value <=
node.right_child.value <=
middle)
if not sibling.is_leaf:
assert (middle <=
sibling.left_child.value <=
sibling.right_child.value)
else: # NEG
assert (node.left_child.value >=
node.right_child.value >=
middle)
if not sibling.is_leaf:
assert (middle >=
sibling.left_child.value >=
sibling.right_child.value)
recursively_check_children_node_values(node.left_child)
recursively_check_children_node_values(node.right_child)
recursively_check_children_node_values(grower.root)
@pytest.mark.parametrize('seed', range(3))
@pytest.mark.parametrize('monotonic_cst', (
MonotonicConstraint.NO_CST,
MonotonicConstraint.POS,
MonotonicConstraint.NEG,
))
def test_nodes_values(monotonic_cst, seed):
# Build a single tree with only one feature, and make sure the nodes
# values respect the monotonic constraints.
# Considering the following tree with a monotonic POS constraint, we
# should have:
#
# root
# / \
# 5 10 # middle = 7.5
# / \ / \
# a b c d
#
# a <= b and c <= d (assert_children_values_monotonic)
# a, b <= middle <= c, d (assert_children_values_bounded)
# a <= b <= c <= d (assert_leaves_values_monotonic)
#
# The last one is a consequence of the others, but can't hurt to check
rng = np.random.RandomState(seed)
n_samples = 1000
n_features = 1
X_binned = rng.randint(0, 255, size=(n_samples, n_features),
dtype=np.uint8)
X_binned = np.asfortranarray(X_binned)
gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
hessians = np.ones(shape=1, dtype=G_H_DTYPE)
grower = TreeGrower(X_binned, gradients, hessians,
monotonic_cst=[monotonic_cst],
shrinkage=.1)
grower.grow()
# grow() will shrink the leaf values at the very end. For our comparison
# tests, we need to revert the shrinkage of the leaves, otherwise we would
# compare the value of a leaf (shrunk) with a node (not shrunk) and the
# test would not be correct.
for leaf in grower.finalized_leaves:
    leaf.value /= grower.shrinkage
# The consistency of the bounds can only be checked on the tree grower
# as the node bounds are not copied into the predictor tree. The
# consistency checks on the values of node children and leaves can be
# done either on the grower tree or on the predictor tree. We only
# do those checks on the predictor tree as the latter is derived from
# the former.
predictor = grower.make_predictor()
assert_children_values_monotonic(predictor, monotonic_cst)
assert_children_values_bounded(grower, monotonic_cst)
assert_leaves_values_monotonic(predictor, monotonic_cst)
@pytest.mark.parametrize('seed', range(3))
def test_predictions(seed):
# Train a model with a POS constraint on the first feature and a NEG
# constraint on the second feature, and make sure the constraints are
# respected by checking the predictions.
# test adapted from lightgbm's test_monotone_constraint(), itself inspired
# by https://xgboost.readthedocs.io/en/latest/tutorials/monotonic.html
rng = np.random.RandomState(seed)
n_samples = 1000
f_0 = rng.rand(n_samples) # positive correlation with y
f_1 = rng.rand(n_samples) # negative correlation with y
X = np.c_[f_0, f_1]
noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
y = (5 * f_0 + np.sin(10 * np.pi * f_0) -
5 * f_1 - np.cos(10 * np.pi * f_1) +
noise)
gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1])
gbdt.fit(X, y)
linspace = np.linspace(0, 1, 100)
sin = np.sin(linspace)
constant = np.full_like(linspace, fill_value=.5)
# We now assert the predictions properly respect the constraints, on each
# feature. When testing for a feature we need to set the other one to a
# constant, because the monotonic constraints are only an "all else being
# equal" type of constraint:
# a constraint on the first feature only means that
# x0 < x0' => f(x0, x1) < f(x0', x1)
# while x1 stays constant.
# The constraint does not guarantee that
# x0 < x0' => f(x0, x1) < f(x0', x1')
# First feature (POS)
# assert pred is all increasing when f_0 is all increasing
X = np.c_[linspace, constant]
pred = gbdt.predict(X)
assert is_increasing(pred)
# assert pred actually follows the variations of f_0
X = np.c_[sin, constant]
pred = gbdt.predict(X)
assert np.all((np.diff(pred) >= 0) == (np.diff(sin) >= 0))
# Second feature (NEG)
# assert pred is all decreasing when f_1 is all increasing
X = np.c_[constant, linspace]
pred = gbdt.predict(X)
assert is_decreasing(pred)
# assert pred actually follows the inverse variations of f_1
X = np.c_[constant, sin]
pred = gbdt.predict(X)
assert ((np.diff(pred) <= 0) == (np.diff(sin) >= 0)).all()
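# A minimal usage sketch of the "all else being equal" property discussed
# above: with monotonic_cst=[1, -1], increasing the first feature while
# keeping the second one fixed must not decrease the prediction. The data and
# constant value used here are arbitrary.
def _monotonic_cst_usage_sketch():
    rng = np.random.RandomState(0)
    X = rng.rand(500, 2)
    y = 3 * X[:, 0] - 2 * X[:, 1] + .01 * rng.normal(size=500)
    gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1]).fit(X, y)
    grid = np.linspace(0, 1, 50)
    preds = gbdt.predict(np.c_[grid, np.full_like(grid, .5)])
    return is_increasing(preds)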
def test_input_error():
X = [[1, 2], [2, 3], [3, 4]]
y = [0, 1, 2]
gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, 0, -1])
with pytest.raises(ValueError,
match='monotonic_cst has shape 3 but the input data'):
gbdt.fit(X, y)
for monotonic_cst in ([1, 3], [1, -3]):
gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
with pytest.raises(ValueError,
match='must be None or an array-like of '
'-1, 0 or 1'):
gbdt.fit(X, y)
gbdt = HistGradientBoostingClassifier(monotonic_cst=[0, 1])
with pytest.raises(
ValueError,
match='monotonic constraints are not supported '
'for multiclass classification'
):
gbdt.fit(X, y)
def test_bounded_value_min_gain_to_split():
# The purpose of this test is to show that when computing the gain at a
# given split, the value of the current node should be properly bounded to
# respect the monotonic constraints, because it strongly interacts with
# min_gain_to_split. We build a simple example where gradients are [1, 1,
# 100, 1, 1] (hessians are all ones). The best split happens on the 3rd
# bin, and depending on whether the value of the node is bounded or not,
# the min_gain_to_split constraint is or isn't satisfied.
l2_regularization = 0
min_hessian_to_split = 0
min_samples_leaf = 1
n_bins = n_samples = 5
X_binned = np.arange(n_samples).reshape(-1, 1).astype(X_BINNED_DTYPE)
sample_indices = np.arange(n_samples, dtype=np.uint32)
all_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
all_gradients = np.array([1, 1, 100, 1, 1], dtype=G_H_DTYPE)
sum_gradients = all_gradients.sum()
sum_hessians = all_hessians.sum()
hessians_are_constant = False
builder = HistogramBuilder(X_binned, n_bins, all_gradients,
all_hessians, hessians_are_constant)
n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1],
dtype=np.uint32)
has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
missing_values_bin_idx = n_bins - 1
children_lower_bound, children_upper_bound = -np.inf, np.inf
min_gain_to_split = 2000
splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx,
has_missing_values, monotonic_cst, l2_regularization,
min_hessian_to_split, min_samples_leaf,
min_gain_to_split, hessians_are_constant)
histograms = builder.compute_histograms_brute(sample_indices)
# Since the gradient array is [1, 1, 100, 1, 1]
# the max possible gain happens on the 3rd bin (or equivalently in the 2nd)
# and is equal to about 1307, which is less than min_gain_to_split = 2000, so
# the node is considered unsplittable (gain = -1)
current_lower_bound, current_upper_bound = -np.inf, np.inf
value = compute_node_value(sum_gradients, sum_hessians,
current_lower_bound, current_upper_bound,
l2_regularization)
# the unbounded value is equal to -sum_gradients / sum_hessians
assert value == pytest.approx(-104 / 5)
split_info = splitter.find_node_split(n_samples, histograms,
sum_gradients, sum_hessians, value,
lower_bound=children_lower_bound,
upper_bound=children_upper_bound)
assert split_info.gain == -1 # min_gain_to_split not respected
# here again the max possible gain is on the 3rd bin but we now cap the
# value of the node into [-10, inf].
# This means the gain is now about 2430 which is more than the
# min_gain_to_split constraint.
current_lower_bound, current_upper_bound = -10, np.inf
value = compute_node_value(sum_gradients, sum_hessians,
current_lower_bound, current_upper_bound,
l2_regularization)
assert value == -10
split_info = splitter.find_node_split(n_samples, histograms,
sum_gradients, sum_hessians, value,
lower_bound=children_lower_bound,
upper_bound=children_upper_bound)
assert split_info.gain > min_gain_to_split
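# A minimal numpy sketch of the node value computation discussed above,
# assuming the usual formula value = -sum_gradients / (sum_hessians +
# l2_regularization), clipped to the monotonic bounds: with gradients
# [1, 1, 100, 1, 1] and unit hessians this gives -104 / 5 = -20.8 unbounded,
# and -10 once the lower bound of -10 is applied.
def _bounded_node_value_sketch(lower_bound=-10, l2_regularization=0.):
    sum_gradients, sum_hessians = 104., 5.
    unbounded_value = -sum_gradients / (sum_hessians + l2_regularization)
    bounded_value = np.clip(unbounded_value, lower_bound, np.inf)
    return unbounded_value, bounded_value  # (-20.8, -10.0)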

View file

@ -0,0 +1,76 @@
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import pytest
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor
from sklearn.ensemble._hist_gradient_boosting.common import (
G_H_DTYPE, PREDICTOR_RECORD_DTYPE, ALMOST_INF)
@pytest.mark.parametrize('n_bins', [200, 256])
def test_regression_dataset(n_bins):
X, y = make_regression(n_samples=500, n_features=10, n_informative=5,
random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=42)
mapper = _BinMapper(n_bins=n_bins, random_state=42)
X_train_binned = mapper.fit_transform(X_train)
# Init gradients and hessians to those of the least squares loss
gradients = -y_train.astype(G_H_DTYPE)
hessians = np.ones(1, dtype=G_H_DTYPE)
min_samples_leaf = 10
max_leaf_nodes = 30
grower = TreeGrower(X_train_binned, gradients, hessians,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes, n_bins=n_bins,
n_bins_non_missing=mapper.n_bins_non_missing_)
grower.grow()
predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)
assert r2_score(y_train, predictor.predict(X_train)) > 0.82
assert r2_score(y_test, predictor.predict(X_test)) > 0.67
@pytest.mark.parametrize('threshold, expected_predictions', [
(-np.inf, [0, 1, 1, 1]),
(10, [0, 0, 1, 1]),
(20, [0, 0, 0, 1]),
(ALMOST_INF, [0, 0, 0, 1]),
(np.inf, [0, 0, 0, 0]),
])
def test_infinite_values_and_thresholds(threshold, expected_predictions):
# Make sure infinite values and infinite thresholds are handled properly.
# In particular, if a value is +inf and the threshold is ALMOST_INF the
# sample should go to the right child. If the threshold is inf (split on
# nan), the +inf sample will go to the left child.
X = np.array([-np.inf, 10, 20, np.inf]).reshape(-1, 1)
nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE)
# We just construct a simple tree with 1 root and 2 children
# parent node
nodes[0]['left'] = 1
nodes[0]['right'] = 2
nodes[0]['feature_idx'] = 0
nodes[0]['threshold'] = threshold
# left child
nodes[1]['is_leaf'] = True
nodes[1]['value'] = 0
# right child
nodes[2]['is_leaf'] = True
nodes[2]['value'] = 1
predictor = TreePredictor(nodes)
predictions = predictor.predict(X)
assert np.all(predictions == expected_predictions)

View file

@ -0,0 +1,480 @@
import numpy as np
import pytest
from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint
from sklearn.ensemble._hist_gradient_boosting.splitting import (
Splitter,
compute_node_value
)
from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder
from sklearn.utils._testing import skip_if_32bit
@pytest.mark.parametrize('n_bins', [3, 32, 256])
def test_histogram_split(n_bins):
rng = np.random.RandomState(42)
feature_idx = 0
l2_regularization = 0
min_hessian_to_split = 1e-3
min_samples_leaf = 1
min_gain_to_split = 0.
X_binned = np.asfortranarray(
rng.randint(0, n_bins - 1, size=(int(1e4), 1)), dtype=X_BINNED_DTYPE)
binned_feature = X_binned.T[feature_idx]
sample_indices = np.arange(binned_feature.shape[0], dtype=np.uint32)
ordered_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE)
all_hessians = ordered_hessians
sum_hessians = all_hessians.sum()
hessians_are_constant = False
for true_bin in range(1, n_bins - 2):
for sign in [-1, 1]:
ordered_gradients = np.full_like(binned_feature, sign,
dtype=G_H_DTYPE)
ordered_gradients[binned_feature <= true_bin] *= -1
all_gradients = ordered_gradients
sum_gradients = all_gradients.sum()
builder = HistogramBuilder(X_binned,
n_bins,
all_gradients,
all_hessians,
hessians_are_constant)
n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1],
dtype=np.uint32)
has_missing_values = np.array([False] * X_binned.shape[1],
dtype=np.uint8)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
missing_values_bin_idx = n_bins - 1
splitter = Splitter(X_binned,
n_bins_non_missing,
missing_values_bin_idx,
has_missing_values,
monotonic_cst,
l2_regularization,
min_hessian_to_split,
min_samples_leaf, min_gain_to_split,
hessians_are_constant)
histograms = builder.compute_histograms_brute(sample_indices)
value = compute_node_value(sum_gradients, sum_hessians,
-np.inf, np.inf, l2_regularization)
split_info = splitter.find_node_split(
sample_indices.shape[0], histograms, sum_gradients,
sum_hessians, value)
assert split_info.bin_idx == true_bin
assert split_info.gain >= 0
assert split_info.feature_idx == feature_idx
assert (split_info.n_samples_left + split_info.n_samples_right
== sample_indices.shape[0])
# All hessians are equal to 1, so the hessian sum on each side equals
# the number of samples on that side.
assert split_info.n_samples_left == split_info.sum_hessian_left
@skip_if_32bit
@pytest.mark.parametrize('constant_hessian', [True, False])
def test_gradient_and_hessian_sanity(constant_hessian):
# This test checks that the values of gradients and hessians are
# consistent in different places:
# - in split_info: si.sum_gradient_left + si.sum_gradient_right must be
# equal to the gradient at the node. Same for hessians.
# - in the histograms: summing 'sum_gradients' over the bins must be
# constant across all features, and those sums must be equal to the
# node's gradient. Same for hessians.
rng = np.random.RandomState(42)
n_bins = 10
n_features = 20
n_samples = 500
l2_regularization = 0.
min_hessian_to_split = 1e-3
min_samples_leaf = 1
min_gain_to_split = 0.
X_binned = rng.randint(0, n_bins, size=(n_samples, n_features),
dtype=X_BINNED_DTYPE)
X_binned = np.asfortranarray(X_binned)
sample_indices = np.arange(n_samples, dtype=np.uint32)
all_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
sum_gradients = all_gradients.sum()
if constant_hessian:
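# When hessians are constant, only a single value is stored and the
# per-node sums are obtained as n_samples * that value.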
all_hessians = np.ones(1, dtype=G_H_DTYPE)
sum_hessians = 1 * n_samples
else:
all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
sum_hessians = all_hessians.sum()
builder = HistogramBuilder(X_binned, n_bins, all_gradients,
all_hessians, constant_hessian)
n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1],
dtype=np.uint32)
has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
missing_values_bin_idx = n_bins - 1
splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx,
has_missing_values, monotonic_cst, l2_regularization,
min_hessian_to_split, min_samples_leaf,
min_gain_to_split, constant_hessian)
hists_parent = builder.compute_histograms_brute(sample_indices)
value_parent = compute_node_value(sum_gradients, sum_hessians,
-np.inf, np.inf, l2_regularization)
si_parent = splitter.find_node_split(n_samples, hists_parent,
sum_gradients, sum_hessians,
value_parent)
sample_indices_left, sample_indices_right, _ = splitter.split_indices(
si_parent, sample_indices)
hists_left = builder.compute_histograms_brute(sample_indices_left)
value_left = compute_node_value(si_parent.sum_gradient_left,
si_parent.sum_hessian_left,
-np.inf, np.inf, l2_regularization)
hists_right = builder.compute_histograms_brute(sample_indices_right)
value_right = compute_node_value(si_parent.sum_gradient_right,
si_parent.sum_hessian_right,
-np.inf, np.inf, l2_regularization)
si_left = splitter.find_node_split(n_samples, hists_left,
si_parent.sum_gradient_left,
si_parent.sum_hessian_left,
value_left)
si_right = splitter.find_node_split(n_samples, hists_right,
si_parent.sum_gradient_right,
si_parent.sum_hessian_right,
value_right)
# make sure that si.sum_gradient_left + si.sum_gradient_right have their
# expected value, same for hessians
for si, indices in (
(si_parent, sample_indices),
(si_left, sample_indices_left),
(si_right, sample_indices_right)):
gradient = si.sum_gradient_right + si.sum_gradient_left
expected_gradient = all_gradients[indices].sum()
hessian = si.sum_hessian_right + si.sum_hessian_left
if constant_hessian:
expected_hessian = indices.shape[0] * all_hessians[0]
else:
expected_hessian = all_hessians[indices].sum()
assert np.isclose(gradient, expected_gradient)
assert np.isclose(hessian, expected_hessian)
# make sure sum of gradients in histograms are the same for all features,
# and make sure they're equal to their expected value
hists_parent = np.asarray(hists_parent, dtype=HISTOGRAM_DTYPE)
hists_left = np.asarray(hists_left, dtype=HISTOGRAM_DTYPE)
hists_right = np.asarray(hists_right, dtype=HISTOGRAM_DTYPE)
for hists, indices in (
(hists_parent, sample_indices),
(hists_left, sample_indices_left),
(hists_right, sample_indices_right)):
# note: gradients and hessians have shape (n_features,),
# we're comparing them to *scalars*. This has the benefit of also
# making sure that all the entries are equal across features.
gradients = hists['sum_gradients'].sum(axis=1) # shape = (n_features,)
expected_gradient = all_gradients[indices].sum() # scalar
hessians = hists['sum_hessians'].sum(axis=1)
if constant_hessian:
# with constant hessians, hessian sums are not computed in the
# histograms, so the stored value is 0 rather than the actual hessian
expected_hessian = 0.
else:
expected_hessian = all_hessians[indices].sum()
assert np.allclose(gradients, expected_gradient)
assert np.allclose(hessians, expected_hessian)
def test_split_indices():
# Check that split_indices returns the correct splits and that
# splitter.partition is consistent with what is returned.
rng = np.random.RandomState(421)
n_bins = 5
n_samples = 10
l2_regularization = 0.
min_hessian_to_split = 1e-3
min_samples_leaf = 1
min_gain_to_split = 0.
# split will happen on feature 1 and on bin 3
X_binned = [[0, 0],
[0, 3],
[0, 4],
[0, 0],
[0, 0],
[0, 0],
[0, 0],
[0, 4],
[0, 0],
[0, 4]]
X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE)
sample_indices = np.arange(n_samples, dtype=np.uint32)
all_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
all_hessians = np.ones(1, dtype=G_H_DTYPE)
sum_gradients = all_gradients.sum()
sum_hessians = 1 * n_samples
hessians_are_constant = True
builder = HistogramBuilder(X_binned, n_bins,
all_gradients, all_hessians,
hessians_are_constant)
n_bins_non_missing = np.array([n_bins] * X_binned.shape[1],
dtype=np.uint32)
has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
missing_values_bin_idx = n_bins - 1
splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx,
has_missing_values, monotonic_cst, l2_regularization,
min_hessian_to_split, min_samples_leaf,
min_gain_to_split, hessians_are_constant)
assert np.all(sample_indices == splitter.partition)
histograms = builder.compute_histograms_brute(sample_indices)
value = compute_node_value(sum_gradients, sum_hessians,
-np.inf, np.inf, l2_regularization)
si_root = splitter.find_node_split(n_samples, histograms,
sum_gradients, sum_hessians, value)
# sanity checks for best split
assert si_root.feature_idx == 1
assert si_root.bin_idx == 3
samples_left, samples_right, position_right = splitter.split_indices(
si_root, splitter.partition)
assert set(samples_left) == set([0, 1, 3, 4, 5, 6, 8])
assert set(samples_right) == set([2, 7, 9])
assert list(samples_left) == list(splitter.partition[:position_right])
assert list(samples_right) == list(splitter.partition[position_right:])
# Check that the resulting split indices sizes are consistent with the
# count statistics anticipated when looking for the best split.
assert samples_left.shape[0] == si_root.n_samples_left
assert samples_right.shape[0] == si_root.n_samples_right
def test_min_gain_to_split():
# Try to split a pure node (all gradients are equal, same for hessians)
# with min_gain_to_split = 0 and make sure that the node is not split (best
# possible gain = -1). Note: before the strict inequality comparison, this
# test would fail because the node would be split with a gain of 0.
rng = np.random.RandomState(42)
l2_regularization = 0
min_hessian_to_split = 0
min_samples_leaf = 1
min_gain_to_split = 0.
n_bins = 255
n_samples = 100
X_binned = np.asfortranarray(
rng.randint(0, n_bins, size=(n_samples, 1)), dtype=X_BINNED_DTYPE)
binned_feature = X_binned[:, 0]
sample_indices = np.arange(n_samples, dtype=np.uint32)
all_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE)
all_gradients = np.ones_like(binned_feature, dtype=G_H_DTYPE)
sum_gradients = all_gradients.sum()
sum_hessians = all_hessians.sum()
hessians_are_constant = False
builder = HistogramBuilder(X_binned, n_bins, all_gradients,
all_hessians, hessians_are_constant)
n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1],
dtype=np.uint32)
has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
missing_values_bin_idx = n_bins - 1
splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx,
has_missing_values, monotonic_cst, l2_regularization,
min_hessian_to_split, min_samples_leaf,
min_gain_to_split, hessians_are_constant)
histograms = builder.compute_histograms_brute(sample_indices)
value = compute_node_value(sum_gradients, sum_hessians,
-np.inf, np.inf, l2_regularization)
split_info = splitter.find_node_split(n_samples, histograms,
sum_gradients, sum_hessians, value)
assert split_info.gain == -1
@pytest.mark.parametrize(
'X_binned, all_gradients, has_missing_values, n_bins_non_missing, '
' expected_split_on_nan, expected_bin_idx, expected_go_to_left', [
# basic sanity check with no missing values: given the gradient
# values, the split must occur on bin_idx=3
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], # X_binned
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5], # gradients
False, # no missing values
10, # n_bins_non_missing
False, # don't split on nans
3, # expected_bin_idx
'not_applicable'),
# We replace 2 samples by NaNs (bin_idx=8)
# These 2 samples were mapped to the left node before, so they should
# be mapped to left node again
# Notice how the bin_idx threshold changes from 3 to 1.
([8, 0, 1, 8, 2, 3, 4, 5, 6, 7], # 8 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
8, # n_bins_non_missing
False, # don't split on nans
1, # cut on bin_idx=1
True), # missing values go to left
# same as above, but with non-consecutive missing_values_bin
([9, 0, 1, 9, 2, 3, 4, 5, 6, 7], # 9 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
8, # n_bins_non_missing
False, # don't split on nans
1, # cut on bin_idx=1
True), # missing values go to left
# this time replacing 2 samples that were on the right.
([0, 1, 2, 3, 8, 4, 8, 5, 6, 7], # 8 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
8, # n_bins_non_missing
False, # don't split on nans
3, # cut on bin_idx=3 (like in first case)
False), # missing values go to right
# same as above, but with non-consecutive missing_values_bin
([0, 1, 2, 3, 9, 4, 9, 5, 6, 7], # 9 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
8, # n_bins_non_missing
False, # don't split on nans
3, # cut on bin_idx=3 (like in first case)
False), # missing values go to right
# For the following cases, split_on_nans is True (we replace all of
# the samples with nans, instead of just 2).
([0, 1, 2, 3, 4, 4, 4, 4, 4, 4], # 4 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
4, # n_bins_non_missing
True, # split on nans
3, # cut on bin_idx=3
False), # missing values go to right
# same as above, but with non-consecutive missing_values_bin
([0, 1, 2, 3, 9, 9, 9, 9, 9, 9], # 9 <=> missing
[1, 1, 1, 1, 1, 1, 5, 5, 5, 5],
True, # missing values
4, # n_bins_non_missing
True, # split on nans
3, # cut on bin_idx=3
False), # missing values go to right
([6, 6, 6, 6, 0, 1, 2, 3, 4, 5], # 6 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
6, # n_bins_non_missing
True, # split on nans
5, # cut on bin_idx=5
False), # missing values go to right
# same as above, but with non-consecutive missing_values_bin
([9, 9, 9, 9, 0, 1, 2, 3, 4, 5], # 9 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
6, # n_bins_non_missing
True, # split on nans
5, # cut on bin_idx=5
False), # missing values go to right
]
)
def test_splitting_missing_values(X_binned, all_gradients,
has_missing_values, n_bins_non_missing,
expected_split_on_nan, expected_bin_idx,
expected_go_to_left):
# Make sure missing values are properly supported.
# we build an artificial example with gradients such that the best split
# is on bin_idx=3, when there are no missing values.
# Then we introduce missing values and:
# - make sure the chosen bin is correct (find_best_bin()): it's
# still the same split, even though the index of the bin may change
# - make sure the missing values are mapped to the correct child
# (split_indices())
n_bins = max(X_binned) + 1
n_samples = len(X_binned)
l2_regularization = 0.
min_hessian_to_split = 1e-3
min_samples_leaf = 1
min_gain_to_split = 0.
sample_indices = np.arange(n_samples, dtype=np.uint32)
X_binned = np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1)
X_binned = np.asfortranarray(X_binned)
all_gradients = np.array(all_gradients, dtype=G_H_DTYPE)
has_missing_values = np.array([has_missing_values], dtype=np.uint8)
all_hessians = np.ones(1, dtype=G_H_DTYPE)
sum_gradients = all_gradients.sum()
sum_hessians = 1 * n_samples
hessians_are_constant = True
builder = HistogramBuilder(X_binned, n_bins,
all_gradients, all_hessians,
hessians_are_constant)
n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
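# The splitter assumes that missing values are always mapped to the last
# bin (n_bins - 1); in the "non-consecutive" cases above there is a gap
# between the last non-missing bin and this missing values bin.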
missing_values_bin_idx = n_bins - 1
splitter = Splitter(X_binned, n_bins_non_missing,
missing_values_bin_idx, has_missing_values,
monotonic_cst,
l2_regularization, min_hessian_to_split,
min_samples_leaf, min_gain_to_split,
hessians_are_constant)
histograms = builder.compute_histograms_brute(sample_indices)
value = compute_node_value(sum_gradients, sum_hessians,
-np.inf, np.inf, l2_regularization)
split_info = splitter.find_node_split(n_samples, histograms,
sum_gradients, sum_hessians, value)
assert split_info.bin_idx == expected_bin_idx
if has_missing_values:
assert split_info.missing_go_to_left == expected_go_to_left
split_on_nan = split_info.bin_idx == n_bins_non_missing[0] - 1
assert split_on_nan == expected_split_on_nan
# Make sure the split is properly computed.
# This also make sure missing values are properly assigned to the correct
# child in split_indices()
samples_left, samples_right, _ = splitter.split_indices(
split_info, splitter.partition)
if not expected_split_on_nan:
# When we don't split on nans, the split should always be the same.
assert set(samples_left) == set([0, 1, 2, 3])
assert set(samples_right) == set([4, 5, 6, 7, 8, 9])
else:
# When we split on nans, samples with missing values are always mapped
# to the right child.
missing_samples_indices = np.flatnonzero(
np.array(X_binned) == missing_values_bin_idx)
non_missing_samples_indices = np.flatnonzero(
np.array(X_binned) != missing_values_bin_idx)
assert set(samples_right) == set(missing_samples_indices)
assert set(samples_left) == set(non_missing_samples_indices)

View file

@ -0,0 +1,206 @@
import numpy as np
from numpy.testing import assert_array_equal
from numpy.testing import assert_allclose
import pytest
from sklearn.base import clone
from sklearn.datasets import make_classification, make_regression
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import check_scoring
X_classification, y_classification = make_classification(random_state=0)
X_regression, y_regression = make_regression(random_state=0)
def _assert_predictor_equal(gb_1, gb_2, X):
"""Assert that two HistGBM instances are identical."""
# Check identical nodes for each tree
for (pred_ith_1, pred_ith_2) in zip(gb_1._predictors, gb_2._predictors):
for (predictor_1, predictor_2) in zip(pred_ith_1, pred_ith_2):
assert_array_equal(predictor_1.nodes, predictor_2.nodes)
# Check identical predictions
assert_allclose(gb_1.predict(X), gb_2.predict(X))
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_max_iter_with_warm_start_validation(GradientBoosting, X, y):
# Check that a ValueError is raised when the maximum number of iterations
# is smaller than the number of iterations from the previous fit when warm
# start is True.
estimator = GradientBoosting(max_iter=10, early_stopping=False,
warm_start=True)
estimator.fit(X, y)
estimator.set_params(max_iter=5)
err_msg = ('max_iter=5 must be larger than or equal to n_iter_=10 '
'when warm_start==True')
with pytest.raises(ValueError, match=err_msg):
estimator.fit(X, y)
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_yields_identical_results(GradientBoosting, X, y):
# Make sure that fitting 50 iterations and then 25 with warm start is
# equivalent to fitting 75 iterations.
rng = 42
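# n_iter_no_change is set above max_iter, presumably so that early stopping
# can never cut the fits short: both estimators run for exactly max_iter
# iterations.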
gb_warm_start = GradientBoosting(
n_iter_no_change=100, max_iter=50, random_state=rng, warm_start=True
)
gb_warm_start.fit(X, y).set_params(max_iter=75).fit(X, y)
gb_no_warm_start = GradientBoosting(
n_iter_no_change=100, max_iter=75, random_state=rng, warm_start=False
)
gb_no_warm_start.fit(X, y)
# Check that both predictors are equal
_assert_predictor_equal(gb_warm_start, gb_no_warm_start, X)
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_max_depth(GradientBoosting, X, y):
# Test that it is possible to fit trees of different depths in the ensemble.
gb = GradientBoosting(max_iter=20, min_samples_leaf=1,
warm_start=True, max_depth=2, early_stopping=False)
gb.fit(X, y)
gb.set_params(max_iter=30, max_depth=3, n_iter_no_change=110)
gb.fit(X, y)
# First 20 trees have max_depth == 2
for i in range(20):
assert gb._predictors[i][0].get_max_depth() == 2
# Last 10 trees have max_depth == 3
for i in range(1, 11):
assert gb._predictors[-i][0].get_max_depth() == 3
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
@pytest.mark.parametrize('scoring', (None, 'loss'))
def test_warm_start_early_stopping(GradientBoosting, X, y, scoring):
# Make sure that early stopping occurs after a small number of iterations
# when fitting a second time with warm starting.
n_iter_no_change = 5
gb = GradientBoosting(
n_iter_no_change=n_iter_no_change, max_iter=10000, early_stopping=True,
random_state=42, warm_start=True, tol=1e-3, scoring=scoring,
)
gb.fit(X, y)
n_iter_first_fit = gb.n_iter_
gb.fit(X, y)
n_iter_second_fit = gb.n_iter_
assert 0 < n_iter_second_fit - n_iter_first_fit < n_iter_no_change
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_equal_n_estimators(GradientBoosting, X, y):
# Test if warm start with equal n_estimators does nothing
gb_1 = GradientBoosting(max_depth=2, early_stopping=False)
gb_1.fit(X, y)
gb_2 = clone(gb_1)
gb_2.set_params(max_iter=gb_1.max_iter, warm_start=True,
n_iter_no_change=5)
gb_2.fit(X, y)
# Check that both predictors are equal
_assert_predictor_equal(gb_1, gb_2, X)
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_clear(GradientBoosting, X, y):
# Test if fit clears state.
gb_1 = GradientBoosting(n_iter_no_change=5, random_state=42)
gb_1.fit(X, y)
gb_2 = GradientBoosting(n_iter_no_change=5, random_state=42,
warm_start=True)
gb_2.fit(X, y) # inits state
gb_2.set_params(warm_start=False)
gb_2.fit(X, y) # clears old state and equals est
# Check that both predictors have the same train_score_ and
# validation_score_ attributes
assert_allclose(gb_1.train_score_, gb_2.train_score_)
assert_allclose(gb_1.validation_score_, gb_2.validation_score_)
# Check that both predictors are equal
_assert_predictor_equal(gb_1, gb_2, X)
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
@pytest.mark.parametrize('rng_type', ('none', 'int', 'instance'))
def test_random_seeds_warm_start(GradientBoosting, X, y, rng_type):
# Make sure the seeds for train/val split and small trainset subsampling
# are correctly set in a warm start context.
def _get_rng(rng_type):
# Helper to avoid consuming rngs
if rng_type == 'none':
return None
elif rng_type == 'int':
return 42
else:
return np.random.RandomState(0)
random_state = _get_rng(rng_type)
gb_1 = GradientBoosting(early_stopping=True, max_iter=2,
random_state=random_state)
gb_1.set_params(scoring=check_scoring(gb_1))
gb_1.fit(X, y)
random_seed_1_1 = gb_1._random_seed
gb_1.fit(X, y)
random_seed_1_2 = gb_1._random_seed # clear the old state, different seed
random_state = _get_rng(rng_type)
gb_2 = GradientBoosting(early_stopping=True, max_iter=2,
random_state=random_state, warm_start=True)
gb_2.set_params(scoring=check_scoring(gb_2))
gb_2.fit(X, y) # inits state
random_seed_2_1 = gb_2._random_seed
gb_2.fit(X, y) # clears old state and equals est
random_seed_2_2 = gb_2._random_seed
# Without warm starting, the seeds should be
# * all different if random state is None
# * all equal if random state is an integer
# * different when refitting and equal with a new estimator (because
# the random state is mutated)
if rng_type == 'none':
assert random_seed_1_1 != random_seed_1_2 != random_seed_2_1
elif rng_type == 'int':
assert random_seed_1_1 == random_seed_1_2 == random_seed_2_1
else:
assert random_seed_1_1 == random_seed_2_1 != random_seed_1_2
# With warm starting, the seeds must be equal
assert random_seed_2_1 == random_seed_2_2

View file

@ -0,0 +1,513 @@
# Authors: Nicolas Goix <nicolas.goix@telecom-paristech.fr>
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
# License: BSD 3 clause
import numbers
import numpy as np
from scipy.sparse import issparse
from warnings import warn
from ..tree import ExtraTreeRegressor
from ..utils import (
check_random_state,
check_array,
gen_batches,
get_chunk_n_rows,
)
from ..utils.fixes import _joblib_parallel_args
from ..utils.validation import check_is_fitted, _num_samples
from ..utils.validation import _deprecate_positional_args
from ..base import OutlierMixin
from ._bagging import BaseBagging
__all__ = ["IsolationForest"]
class IsolationForest(OutlierMixin, BaseBagging):
"""
Isolation Forest Algorithm.
Return the anomaly score of each sample using the IsolationForest algorithm.
The IsolationForest 'isolates' observations by randomly selecting a feature
and then randomly selecting a split value between the maximum and minimum
values of the selected feature.
Since recursive partitioning can be represented by a tree structure, the
number of splittings required to isolate a sample is equivalent to the path
length from the root node to the terminating node.
This path length, averaged over a forest of such random trees, is a
measure of normality and our decision function.
Random partitioning produces noticeably shorter paths for anomalies.
Hence, when a forest of random trees collectively produces shorter path
lengths for particular samples, these samples are highly likely to be anomalies.
Read more in the :ref:`User Guide <isolation_forest>`.
.. versionadded:: 0.18
Parameters
----------
n_estimators : int, default=100
The number of base estimators in the ensemble.
max_samples : "auto", int or float, default="auto"
The number of samples to draw from X to train each base estimator.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples.
- If "auto", then `max_samples=min(256, n_samples)`.
If max_samples is larger than the number of samples provided,
all samples will be used for all trees (no sampling).
contamination : 'auto' or float, default='auto'
The amount of contamination of the data set, i.e. the proportion
of outliers in the data set. Used when fitting to define the threshold
on the scores of the samples.
- If 'auto', the threshold is determined as in the
original paper.
- If float, the contamination should be in the range [0, 0.5].
.. versionchanged:: 0.22
The default value of ``contamination`` changed from 0.1
to ``'auto'``.
max_features : int or float, default=1.0
The number of features to draw from X to train each base estimator.
- If int, then draw `max_features` features.
- If float, then draw `max_features * X.shape[1]` features.
bootstrap : bool, default=False
If True, individual trees are fit on random subsets of the training
data sampled with replacement. If False, sampling without replacement
is performed.
n_jobs : int, default=None
The number of jobs to run in parallel for both :meth:`fit` and
:meth:`predict`. ``None`` means 1 unless in a
:obj:`joblib.parallel_backend` context. ``-1`` means using all
processors. See :term:`Glossary <n_jobs>` for more details.
behaviour : str, default='deprecated'
This parameter has no effect, is deprecated, and will be removed.
.. versionadded:: 0.20
``behaviour`` is added in 0.20 for back-compatibility purposes.
.. deprecated:: 0.20
``behaviour='old'`` is deprecated in 0.20 and will not be possible
in 0.22.
.. deprecated:: 0.22
``behaviour`` parameter is deprecated in 0.22 and removed in
0.24.
random_state : int or RandomState, default=None
Controls the pseudo-randomness of the selection of the feature
and split values for each branching step and each tree in the forest.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
verbose : int, default=0
Controls the verbosity of the tree building process.
warm_start : bool, default=False
When set to ``True``, reuse the solution of the previous call to fit
and add more estimators to the ensemble, otherwise, just fit a whole
new forest. See :term:`the Glossary <warm_start>`.
.. versionadded:: 0.21
Attributes
----------
estimators_ : list of ExtraTreeRegressor instances
The collection of fitted sub-estimators.
estimators_samples_ : list of arrays
The subset of drawn samples (i.e., the in-bag samples) for each base
estimator.
max_samples_ : int
The actual number of samples drawn to train each base estimator.
offset_ : float
Offset used to define the decision function from the raw scores. We
have the relation: ``decision_function = score_samples - offset_``.
``offset_`` is defined as follows. When the contamination parameter is
set to "auto", the offset is equal to -0.5 as the scores of inliers are
close to 0 and the scores of outliers are close to -1. When a
contamination parameter different than "auto" is provided, the offset
is defined in such a way that we obtain the expected number of outliers
(samples with decision function < 0) in training.
.. versionadded:: 0.20
estimators_features_ : list of arrays
The subset of drawn features for each base estimator.
Notes
-----
The implementation is based on an ensemble of ExtraTreeRegressor. The
maximum depth of each tree is set to ``ceil(log_2(n))`` where
:math:`n` is the number of samples used to build the tree
(see (Liu et al., 2008) for more details).
References
----------
.. [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest."
Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on.
.. [2] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation-based
anomaly detection." ACM Transactions on Knowledge Discovery from
Data (TKDD) 6.1 (2012): 3.
See Also
--------
sklearn.covariance.EllipticEnvelope : An object for detecting outliers in a
Gaussian distributed dataset.
sklearn.svm.OneClassSVM : Unsupervised Outlier Detection.
Estimate the support of a high-dimensional distribution.
The implementation is based on libsvm.
sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection
using Local Outlier Factor (LOF).
Examples
--------
>>> from sklearn.ensemble import IsolationForest
>>> X = [[-1.1], [0.3], [0.5], [100]]
>>> clf = IsolationForest(random_state=0).fit(X)
>>> clf.predict([[0.1], [0], [90]])
array([ 1, 1, -1])
"""
@_deprecate_positional_args
def __init__(self, *,
n_estimators=100,
max_samples="auto",
contamination="auto",
max_features=1.,
bootstrap=False,
n_jobs=None,
behaviour='deprecated',
random_state=None,
verbose=0,
warm_start=False):
super().__init__(
base_estimator=ExtraTreeRegressor(
max_features=1,
splitter='random',
random_state=random_state),
# note: the max_features above (for the tree) is unrelated to
# self.max_features
bootstrap=bootstrap,
bootstrap_features=False,
n_estimators=n_estimators,
max_samples=max_samples,
max_features=max_features,
warm_start=warm_start,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose)
self.behaviour = behaviour
self.contamination = contamination
def _set_oob_score(self, X, y):
raise NotImplementedError("OOB score not supported by iforest")
def _parallel_args(self):
# ExtraTreeRegressor releases the GIL, so it's more efficient to use
# a thread-based backend rather than a process-based backend so as
# to avoid suffering from communication overhead and extra memory
# copies.
return _joblib_parallel_args(prefer='threads')
def fit(self, X, y=None, sample_weight=None):
"""
Fit estimator.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples. Use ``dtype=np.float32`` for maximum
efficiency. Sparse matrices are also supported, use sparse
``csc_matrix`` for maximum efficiency.
y : Ignored
Not used, present for API consistency by convention.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted.
Returns
-------
self : object
Fitted estimator.
"""
if self.behaviour != 'deprecated':
if self.behaviour == 'new':
warn(
"'behaviour' is deprecated in 0.22 and will be removed "
"in 0.24. You should not pass or set this parameter.",
FutureWarning
)
else:
raise NotImplementedError(
"The old behaviour of IsolationForest is not implemented "
"anymore. Remove the 'behaviour' parameter."
)
X = check_array(X, accept_sparse=['csc'])
if issparse(X):
# Pre-sort indices to avoid that each individual tree of the
# ensemble sorts the indices.
X.sort_indices()
rnd = check_random_state(self.random_state)
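# ExtraTreeRegressor needs a target to fit; since the splits are drawn
# completely at random, a random uniform target is sufficient here.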
y = rnd.uniform(size=X.shape[0])
# ensure that max_samples is in [1, n_samples]:
n_samples = X.shape[0]
if isinstance(self.max_samples, str):
if self.max_samples == 'auto':
max_samples = min(256, n_samples)
else:
raise ValueError('max_samples (%s) is not supported. '
'Valid choices are: "auto", int or '
'float' % self.max_samples)
elif isinstance(self.max_samples, numbers.Integral):
if self.max_samples > n_samples:
warn("max_samples (%s) is greater than the "
"total number of samples (%s). max_samples "
"will be set to n_samples for estimation."
% (self.max_samples, n_samples))
max_samples = n_samples
else:
max_samples = self.max_samples
else: # float
if not 0. < self.max_samples <= 1.:
raise ValueError("max_samples must be in (0, 1], got %r"
% self.max_samples)
max_samples = int(self.max_samples * X.shape[0])
self.max_samples_ = max_samples
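# Height limit from Liu et al. (2008): trees are grown to a depth of at
# most ceil(log2(max_samples)), as described in the Notes section of the
# class docstring.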
max_depth = int(np.ceil(np.log2(max(max_samples, 2))))
super()._fit(X, y, max_samples,
max_depth=max_depth,
sample_weight=sample_weight)
if self.contamination == "auto":
# 0.5 plays a special role as described in the original paper.
# we take the opposite as we consider the opposite of their score.
self.offset_ = -0.5
return self
# else, define offset_ wrt contamination parameter
self.offset_ = np.percentile(self.score_samples(X),
100. * self.contamination)
return self
def predict(self, X):
"""
Predict if a particular sample is an outlier or not.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples. Internally, it will be converted to
``dtype=np.float32`` and if a sparse matrix is provided
to a sparse ``csr_matrix``.
Returns
-------
is_inlier : ndarray of shape (n_samples,)
For each observation, tells whether or not (+1 or -1) it should
be considered as an inlier according to the fitted model.
"""
check_is_fitted(self)
X = check_array(X, accept_sparse='csr')
is_inlier = np.ones(X.shape[0], dtype=int)
is_inlier[self.decision_function(X) < 0] = -1
return is_inlier
def decision_function(self, X):
"""
Average anomaly score of X of the base classifiers.
The anomaly score of an input sample is computed as
the mean anomaly score of the trees in the forest.
The measure of normality of an observation given a tree is the depth
of the leaf containing this observation, which is equivalent to
the number of splittings required to isolate this point. If there are
several observations (n_left) in the leaf, the average path length of
an isolation tree built on n_left samples is added.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples. Internally, it will be converted to
``dtype=np.float32`` and if a sparse matrix is provided
to a sparse ``csr_matrix``.
Returns
-------
scores : ndarray of shape (n_samples,)
The anomaly score of the input samples.
The lower, the more abnormal. Negative scores represent outliers,
positive scores represent inliers.
"""
# We subtract self.offset_ to make 0 be the threshold value for being
# an outlier:
return self.score_samples(X) - self.offset_
def score_samples(self, X):
"""
Opposite of the anomaly score defined in the original paper.
The anomaly score of an input sample is computed as
the mean anomaly score of the trees in the forest.
The measure of normality of an observation given a tree is the depth
of the leaf containing this observation, which is equivalent to
the number of splittings required to isolate this point. If there are
several observations (n_left) in the leaf, the average path length of
an isolation tree built on n_left samples is added.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples.
Returns
-------
scores : ndarray of shape (n_samples,)
The anomaly score of the input samples.
The lower, the more abnormal.
"""
# code structure from ForestClassifier/predict_proba
check_is_fitted(self)
# Check data
X = check_array(X, accept_sparse='csr')
if self.n_features_ != X.shape[1]:
raise ValueError("Number of features of the model must "
"match the input. Model n_features is {0} and "
"input n_features is {1}."
"".format(self.n_features_, X.shape[1]))
# Take the opposite of the scores as bigger is better (here less
# abnormal)
return -self._compute_chunked_score_samples(X)
def _compute_chunked_score_samples(self, X):
n_samples = _num_samples(X)
if self._max_features == X.shape[1]:
subsample_features = False
else:
subsample_features = True
# We get as many rows as possible within our working_memory budget
# (defined by sklearn.get_config()['working_memory']) to store
# self._max_features in each row during computation.
#
# Note:
# - this will get at least 1 row, even if 1 row of score will
# exceed working_memory.
# - this only accounts for temporary memory usage while loading
# the data needed to compute the scores -- the returned scores
# themselves are 1D.
chunk_n_rows = get_chunk_n_rows(row_bytes=16 * self._max_features,
max_n_rows=n_samples)
slices = gen_batches(n_samples, chunk_n_rows)
scores = np.zeros(n_samples, order="f")
for sl in slices:
# compute score on the slices of test samples:
scores[sl] = self._compute_score_samples(X[sl], subsample_features)
return scores
def _compute_score_samples(self, X, subsample_features):
"""
Compute the score of each sample in X going through the extra trees.
Parameters
----------
X : array-like or sparse matrix
Data matrix.
subsample_features : bool
Whether features should be subsampled.
"""
n_samples = X.shape[0]
depths = np.zeros(n_samples, order="f")
for tree, features in zip(self.estimators_, self.estimators_features_):
X_subset = X[:, features] if subsample_features else X
leaves_index = tree.apply(X_subset)
node_indicator = tree.decision_path(X_subset)
n_samples_leaf = tree.tree_.n_node_samples[leaves_index]
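# The path length of a sample is the number of edges from the root to
# its leaf (node_indicator counts nodes, hence the -1.0), plus an
# estimate of the depth of the subtree that would have been grown below
# a leaf containing n_samples_leaf training samples.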
depths += (
np.ravel(node_indicator.sum(axis=1))
+ _average_path_length(n_samples_leaf)
- 1.0
)
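# Anomaly score from Liu et al. (2008): s(x, psi) = 2 ** (-E[h(x)] / c(psi)),
# where E[h(x)] is the mean path length of x over the trees and c(psi) is
# the average path length of an unsuccessful search in a tree built on
# max_samples_ samples. Scores close to 1 flag anomalies; scores well
# below 0.5 flag normal samples.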
scores = 2 ** (
-depths
/ (len(self.estimators_)
* _average_path_length([self.max_samples_]))
)
return scores
def _average_path_length(n_samples_leaf):
"""
The average path length in an n_samples iTree, which is equal to
the average path length of an unsuccessful BST search since the
latter has the same structure as an isolation tree.
Parameters
----------
n_samples_leaf : array-like of shape (n_samples,)
The number of training samples in each test sample leaf, for
each estimator.
Returns
-------
average_path_length : ndarray of shape (n_samples,)
"""
n_samples_leaf = check_array(n_samples_leaf, ensure_2d=False)
n_samples_leaf_shape = n_samples_leaf.shape
n_samples_leaf = n_samples_leaf.reshape((1, -1))
average_path_length = np.zeros(n_samples_leaf.shape)
mask_1 = n_samples_leaf <= 1
mask_2 = n_samples_leaf == 2
not_mask = ~np.logical_or(mask_1, mask_2)
average_path_length[mask_1] = 0.
average_path_length[mask_2] = 1.
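# For n > 2, c(n) = 2 * H(n - 1) - 2 * (n - 1) / n, with
# H(i) ~= ln(i) + Euler's constant (e.g. c(256) is roughly 10.2).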
average_path_length[not_mask] = (
2.0 * (np.log(n_samples_leaf[not_mask] - 1.0) + np.euler_gamma)
- 2.0 * (n_samples_leaf[not_mask] - 1.0) / n_samples_leaf[not_mask]
)
return average_path_length.reshape(n_samples_leaf_shape)

View file

@ -0,0 +1,705 @@
"""Stacking classifier and regressor."""
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: BSD 3 clause
from abc import ABCMeta, abstractmethod
from copy import deepcopy
import numpy as np
from joblib import Parallel, delayed
import scipy.sparse as sparse
from ..base import clone
from ..base import ClassifierMixin, RegressorMixin, TransformerMixin
from ..base import is_classifier, is_regressor
from ..exceptions import NotFittedError
from ..utils._estimator_html_repr import _VisualBlock
from ._base import _fit_single_estimator
from ._base import _BaseHeterogeneousEnsemble
from ..linear_model import LogisticRegression
from ..linear_model import RidgeCV
from ..model_selection import cross_val_predict
from ..model_selection import check_cv
from ..preprocessing import LabelEncoder
from ..utils import Bunch
from ..utils.metaestimators import if_delegate_has_method
from ..utils.multiclass import check_classification_targets
from ..utils.validation import check_is_fitted
from ..utils.validation import column_or_1d
from ..utils.validation import _deprecate_positional_args
class _BaseStacking(TransformerMixin, _BaseHeterogeneousEnsemble,
metaclass=ABCMeta):
"""Base class for stacking method."""
@abstractmethod
def __init__(self, estimators, final_estimator=None, *, cv=None,
stack_method='auto', n_jobs=None, verbose=0,
passthrough=False):
super().__init__(estimators=estimators)
self.final_estimator = final_estimator
self.cv = cv
self.stack_method = stack_method
self.n_jobs = n_jobs
self.verbose = verbose
self.passthrough = passthrough
def _clone_final_estimator(self, default):
if self.final_estimator is not None:
self.final_estimator_ = clone(self.final_estimator)
else:
self.final_estimator_ = clone(default)
def _concatenate_predictions(self, X, predictions):
"""Concatenate the predictions of each first layer learner and
possibly the input dataset `X`.
If `X` is sparse and `self.passthrough` is False, the output of
`transform` will be dense (the predictions). If `X` is sparse
and `self.passthrough` is True, the output of `transform` will
be sparse.
This helper is in charge of ensuring the predictions are 2D arrays and
it will drop one of the probability columns when using probabilities
in the binary case, since p(y|c=0) = 1 - p(y|c=1).
"""
X_meta = []
for est_idx, preds in enumerate(predictions):
# case where the estimator returned a 1D array
if preds.ndim == 1:
X_meta.append(preds.reshape(-1, 1))
else:
if (self.stack_method_[est_idx] == 'predict_proba' and
len(self.classes_) == 2):
# Remove the first column when using probabilities in
# binary classification because both features are perfectly
# collinear.
X_meta.append(preds[:, 1:])
else:
X_meta.append(preds)
if self.passthrough:
X_meta.append(X)
if sparse.issparse(X):
return sparse.hstack(X_meta, format=X.format)
return np.hstack(X_meta)
@staticmethod
def _method_name(name, estimator, method):
if estimator == 'drop':
return None
if method == 'auto':
if getattr(estimator, 'predict_proba', None):
return 'predict_proba'
elif getattr(estimator, 'decision_function', None):
return 'decision_function'
else:
return 'predict'
else:
if not hasattr(estimator, method):
raise ValueError('Underlying estimator {} does not implement '
'the method {}.'.format(name, method))
return method
def fit(self, X, y, sample_weight=None):
"""Fit the estimators.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where `n_samples` is the number of samples and
`n_features` is the number of features.
y : array-like of shape (n_samples,)
Target values.
sample_weight : array-like of shape (n_samples,) or default=None
Sample weights. If None, then samples are equally weighted.
Note that this is supported only if all underlying estimators
support sample weights.
.. versionchanged:: 0.23
when not None, `sample_weight` is passed to all underlying
estimators
Returns
-------
self : object
"""
# all_estimators contains all estimators, both the ones to be fitted
# and the 'drop' placeholders.
names, all_estimators = self._validate_estimators()
self._validate_final_estimator()
stack_method = [self.stack_method] * len(all_estimators)
# Fit the base estimators on the whole training data. Those
# base estimators will be used in transform, predict, and
# predict_proba. They are exposed publicly.
self.estimators_ = Parallel(n_jobs=self.n_jobs)(
delayed(_fit_single_estimator)(clone(est), X, y, sample_weight)
for est in all_estimators if est != 'drop'
)
self.named_estimators_ = Bunch()
est_fitted_idx = 0
for name_est, org_est in zip(names, all_estimators):
if org_est != 'drop':
self.named_estimators_[name_est] = self.estimators_[
est_fitted_idx]
est_fitted_idx += 1
else:
self.named_estimators_[name_est] = 'drop'
# To train the meta-classifier using as much data as possible, we use
# cross-validation to obtain the output of the stacked estimators.
# To ensure that the data provided to each estimator are the same, we
# need to set the random state of the cv if there is one and we need to
# take a copy.
cv = check_cv(self.cv, y=y, classifier=is_classifier(self))
if hasattr(cv, 'random_state') and cv.random_state is None:
cv.random_state = np.random.RandomState()
self.stack_method_ = [
self._method_name(name, est, meth)
for name, est, meth in zip(names, all_estimators, stack_method)
]
fit_params = ({"sample_weight": sample_weight}
if sample_weight is not None
else None)
predictions = Parallel(n_jobs=self.n_jobs)(
delayed(cross_val_predict)(clone(est), X, y, cv=deepcopy(cv),
method=meth, n_jobs=self.n_jobs,
fit_params=fit_params,
verbose=self.verbose)
for est, meth in zip(all_estimators, self.stack_method_)
if est != 'drop'
)
# Only estimators that are not 'drop' are used in transform; the
# corresponding None entries are removed from stack_method_ as well.
self.stack_method_ = [
meth for (meth, est) in zip(self.stack_method_, all_estimators)
if est != 'drop'
]
X_meta = self._concatenate_predictions(X, predictions)
_fit_single_estimator(self.final_estimator_, X_meta, y,
sample_weight=sample_weight)
return self
@property
def n_features_in_(self):
"""Number of features seen during :term:`fit`."""
try:
check_is_fitted(self)
except NotFittedError as nfe:
raise AttributeError(
f"{self.__class__.__name__} object has no attribute "
f"n_features_in_") from nfe
return self.estimators_[0].n_features_in_
def _transform(self, X):
"""Concatenate and return the predictions of the estimators."""
check_is_fitted(self)
predictions = [
getattr(est, meth)(X)
for est, meth in zip(self.estimators_, self.stack_method_)
if est != 'drop'
]
return self._concatenate_predictions(X, predictions)
@if_delegate_has_method(delegate='final_estimator_')
def predict(self, X, **predict_params):
"""Predict target for X.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
**predict_params : dict of str -> obj
Parameters to the `predict` called by the `final_estimator`. Note
that this may be used to return uncertainties from some estimators
with `return_std` or `return_cov`. Be aware that it will only
account for uncertainty in the final estimator.
Returns
-------
y_pred : ndarray of shape (n_samples,) or (n_samples, n_output)
Predicted targets.
"""
check_is_fitted(self)
return self.final_estimator_.predict(
self.transform(X), **predict_params
)
def _sk_visual_block_(self, final_estimator):
names, estimators = zip(*self.estimators)
parallel = _VisualBlock('parallel', estimators, names=names,
dash_wrapped=False)
serial = _VisualBlock('serial', (parallel, final_estimator),
dash_wrapped=False)
return _VisualBlock('serial', [serial])
class StackingClassifier(ClassifierMixin, _BaseStacking):
"""Stack of estimators with a final classifier.
Stacked generalization consists in stacking the output of individual
estimators and using a classifier to compute the final prediction. Stacking
makes it possible to use the strength of each individual estimator by using
their output as the input of a final estimator.
Note that `estimators_` are fitted on the full `X` while `final_estimator_`
is trained using cross-validated predictions of the base estimators using
`cross_val_predict`.
.. versionadded:: 0.22
Read more in the :ref:`User Guide <stacking>`.
Parameters
----------
estimators : list of (str, estimator)
Base estimators which will be stacked together. Each element of the
list is defined as a tuple of string (i.e. name) and an estimator
instance. An estimator can be set to 'drop' using `set_params`.
final_estimator : estimator, default=None
A classifier which will be used to combine the base estimators.
The default classifier is a `LogisticRegression`.
cv : int, cross-validation generator or an iterable, default=None
Determines the cross-validation splitting strategy used in
`cross_val_predict` to train `final_estimator`. Possible inputs for
cv are:
* None, to use the default 5-fold cross validation,
* integer, to specify the number of folds in a (Stratified) KFold,
* An object to be used as a cross-validation generator,
* An iterable yielding train, test splits.
For integer/None inputs, if the estimator is a classifier and y is
either binary or multiclass, `StratifiedKFold` is used. In all other
cases, `KFold` is used.
Refer to the :ref:`User Guide <cross_validation>` for the various
cross-validation strategies that can be used here.
.. note::
A larger number of splits will provide no benefit if the number
of training samples is large enough; it will only increase the
training time. ``cv`` is not used for model evaluation but for
prediction.
stack_method : {'auto', 'predict_proba', 'decision_function', 'predict'}, \
default='auto'
Methods called for each base estimator. It can be:
* if 'auto', it will try to invoke, for each estimator,
`'predict_proba'`, `'decision_function'` or `'predict'` in that
order.
* otherwise, one of `'predict_proba'`, `'decision_function'` or
`'predict'`. If the method is not implemented by the estimator, it
will raise an error.
n_jobs : int, default=None
The number of jobs to run in parallel for `fit` of all `estimators`.
`None` means 1 unless in a `joblib.parallel_backend` context. -1 means
using all processors. See Glossary for more details.
passthrough : bool, default=False
When False, only the predictions of estimators will be used as
training data for `final_estimator`. When True, the
`final_estimator` is trained on the predictions as well as the
original training data.
verbose : int, default=0
Verbosity level.
Attributes
----------
classes_ : ndarray of shape (n_classes,)
Class labels.
estimators_ : list of estimators
The elements of the estimators parameter, having been fitted on the
training data. If an estimator has been set to `'drop'`, it
will not appear in `estimators_`.
named_estimators_ : :class:`~sklearn.utils.Bunch`
Attribute to access any fitted sub-estimators by name.
final_estimator_ : estimator
The classifier which predicts given the output of `estimators_`.
stack_method_ : list of str
The method used by each base estimator.
Notes
-----
When `predict_proba` is used by each estimator (i.e. most of the time for
`stack_method='auto'` or specifically for `stack_method='predict_proba'`),
the first column predicted by each estimator will be dropped in the case
of a binary classification problem, since both features would be perfectly
collinear.
References
----------
.. [1] Wolpert, David H. "Stacked generalization." Neural networks 5.2
(1992): 241-259.
Examples
--------
>>> from sklearn.datasets import load_iris
>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.svm import LinearSVC
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.preprocessing import StandardScaler
>>> from sklearn.pipeline import make_pipeline
>>> from sklearn.ensemble import StackingClassifier
>>> X, y = load_iris(return_X_y=True)
>>> estimators = [
... ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
... ('svr', make_pipeline(StandardScaler(),
... LinearSVC(random_state=42)))
... ]
>>> clf = StackingClassifier(
... estimators=estimators, final_estimator=LogisticRegression()
... )
>>> from sklearn.model_selection import train_test_split
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, stratify=y, random_state=42
... )
>>> clf.fit(X_train, y_train).score(X_test, y_test)
0.9...
"""
@_deprecate_positional_args
def __init__(self, estimators, final_estimator=None, *, cv=None,
stack_method='auto', n_jobs=None, passthrough=False,
verbose=0):
super().__init__(
estimators=estimators,
final_estimator=final_estimator,
cv=cv,
stack_method=stack_method,
n_jobs=n_jobs,
passthrough=passthrough,
verbose=verbose
)
def _validate_final_estimator(self):
self._clone_final_estimator(default=LogisticRegression())
if not is_classifier(self.final_estimator_):
raise ValueError(
"'final_estimator' parameter should be a classifier. Got {}"
.format(self.final_estimator_)
)
def fit(self, X, y, sample_weight=None):
"""Fit the estimators.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where `n_samples` is the number of samples and
`n_features` is the number of features.
y : array-like of shape (n_samples,)
Target values.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted.
Note that this is supported only if all underlying estimators
support sample weights.
Returns
-------
self : object
"""
check_classification_targets(y)
self._le = LabelEncoder().fit(y)
self.classes_ = self._le.classes_
return super().fit(X, self._le.transform(y), sample_weight)
@if_delegate_has_method(delegate='final_estimator_')
def predict(self, X, **predict_params):
"""Predict target for X.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
**predict_params : dict of str -> obj
Parameters to the `predict` called by the `final_estimator`. Note
that this may be used to return uncertainties from some estimators
with `return_std` or `return_cov`. Be aware that it will only
account for uncertainty in the final estimator.
Returns
-------
y_pred : ndarray of shape (n_samples,) or (n_samples, n_output)
Predicted targets.
"""
y_pred = super().predict(X, **predict_params)
return self._le.inverse_transform(y_pred)
@if_delegate_has_method(delegate='final_estimator_')
def predict_proba(self, X):
"""Predict class probabilities for X using
`final_estimator_.predict_proba`.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
Returns
-------
probabilities : ndarray of shape (n_samples, n_classes) or \
list of ndarray of shape (n_output,)
The class probabilities of the input samples.
"""
check_is_fitted(self)
return self.final_estimator_.predict_proba(self.transform(X))
@if_delegate_has_method(delegate='final_estimator_')
def decision_function(self, X):
"""Predict decision function for samples in X using
`final_estimator_.decision_function`.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
Returns
-------
decisions : ndarray of shape (n_samples,), (n_samples, n_classes), \
or (n_samples, n_classes * (n_classes-1) / 2)
The decision function computed the final estimator.
"""
check_is_fitted(self)
return self.final_estimator_.decision_function(self.transform(X))
def transform(self, X):
"""Return class labels or probabilities for X for each estimator.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where `n_samples` is the number of samples and
`n_features` is the number of features.
Returns
-------
y_preds : ndarray of shape (n_samples, n_estimators) or \
(n_samples, n_classes * n_estimators)
Prediction outputs for each estimator.
"""
return self._transform(X)
def _sk_visual_block_(self):
# If final_estimator's default changes then this should be
# updated.
if self.final_estimator is None:
final_estimator = LogisticRegression()
else:
final_estimator = self.final_estimator
return super()._sk_visual_block_(final_estimator)
class StackingRegressor(RegressorMixin, _BaseStacking):
"""Stack of estimators with a final regressor.
Stacked generalization consists in stacking the output of individual
estimators and using a regressor to compute the final prediction. Stacking
makes it possible to use the strength of each individual estimator by using
their output as the input of a final estimator.
Note that `estimators_` are fitted on the full `X` while `final_estimator_`
is trained using cross-validated predictions of the base estimators using
`cross_val_predict`.
.. versionadded:: 0.22
Read more in the :ref:`User Guide <stacking>`.
Parameters
----------
estimators : list of (str, estimator)
Base estimators which will be stacked together. Each element of the
list is defined as a tuple of string (i.e. name) and an estimator
instance. An estimator can be set to 'drop' using `set_params`.
final_estimator : estimator, default=None
A regressor which will be used to combine the base estimators.
The default regressor is a `RidgeCV`.
cv : int, cross-validation generator or an iterable, default=None
Determines the cross-validation splitting strategy used in
`cross_val_predict` to train `final_estimator`. Possible inputs for
cv are:
* None, to use the default 5-fold cross validation,
* integer, to specify the number of folds in a (Stratified) KFold,
* An object to be used as a cross-validation generator,
* An iterable yielding train, test splits.
For integer/None inputs, if the estimator is a classifier and y is
either binary or multiclass, `StratifiedKFold` is used. In all other
cases, `KFold` is used.
Refer to the :ref:`User Guide <cross_validation>` for the various
cross-validation strategies that can be used here.
.. note::
A larger number of splits will provide no benefit if the number
of training samples is large enough; it will only increase the
training time. ``cv`` is not used for model evaluation but for
prediction.
n_jobs : int, default=None
The number of jobs to run in parallel for `fit` of all `estimators`.
`None` means 1 unless in a `joblib.parallel_backend` context. -1 means
using all processors. See Glossary for more details.
passthrough : bool, default=False
When False, only the predictions of estimators will be used as
training data for `final_estimator`. When True, the
`final_estimator` is trained on the predictions as well as the
original training data.
verbose : int, default=0
Verbosity level.
Attributes
----------
estimators_ : list of estimator
The elements of the estimators parameter, having been fitted on the
training data. If an estimator has been set to `'drop'`, it
will not appear in `estimators_`.
named_estimators_ : :class:`~sklearn.utils.Bunch`
Attribute to access any fitted sub-estimators by name.
final_estimator_ : estimator
The fitted regressor which combines the predictions of the base estimators.
References
----------
.. [1] Wolpert, David H. "Stacked generalization." Neural networks 5.2
(1992): 241-259.
Examples
--------
>>> from sklearn.datasets import load_diabetes
>>> from sklearn.linear_model import RidgeCV
>>> from sklearn.svm import LinearSVR
>>> from sklearn.ensemble import RandomForestRegressor
>>> from sklearn.ensemble import StackingRegressor
>>> X, y = load_diabetes(return_X_y=True)
>>> estimators = [
... ('lr', RidgeCV()),
... ('svr', LinearSVR(random_state=42))
... ]
>>> reg = StackingRegressor(
... estimators=estimators,
... final_estimator=RandomForestRegressor(n_estimators=10,
... random_state=42)
... )
>>> from sklearn.model_selection import train_test_split
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, random_state=42
... )
>>> reg.fit(X_train, y_train).score(X_test, y_test)
0.3...
"""
@_deprecate_positional_args
def __init__(self, estimators, final_estimator=None, *, cv=None,
n_jobs=None, passthrough=False, verbose=0):
super().__init__(
estimators=estimators,
final_estimator=final_estimator,
cv=cv,
stack_method="predict",
n_jobs=n_jobs,
passthrough=passthrough,
verbose=verbose
)
def _validate_final_estimator(self):
self._clone_final_estimator(default=RidgeCV())
if not is_regressor(self.final_estimator_):
raise ValueError(
"'final_estimator' parameter should be a regressor. Got {}"
.format(self.final_estimator_)
)
def fit(self, X, y, sample_weight=None):
"""Fit the estimators.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
y : array-like of shape (n_samples,)
Target values.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted.
Note that this is supported only if all underlying estimators
support sample weights.
Returns
-------
self : object
"""
y = column_or_1d(y, warn=True)
return super().fit(X, y, sample_weight)
def transform(self, X):
"""Return the predictions for X for each estimator.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where `n_samples` is the number of samples and
`n_features` is the number of features.
Returns
-------
y_preds : ndarray of shape (n_samples, n_estimators)
Prediction outputs for each estimator.
"""
return self._transform(X)
def _sk_visual_block_(self):
# If final_estimator's default changes then this should be
# updated.
if self.final_estimator is None:
final_estimator = RidgeCV()
else:
final_estimator = self.final_estimator
return super()._sk_visual_block_(final_estimator)
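# Illustrative sketch (not part of this module): `transform` exposes the
# per-estimator predictions that feed `final_estimator_`, one column per base
# estimator that is not dropped. The snippet reuses the public imports shown
# in the class docstring above.
#
#   from sklearn.datasets import load_diabetes
#   from sklearn.linear_model import RidgeCV
#   from sklearn.svm import LinearSVR
#   from sklearn.ensemble import StackingRegressor
#
#   X, y = load_diabetes(return_X_y=True)
#   reg = StackingRegressor(
#       estimators=[('ridge', RidgeCV()),
#                   ('svr', LinearSVR(random_state=42))]
#   ).fit(X, y)
#   reg.transform(X).shape   # (n_samples, 2): one prediction column per estimator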

View file

@ -0,0 +1,495 @@
"""
Soft Voting/Majority Rule classifier and Voting regressor.
This module contains:
- A Soft Voting/Majority Rule classifier for classification estimators.
- A Voting regressor for regression estimators.
"""
# Authors: Sebastian Raschka <se.raschka@gmail.com>,
# Gilles Louppe <g.louppe@gmail.com>,
# Ramil Nugmanov <stsouko@live.ru>
# Mohamed Ali Jamaoui <m.ali.jamaoui@gmail.com>
#
# License: BSD 3 clause
from abc import abstractmethod
import numpy as np
from joblib import Parallel, delayed
from ..base import ClassifierMixin
from ..base import RegressorMixin
from ..base import TransformerMixin
from ..base import clone
from ._base import _fit_single_estimator
from ._base import _BaseHeterogeneousEnsemble
from ..preprocessing import LabelEncoder
from ..utils import Bunch
from ..utils.validation import check_is_fitted
from ..utils.multiclass import check_classification_targets
from ..utils.validation import column_or_1d
from ..utils.validation import _deprecate_positional_args
from ..exceptions import NotFittedError
from ..utils._estimator_html_repr import _VisualBlock
class _BaseVoting(TransformerMixin, _BaseHeterogeneousEnsemble):
"""Base class for voting.
Warning: This class should not be used directly. Use derived classes
instead.
"""
def _log_message(self, name, idx, total):
if not self.verbose:
return None
return '(%d of %d) Processing %s' % (idx, total, name)
@property
def _weights_not_none(self):
"""Get the weights of not `None` estimators."""
if self.weights is None:
return None
return [w for est, w in zip(self.estimators, self.weights)
if est[1] not in (None, 'drop')]
def _predict(self, X):
"""Collect results from clf.predict calls."""
return np.asarray([est.predict(X) for est in self.estimators_]).T
@abstractmethod
def fit(self, X, y, sample_weight=None):
"""Get common fit operations."""
names, clfs = self._validate_estimators()
if (self.weights is not None and
len(self.weights) != len(self.estimators)):
raise ValueError('Number of `estimators` and weights must be equal'
'; got %d weights, %d estimators'
% (len(self.weights), len(self.estimators)))
self.estimators_ = Parallel(n_jobs=self.n_jobs)(
delayed(_fit_single_estimator)(
clone(clf), X, y,
sample_weight=sample_weight,
message_clsname='Voting',
message=self._log_message(names[idx],
idx + 1, len(clfs))
)
for idx, clf in enumerate(clfs) if clf not in (None, 'drop')
)
self.named_estimators_ = Bunch()
# Uses None or 'drop' as placeholder for dropped estimators
est_iter = iter(self.estimators_)
for name, est in self.estimators:
current_est = est if est in (None, 'drop') else next(est_iter)
self.named_estimators_[name] = current_est
return self
@property
def n_features_in_(self):
# For consistency with other estimators we raise an AttributeError so
# that hasattr() fails if the estimator isn't fitted.
try:
check_is_fitted(self)
except NotFittedError as nfe:
raise AttributeError(
"{} object has no n_features_in_ attribute."
.format(self.__class__.__name__)
) from nfe
return self.estimators_[0].n_features_in_
def _sk_visual_block_(self):
names, estimators = zip(*self.estimators)
return _VisualBlock('parallel', estimators, names=names)
class VotingClassifier(ClassifierMixin, _BaseVoting):
"""Soft Voting/Majority Rule classifier for unfitted estimators.
.. versionadded:: 0.17
Read more in the :ref:`User Guide <voting_classifier>`.
Parameters
----------
estimators : list of (str, estimator) tuples
Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones
of those original estimators that will be stored in the class attribute
``self.estimators_``. An estimator can be set to ``'drop'``
using ``set_params``.
.. versionchanged:: 0.21
``'drop'`` is accepted.
.. deprecated:: 0.22
Using ``None`` to drop an estimator is deprecated in 0.22 and
support will be dropped in 0.24. Use the string ``'drop'`` instead.
voting : {'hard', 'soft'}, default='hard'
If 'hard', uses predicted class labels for majority rule voting.
Else if 'soft', predicts the class label based on the argmax of
the sums of the predicted probabilities, which is recommended for
an ensemble of well-calibrated classifiers.
weights : array-like of shape (n_classifiers,), default=None
Sequence of weights (`float` or `int`) to weight the occurrences of
predicted class labels (`hard` voting) or class probabilities
before averaging (`soft` voting). Uses uniform weights if `None`.
n_jobs : int, default=None
The number of jobs to run in parallel for ``fit``.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
.. versionadded:: 0.18
flatten_transform : bool, default=True
Affects shape of transform output only when voting='soft'.
If voting='soft' and flatten_transform=True, transform method returns
matrix with shape (n_samples, n_classifiers * n_classes). If
flatten_transform=False, it returns
(n_classifiers, n_samples, n_classes).
verbose : bool, default=False
If True, the time elapsed while fitting will be printed as it
is completed.
Attributes
----------
estimators_ : list of classifiers
The collection of fitted sub-estimators as defined in ``estimators``
that are not 'drop'.
named_estimators_ : :class:`~sklearn.utils.Bunch`
Attribute to access any fitted sub-estimators by name.
.. versionadded:: 0.20
classes_ : array-like of shape (n_predictions,)
The class labels.
See Also
--------
VotingRegressor: Prediction voting regressor.
Examples
--------
>>> import numpy as np
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.naive_bayes import GaussianNB
>>> from sklearn.ensemble import RandomForestClassifier, VotingClassifier
>>> clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
>>> clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
>>> clf3 = GaussianNB()
>>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
>>> y = np.array([1, 1, 1, 2, 2, 2])
>>> eclf1 = VotingClassifier(estimators=[
... ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')
>>> eclf1 = eclf1.fit(X, y)
>>> print(eclf1.predict(X))
[1 1 1 2 2 2]
>>> np.array_equal(eclf1.named_estimators_.lr.predict(X),
... eclf1.named_estimators_['lr'].predict(X))
True
>>> eclf2 = VotingClassifier(estimators=[
... ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
... voting='soft')
>>> eclf2 = eclf2.fit(X, y)
>>> print(eclf2.predict(X))
[1 1 1 2 2 2]
>>> eclf3 = VotingClassifier(estimators=[
... ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
... voting='soft', weights=[2,1,1],
... flatten_transform=True)
>>> eclf3 = eclf3.fit(X, y)
>>> print(eclf3.predict(X))
[1 1 1 2 2 2]
>>> print(eclf3.transform(X).shape)
(6, 6)
"""
@_deprecate_positional_args
def __init__(self, estimators, *, voting='hard', weights=None,
n_jobs=None, flatten_transform=True, verbose=False):
super().__init__(estimators=estimators)
self.voting = voting
self.weights = weights
self.n_jobs = n_jobs
self.flatten_transform = flatten_transform
self.verbose = verbose
def fit(self, X, y, sample_weight=None):
"""Fit the estimators.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
y : array-like of shape (n_samples,)
Target values.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted.
Note that this is supported only if all underlying estimators
support sample weights.
.. versionadded:: 0.18
Returns
-------
self : object
"""
check_classification_targets(y)
if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
raise NotImplementedError('Multilabel and multi-output'
' classification is not supported.')
if self.voting not in ('soft', 'hard'):
raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
% self.voting)
self.le_ = LabelEncoder().fit(y)
self.classes_ = self.le_.classes_
transformed_y = self.le_.transform(y)
return super().fit(X, transformed_y, sample_weight)
def predict(self, X):
"""Predict class labels for X.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples.
Returns
-------
maj : array-like of shape (n_samples,)
Predicted class labels.
"""
check_is_fitted(self)
if self.voting == 'soft':
maj = np.argmax(self.predict_proba(X), axis=1)
else: # 'hard' voting
predictions = self._predict(X)
maj = np.apply_along_axis(
lambda x: np.argmax(
np.bincount(x, weights=self._weights_not_none)),
axis=1, arr=predictions)
maj = self.le_.inverse_transform(maj)
return maj
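# Illustrative sketch (not part of this module) of the weighted hard-voting
# rule used in `predict` above: with encoded votes [0, 1, 1] from three
# classifiers and weights [3, 1, 1], `np.bincount` sums the weights per label
# and the weighted vote overrides the 2-out-of-3 majority.
#
#   import numpy as np
#   votes = np.array([0, 1, 1])
#   weights = np.array([3.0, 1.0, 1.0])
#   np.bincount(votes, weights=weights)              # array([3., 2.])
#   np.argmax(np.bincount(votes, weights=weights))   # 0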
def _collect_probas(self, X):
"""Collect results from clf.predict calls."""
return np.asarray([clf.predict_proba(X) for clf in self.estimators_])
def _predict_proba(self, X):
"""Predict class probabilities for X in 'soft' voting."""
check_is_fitted(self)
avg = np.average(self._collect_probas(X), axis=0,
weights=self._weights_not_none)
return avg
@property
def predict_proba(self):
"""Compute probabilities of possible outcomes for samples in X.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples.
Returns
-------
avg : array-like of shape (n_samples, n_classes)
Weighted average probability for each class per sample.
"""
if self.voting == 'hard':
raise AttributeError("predict_proba is not available when"
" voting=%r" % self.voting)
return self._predict_proba
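# Illustrative sketch (not part of this module) of the soft-voting average
# computed in `_predict_proba` above: per-classifier probabilities are
# averaged over the classifier axis with the per-classifier weights.
#
#   import numpy as np
#   probas = np.array([[[0.9, 0.1]],    # classifier 1, one sample
#                      [[0.4, 0.6]]])   # classifier 2, same sample
#   np.average(probas, axis=0, weights=[3, 1])   # array([[0.775, 0.225]])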
def transform(self, X):
"""Return class labels or probabilities for X for each estimator.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
Returns
-------
probabilities_or_labels
If `voting='soft'` and `flatten_transform=True`:
returns ndarray of shape (n_samples, n_classifiers *
n_classes), being class probabilities calculated by each
classifier.
If `voting='soft'` and `flatten_transform=False`:
ndarray of shape (n_classifiers, n_samples, n_classes)
If `voting='hard'`:
ndarray of shape (n_samples, n_classifiers), being
class labels predicted by each classifier.
"""
check_is_fitted(self)
if self.voting == 'soft':
probas = self._collect_probas(X)
if not self.flatten_transform:
return probas
return np.hstack(probas)
else:
return self._predict(X)
class VotingRegressor(RegressorMixin, _BaseVoting):
"""Prediction voting regressor for unfitted estimators.
.. versionadded:: 0.21
A voting regressor is an ensemble meta-estimator that fits several base
regressors, each on the whole dataset. Then it averages the individual
predictions to form a final prediction.
Read more in the :ref:`User Guide <voting_regressor>`.
Parameters
----------
estimators : list of (str, estimator) tuples
Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones
of those original estimators that will be stored in the class attribute
``self.estimators_``. An estimator can be set to ``'drop'`` using
``set_params``.
.. versionchanged:: 0.21
``'drop'`` is accepted.
.. deprecated:: 0.22
Using ``None`` to drop an estimator is deprecated in 0.22 and
support will be dropped in 0.24. Use the string ``'drop'`` instead.
weights : array-like of shape (n_regressors,), default=None
Sequence of weights (`float` or `int`) to weight the occurrences of
predicted values before averaging. Uses uniform weights if `None`.
n_jobs : int, default=None
The number of jobs to run in parallel for ``fit``.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
verbose : bool, default=False
If True, the time elapsed while fitting will be printed as it
is completed.
Attributes
----------
estimators_ : list of regressors
The collection of fitted sub-estimators as defined in ``estimators``
that are not 'drop'.
named_estimators_ : Bunch
Attribute to access any fitted sub-estimators by name.
.. versionadded:: 0.20
See Also
--------
VotingClassifier: Soft Voting/Majority Rule classifier.
Examples
--------
>>> import numpy as np
>>> from sklearn.linear_model import LinearRegression
>>> from sklearn.ensemble import RandomForestRegressor
>>> from sklearn.ensemble import VotingRegressor
>>> r1 = LinearRegression()
>>> r2 = RandomForestRegressor(n_estimators=10, random_state=1)
>>> X = np.array([[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36]])
>>> y = np.array([2, 6, 12, 20, 30, 42])
>>> er = VotingRegressor([('lr', r1), ('rf', r2)])
>>> print(er.fit(X, y).predict(X))
[ 3.3 5.7 11.8 19.7 28. 40.3]
"""
@_deprecate_positional_args
def __init__(self, estimators, *, weights=None, n_jobs=None,
verbose=False):
super().__init__(estimators=estimators)
self.weights = weights
self.n_jobs = n_jobs
self.verbose = verbose
def fit(self, X, y, sample_weight=None):
"""Fit the estimators.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
y : array-like of shape (n_samples,)
Target values.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted.
Note that this is supported only if all underlying estimators
support sample weights.
Returns
-------
self : object
Fitted estimator.
"""
y = column_or_1d(y, warn=True)
return super().fit(X, y, sample_weight)
def predict(self, X):
"""Predict regression target for X.
The predicted regression target of an input sample is computed as the
mean predicted regression targets of the estimators in the ensemble.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples.
Returns
-------
y : ndarray of shape (n_samples,)
The predicted values.
"""
check_is_fitted(self)
return np.average(self._predict(X), axis=1,
weights=self._weights_not_none)
def transform(self, X):
"""Return predictions for X for each estimator.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples.
Returns
-------
predictions : ndarray of shape (n_samples, n_regressors)
Values predicted by each regressor.
"""
check_is_fitted(self)
return self._predict(X)
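# Illustrative sketch (not part of this module) of the averaging rule used by
# `VotingRegressor.predict` above: per-estimator predictions are combined
# with a weighted mean along the estimator axis.
#
#   import numpy as np
#   per_estimator = np.array([[10.0, 20.0]])            # (n_samples, n_estimators)
#   np.average(per_estimator, axis=1, weights=[3, 1])   # array([12.5])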

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _bagging # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.ensemble.bagging'
correct_import_path = 'sklearn.ensemble'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_bagging, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)
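# Illustrative note (not part of the generated file): importing through the
# deprecated path keeps working because attribute access is forwarded to the
# private module, but it emits a deprecation warning pointing to the public
# location (the helper above skips the warning when running under pytest).
#
#   from sklearn.ensemble.bagging import BaggingClassifier   # warns
#   from sklearn.ensemble import BaggingClassifier           # preferred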

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _base # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.ensemble.base'
correct_import_path = 'sklearn.ensemble'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_base, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _forest # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.ensemble.forest'
correct_import_path = 'sklearn.ensemble'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_forest, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _gb # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.ensemble.gradient_boosting'
correct_import_path = 'sklearn.ensemble'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_gb, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _iforest # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.ensemble.iforest'
correct_import_path = 'sklearn.ensemble'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_iforest, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,54 @@
import numpy
from numpy.distutils.misc_util import Configuration
def configuration(parent_package="", top_path=None):
config = Configuration("ensemble", parent_package, top_path)
config.add_extension("_gradient_boosting",
sources=["_gradient_boosting.pyx"],
include_dirs=[numpy.get_include()])
config.add_subpackage("tests")
# Histogram-based gradient boosting files
config.add_extension(
"_hist_gradient_boosting._gradient_boosting",
sources=["_hist_gradient_boosting/_gradient_boosting.pyx"],
include_dirs=[numpy.get_include()])
config.add_extension("_hist_gradient_boosting.histogram",
sources=["_hist_gradient_boosting/histogram.pyx"],
include_dirs=[numpy.get_include()])
config.add_extension("_hist_gradient_boosting.splitting",
sources=["_hist_gradient_boosting/splitting.pyx"],
include_dirs=[numpy.get_include()])
config.add_extension("_hist_gradient_boosting._binning",
sources=["_hist_gradient_boosting/_binning.pyx"],
include_dirs=[numpy.get_include()])
config.add_extension("_hist_gradient_boosting._predictor",
sources=["_hist_gradient_boosting/_predictor.pyx"],
include_dirs=[numpy.get_include()])
config.add_extension("_hist_gradient_boosting._loss",
sources=["_hist_gradient_boosting/_loss.pyx"],
include_dirs=[numpy.get_include()])
config.add_extension("_hist_gradient_boosting.common",
sources=["_hist_gradient_boosting/common.pyx"],
include_dirs=[numpy.get_include()])
config.add_extension("_hist_gradient_boosting.utils",
sources=["_hist_gradient_boosting/utils.pyx"],
include_dirs=[numpy.get_include()])
config.add_subpackage("_hist_gradient_boosting.tests")
return config
if __name__ == "__main__":
from numpy.distutils.core import setup
setup(**configuration().todict())
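# Hedged note (assumption about the surrounding build tooling, not part of
# this file): this `configuration` is collected by the parent package's
# numpy.distutils-based setup, which compiles the listed Cython (.pyx)
# extensions, e.g. with an in-place build from the project root:
#
#   python setup.py build_ext --inplace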

View file

@ -0,0 +1,902 @@
"""
Testing for the bagging ensemble module (sklearn.ensemble.bagging).
"""
# Author: Gilles Louppe
# License: BSD 3 clause
import numpy as np
import joblib
from sklearn.base import BaseEstimator
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_raises
from sklearn.utils._testing import assert_warns
from sklearn.utils._testing import assert_warns_message
from sklearn.utils._testing import assert_raise_message
from sklearn.utils._testing import ignore_warnings
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, SVR
from sklearn.random_projection import SparseRandomProjection
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2
from sklearn.utils import check_random_state
from sklearn.preprocessing import FunctionTransformer
from scipy.sparse import csc_matrix, csr_matrix
rng = check_random_state(0)
# also load the iris dataset
# and randomly permute it
iris = load_iris()
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]
# also load the diabetes dataset
# and randomly permute it
diabetes = load_diabetes()
perm = rng.permutation(diabetes.target.size)
diabetes.data = diabetes.data[perm]
diabetes.target = diabetes.target[perm]
# TODO: Remove in 0.24 when DummyClassifier's `strategy` default updates
@ignore_warnings(category=FutureWarning)
def test_classification():
# Check classification for various parameter settings.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(iris.data,
iris.target,
random_state=rng)
grid = ParameterGrid({"max_samples": [0.5, 1.0],
"max_features": [1, 2, 4],
"bootstrap": [True, False],
"bootstrap_features": [True, False]})
for base_estimator in [None,
DummyClassifier(),
Perceptron(),
DecisionTreeClassifier(),
KNeighborsClassifier(),
SVC()]:
for params in grid:
BaggingClassifier(base_estimator=base_estimator,
random_state=rng,
**params).fit(X_train, y_train).predict(X_test)
def test_sparse_classification():
# Check classification for various parameter settings on sparse input.
class CustomSVC(SVC):
"""SVC variant that records the nature of the training set"""
def fit(self, X, y):
super().fit(X, y)
self.data_type_ = type(X)
return self
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(iris.data,
iris.target,
random_state=rng)
parameter_sets = [
{"max_samples": 0.5,
"max_features": 2,
"bootstrap": True,
"bootstrap_features": True},
{"max_samples": 1.0,
"max_features": 4,
"bootstrap": True,
"bootstrap_features": True},
{"max_features": 2,
"bootstrap": False,
"bootstrap_features": True},
{"max_samples": 0.5,
"bootstrap": True,
"bootstrap_features": False},
]
for sparse_format in [csc_matrix, csr_matrix]:
X_train_sparse = sparse_format(X_train)
X_test_sparse = sparse_format(X_test)
for params in parameter_sets:
for f in ['predict', 'predict_proba', 'predict_log_proba', 'decision_function']:
# Trained on sparse format
sparse_classifier = BaggingClassifier(
base_estimator=CustomSVC(decision_function_shape='ovr'),
random_state=1,
**params
).fit(X_train_sparse, y_train)
sparse_results = getattr(sparse_classifier, f)(X_test_sparse)
# Trained on dense format
dense_classifier = BaggingClassifier(
base_estimator=CustomSVC(decision_function_shape='ovr'),
random_state=1,
**params
).fit(X_train, y_train)
dense_results = getattr(dense_classifier, f)(X_test)
assert_array_almost_equal(sparse_results, dense_results)
sparse_type = type(X_train_sparse)
types = [i.data_type_ for i in sparse_classifier.estimators_]
assert all([t == sparse_type for t in types])
def test_regression():
# Check regression for various parameter settings.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50],
diabetes.target[:50],
random_state=rng)
grid = ParameterGrid({"max_samples": [0.5, 1.0],
"max_features": [0.5, 1.0],
"bootstrap": [True, False],
"bootstrap_features": [True, False]})
for base_estimator in [None,
DummyRegressor(),
DecisionTreeRegressor(),
KNeighborsRegressor(),
SVR()]:
for params in grid:
BaggingRegressor(base_estimator=base_estimator,
random_state=rng,
**params).fit(X_train, y_train).predict(X_test)
def test_sparse_regression():
# Check regression for various parameter settings on sparse input.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50],
diabetes.target[:50],
random_state=rng)
class CustomSVR(SVR):
"""SVC variant that records the nature of the training set"""
def fit(self, X, y):
super().fit(X, y)
self.data_type_ = type(X)
return self
parameter_sets = [
{"max_samples": 0.5,
"max_features": 2,
"bootstrap": True,
"bootstrap_features": True},
{"max_samples": 1.0,
"max_features": 4,
"bootstrap": True,
"bootstrap_features": True},
{"max_features": 2,
"bootstrap": False,
"bootstrap_features": True},
{"max_samples": 0.5,
"bootstrap": True,
"bootstrap_features": False},
]
for sparse_format in [csc_matrix, csr_matrix]:
X_train_sparse = sparse_format(X_train)
X_test_sparse = sparse_format(X_test)
for params in parameter_sets:
# Trained on sparse format
sparse_classifier = BaggingRegressor(
base_estimator=CustomSVR(),
random_state=1,
**params
).fit(X_train_sparse, y_train)
sparse_results = sparse_classifier.predict(X_test_sparse)
# Trained on dense format
dense_results = BaggingRegressor(
base_estimator=CustomSVR(),
random_state=1,
**params
).fit(X_train, y_train).predict(X_test)
sparse_type = type(X_train_sparse)
types = [i.data_type_ for i in sparse_classifier.estimators_]
assert_array_almost_equal(sparse_results, dense_results)
assert all([t == sparse_type for t in types])
class DummySizeEstimator(BaseEstimator):
def fit(self, X, y):
self.training_size_ = X.shape[0]
self.training_hash_ = joblib.hash(X)
def test_bootstrap_samples():
# Test that bootstrapping samples generate non-perfect base estimators.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
diabetes.target,
random_state=rng)
base_estimator = DecisionTreeRegressor().fit(X_train, y_train)
# without bootstrap, all trees are perfect on the training set
ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
max_samples=1.0,
bootstrap=False,
random_state=rng).fit(X_train, y_train)
assert (base_estimator.score(X_train, y_train) ==
ensemble.score(X_train, y_train))
# with bootstrap, trees are no longer perfect on the training set
ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
max_samples=1.0,
bootstrap=True,
random_state=rng).fit(X_train, y_train)
assert (base_estimator.score(X_train, y_train) >
ensemble.score(X_train, y_train))
# check that each sampling corresponds to a complete bootstrap resample.
# the size of each bootstrap should be the same as the input data but
# the data should be different (checked using the hash of the data).
ensemble = BaggingRegressor(base_estimator=DummySizeEstimator(),
bootstrap=True).fit(X_train, y_train)
training_hash = []
for estimator in ensemble.estimators_:
assert estimator.training_size_ == X_train.shape[0]
training_hash.append(estimator.training_hash_)
assert len(set(training_hash)) == len(training_hash)
def test_bootstrap_features():
# Test that bootstrapping features may generate duplicate features.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
diabetes.target,
random_state=rng)
ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
max_features=1.0,
bootstrap_features=False,
random_state=rng).fit(X_train, y_train)
for features in ensemble.estimators_features_:
assert diabetes.data.shape[1] == np.unique(features).shape[0]
ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
max_features=1.0,
bootstrap_features=True,
random_state=rng).fit(X_train, y_train)
for features in ensemble.estimators_features_:
assert diabetes.data.shape[1] > np.unique(features).shape[0]
def test_probability():
# Predict probabilities.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(iris.data,
iris.target,
random_state=rng)
with np.errstate(divide="ignore", invalid="ignore"):
# Normal case
ensemble = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
random_state=rng).fit(X_train, y_train)
assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
axis=1),
np.ones(len(X_test)))
assert_array_almost_equal(ensemble.predict_proba(X_test),
np.exp(ensemble.predict_log_proba(X_test)))
# Degenerate case, where some classes are missing
ensemble = BaggingClassifier(base_estimator=LogisticRegression(),
random_state=rng,
max_samples=5).fit(X_train, y_train)
assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
axis=1),
np.ones(len(X_test)))
assert_array_almost_equal(ensemble.predict_proba(X_test),
np.exp(ensemble.predict_log_proba(X_test)))
def test_oob_score_classification():
# Check that oob prediction is a good estimation of the generalization
# error.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(iris.data,
iris.target,
random_state=rng)
for base_estimator in [DecisionTreeClassifier(), SVC()]:
clf = BaggingClassifier(base_estimator=base_estimator,
n_estimators=100,
bootstrap=True,
oob_score=True,
random_state=rng).fit(X_train, y_train)
test_score = clf.score(X_test, y_test)
assert abs(test_score - clf.oob_score_) < 0.1
# Test with few estimators
assert_warns(UserWarning,
BaggingClassifier(base_estimator=base_estimator,
n_estimators=1,
bootstrap=True,
oob_score=True,
random_state=rng).fit,
X_train,
y_train)
def test_oob_score_regression():
# Check that oob prediction is a good estimation of the generalization
# error.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
diabetes.target,
random_state=rng)
clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
n_estimators=50,
bootstrap=True,
oob_score=True,
random_state=rng).fit(X_train, y_train)
test_score = clf.score(X_test, y_test)
assert abs(test_score - clf.oob_score_) < 0.1
# Test with few estimators
assert_warns(UserWarning,
BaggingRegressor(base_estimator=DecisionTreeRegressor(),
n_estimators=1,
bootstrap=True,
oob_score=True,
random_state=rng).fit,
X_train,
y_train)
def test_single_estimator():
# Check singleton ensembles.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
diabetes.target,
random_state=rng)
clf1 = BaggingRegressor(base_estimator=KNeighborsRegressor(),
n_estimators=1,
bootstrap=False,
bootstrap_features=False,
random_state=rng).fit(X_train, y_train)
clf2 = KNeighborsRegressor().fit(X_train, y_train)
assert_array_almost_equal(clf1.predict(X_test), clf2.predict(X_test))
def test_error():
# Test that it gives proper exception on deficient input.
X, y = iris.data, iris.target
base = DecisionTreeClassifier()
# Test max_samples
assert_raises(ValueError,
BaggingClassifier(base, max_samples=-1).fit, X, y)
assert_raises(ValueError,
BaggingClassifier(base, max_samples=0.0).fit, X, y)
assert_raises(ValueError,
BaggingClassifier(base, max_samples=2.0).fit, X, y)
assert_raises(ValueError,
BaggingClassifier(base, max_samples=1000).fit, X, y)
assert_raises(ValueError,
BaggingClassifier(base, max_samples="foobar").fit, X, y)
# Test max_features
assert_raises(ValueError,
BaggingClassifier(base, max_features=-1).fit, X, y)
assert_raises(ValueError,
BaggingClassifier(base, max_features=0.0).fit, X, y)
assert_raises(ValueError,
BaggingClassifier(base, max_features=2.0).fit, X, y)
assert_raises(ValueError,
BaggingClassifier(base, max_features=5).fit, X, y)
assert_raises(ValueError,
BaggingClassifier(base, max_features="foobar").fit, X, y)
# Test support of decision_function
assert not hasattr(BaggingClassifier(base).fit(X, y), 'decision_function')
def test_parallel_classification():
# Check parallel classification.
rng = check_random_state(0)
# Classification
X_train, X_test, y_train, y_test = train_test_split(iris.data,
iris.target,
random_state=rng)
ensemble = BaggingClassifier(DecisionTreeClassifier(),
n_jobs=3,
random_state=0).fit(X_train, y_train)
# predict_proba
ensemble.set_params(n_jobs=1)
y1 = ensemble.predict_proba(X_test)
ensemble.set_params(n_jobs=2)
y2 = ensemble.predict_proba(X_test)
assert_array_almost_equal(y1, y2)
ensemble = BaggingClassifier(DecisionTreeClassifier(),
n_jobs=1,
random_state=0).fit(X_train, y_train)
y3 = ensemble.predict_proba(X_test)
assert_array_almost_equal(y1, y3)
# decision_function
ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'),
n_jobs=3,
random_state=0).fit(X_train, y_train)
ensemble.set_params(n_jobs=1)
decisions1 = ensemble.decision_function(X_test)
ensemble.set_params(n_jobs=2)
decisions2 = ensemble.decision_function(X_test)
assert_array_almost_equal(decisions1, decisions2)
X_err = np.hstack((X_test, np.zeros((X_test.shape[0], 1))))
assert_raise_message(ValueError, "Number of features of the model "
"must match the input. Model n_features is {0} "
"and input n_features is {1} "
"".format(X_test.shape[1], X_err.shape[1]),
ensemble.decision_function, X_err)
ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'),
n_jobs=1,
random_state=0).fit(X_train, y_train)
decisions3 = ensemble.decision_function(X_test)
assert_array_almost_equal(decisions1, decisions3)
def test_parallel_regression():
# Check parallel regression.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
diabetes.target,
random_state=rng)
ensemble = BaggingRegressor(DecisionTreeRegressor(),
n_jobs=3,
random_state=0).fit(X_train, y_train)
ensemble.set_params(n_jobs=1)
y1 = ensemble.predict(X_test)
ensemble.set_params(n_jobs=2)
y2 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y2)
ensemble = BaggingRegressor(DecisionTreeRegressor(),
n_jobs=1,
random_state=0).fit(X_train, y_train)
y3 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y3)
def test_gridsearch():
# Check that bagging ensembles can be grid-searched.
# Transform iris into a binary classification task
X, y = iris.data, iris.target
y[y == 2] = 1
# Grid search with scoring based on decision_function
parameters = {'n_estimators': (1, 2),
'base_estimator__C': (1, 2)}
GridSearchCV(BaggingClassifier(SVC()),
parameters,
scoring="roc_auc").fit(X, y)
def test_base_estimator():
# Check base_estimator and its default values.
rng = check_random_state(0)
# Classification
X_train, X_test, y_train, y_test = train_test_split(iris.data,
iris.target,
random_state=rng)
ensemble = BaggingClassifier(None,
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert isinstance(ensemble.base_estimator_, DecisionTreeClassifier)
ensemble = BaggingClassifier(DecisionTreeClassifier(),
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert isinstance(ensemble.base_estimator_, DecisionTreeClassifier)
ensemble = BaggingClassifier(Perceptron(),
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert isinstance(ensemble.base_estimator_, Perceptron)
# Regression
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
diabetes.target,
random_state=rng)
ensemble = BaggingRegressor(None,
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert isinstance(ensemble.base_estimator_, DecisionTreeRegressor)
ensemble = BaggingRegressor(DecisionTreeRegressor(),
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert isinstance(ensemble.base_estimator_, DecisionTreeRegressor)
ensemble = BaggingRegressor(SVR(),
n_jobs=3,
random_state=0).fit(X_train, y_train)
assert isinstance(ensemble.base_estimator_, SVR)
def test_bagging_with_pipeline():
estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
DecisionTreeClassifier()),
max_features=2)
estimator.fit(iris.data, iris.target)
assert isinstance(estimator[0].steps[-1][1].random_state, int)
class DummyZeroEstimator(BaseEstimator):
def fit(self, X, y):
self.classes_ = np.unique(y)
return self
def predict(self, X):
return self.classes_[np.zeros(X.shape[0], dtype=int)]
def test_bagging_sample_weight_unsupported_but_passed():
estimator = BaggingClassifier(DummyZeroEstimator())
rng = check_random_state(0)
estimator.fit(iris.data, iris.target).predict(iris.data)
assert_raises(ValueError, estimator.fit, iris.data, iris.target,
sample_weight=rng.randint(10, size=(iris.data.shape[0])))
def test_warm_start(random_state=42):
# Test if fitting incrementally with warm start gives an ensemble of the
# right size and the same results as a normal fit.
X, y = make_hastie_10_2(n_samples=20, random_state=1)
clf_ws = None
for n_estimators in [5, 10]:
if clf_ws is None:
clf_ws = BaggingClassifier(n_estimators=n_estimators,
random_state=random_state,
warm_start=True)
else:
clf_ws.set_params(n_estimators=n_estimators)
clf_ws.fit(X, y)
assert len(clf_ws) == n_estimators
clf_no_ws = BaggingClassifier(n_estimators=10, random_state=random_state,
warm_start=False)
clf_no_ws.fit(X, y)
assert (set([tree.random_state for tree in clf_ws]) ==
set([tree.random_state for tree in clf_no_ws]))
def test_warm_start_smaller_n_estimators():
# Test if a warm-started second fit with smaller n_estimators raises an error.
X, y = make_hastie_10_2(n_samples=20, random_state=1)
clf = BaggingClassifier(n_estimators=5, warm_start=True)
clf.fit(X, y)
clf.set_params(n_estimators=4)
assert_raises(ValueError, clf.fit, X, y)
def test_warm_start_equal_n_estimators():
# Test that nothing happens when fitting without increasing n_estimators
X, y = make_hastie_10_2(n_samples=20, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=83)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# modify X to nonsense values, this should not change anything
X_train += 1.
assert_warns_message(UserWarning,
"Warm-start fitting without increasing n_estimators does not",
clf.fit, X_train, y_train)
assert_array_equal(y_pred, clf.predict(X_test))
def test_warm_start_equivalence():
# warm started classifier with 5+5 estimators should be equivalent to
# one classifier with 10 estimators
X, y = make_hastie_10_2(n_samples=20, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
clf_ws = BaggingClassifier(n_estimators=5, warm_start=True,
random_state=3141)
clf_ws.fit(X_train, y_train)
clf_ws.set_params(n_estimators=10)
clf_ws.fit(X_train, y_train)
y1 = clf_ws.predict(X_test)
clf = BaggingClassifier(n_estimators=10, warm_start=False,
random_state=3141)
clf.fit(X_train, y_train)
y2 = clf.predict(X_test)
assert_array_almost_equal(y1, y2)
def test_warm_start_with_oob_score_fails():
# Check using oob_score and warm_start simultaneously fails
X, y = make_hastie_10_2(n_samples=20, random_state=1)
clf = BaggingClassifier(n_estimators=5, warm_start=True, oob_score=True)
assert_raises(ValueError, clf.fit, X, y)
def test_oob_score_removed_on_warm_start():
X, y = make_hastie_10_2(n_samples=2000, random_state=1)
clf = BaggingClassifier(n_estimators=50, oob_score=True)
clf.fit(X, y)
clf.set_params(warm_start=True, oob_score=False, n_estimators=100)
clf.fit(X, y)
assert_raises(AttributeError, getattr, clf, "oob_score_")
def test_oob_score_consistency():
# Make sure OOB scores are identical when random_state, estimator, and
# training data are fixed and fitting is done twice
X, y = make_hastie_10_2(n_samples=200, random_state=1)
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
max_features=0.5, oob_score=True,
random_state=1)
assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_
def test_estimators_samples():
# Check that format of estimators_samples_ is correct and that results
# generated at fit time can be identically reproduced at a later time
# using data saved in object attributes.
X, y = make_hastie_10_2(n_samples=200, random_state=1)
bagging = BaggingClassifier(LogisticRegression(), max_samples=0.5,
max_features=0.5, random_state=1,
bootstrap=False)
bagging.fit(X, y)
# Get relevant attributes
estimators_samples = bagging.estimators_samples_
estimators_features = bagging.estimators_features_
estimators = bagging.estimators_
# Test for correct formatting
assert len(estimators_samples) == len(estimators)
assert len(estimators_samples[0]) == len(X) // 2
assert estimators_samples[0].dtype.kind == 'i'
# Re-fit single estimator to test for consistent sampling
estimator_index = 0
estimator_samples = estimators_samples[estimator_index]
estimator_features = estimators_features[estimator_index]
estimator = estimators[estimator_index]
X_train = (X[estimator_samples])[:, estimator_features]
y_train = y[estimator_samples]
orig_coefs = estimator.coef_
estimator.fit(X_train, y_train)
new_coefs = estimator.coef_
assert_array_almost_equal(orig_coefs, new_coefs)
def test_estimators_samples_deterministic():
# This test is a regression test to check that with a random step
# (e.g. SparseRandomProjection) and a given random state, the results
# generated at fit time can be identically reproduced at a later time using
# data saved in object attributes. Check issue #9524 for full discussion.
iris = load_iris()
X, y = iris.data, iris.target
base_pipeline = make_pipeline(SparseRandomProjection(n_components=2),
LogisticRegression())
clf = BaggingClassifier(base_estimator=base_pipeline,
max_samples=0.5,
random_state=0)
clf.fit(X, y)
pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy()
estimator = clf.estimators_[0]
estimator_sample = clf.estimators_samples_[0]
estimator_feature = clf.estimators_features_[0]
X_train = (X[estimator_sample])[:, estimator_feature]
y_train = y[estimator_sample]
estimator.fit(X_train, y_train)
assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef)
def test_max_samples_consistency():
# Make sure validated max_samples and original max_samples are identical
# when valid integer max_samples supplied by user
max_samples = 100
X, y = make_hastie_10_2(n_samples=2*max_samples, random_state=1)
bagging = BaggingClassifier(KNeighborsClassifier(),
max_samples=max_samples,
max_features=0.5, random_state=1)
bagging.fit(X, y)
assert bagging._max_samples == max_samples
def test_set_oob_score_label_encoding():
# Make sure the oob_score doesn't change when the labels change
# See: https://github.com/scikit-learn/scikit-learn/issues/8933
random_state = 5
X = [[-1], [0], [1]] * 5
Y1 = ['A', 'B', 'C'] * 5
Y2 = [-1, 0, 1] * 5
Y3 = [0, 1, 2] * 5
x1 = BaggingClassifier(oob_score=True,
random_state=random_state).fit(X, Y1).oob_score_
x2 = BaggingClassifier(oob_score=True,
random_state=random_state).fit(X, Y2).oob_score_
x3 = BaggingClassifier(oob_score=True,
random_state=random_state).fit(X, Y3).oob_score_
assert [x1, x2] == [x3, x3]
def replace(X):
X = X.astype('float', copy=True)
X[~np.isfinite(X)] = 0
return X
def test_bagging_regressor_with_missing_inputs():
# Check that BaggingRegressor can accept X with missing/infinite data
X = np.array([
[1, 3, 5],
[2, None, 6],
[2, np.nan, 6],
[2, np.inf, 6],
[2, np.NINF, 6],
])
y_values = [
np.array([2, 3, 3, 3, 3]),
np.array([
[2, 1, 9],
[3, 6, 8],
[3, 6, 8],
[3, 6, 8],
[3, 6, 8],
])
]
for y in y_values:
regressor = DecisionTreeRegressor()
pipeline = make_pipeline(
FunctionTransformer(replace), regressor
)
pipeline.fit(X, y).predict(X)
bagging_regressor = BaggingRegressor(pipeline)
y_hat = bagging_regressor.fit(X, y).predict(X)
assert y.shape == y_hat.shape
# Verify that exceptions can be raised by wrapper regressor
regressor = DecisionTreeRegressor()
pipeline = make_pipeline(regressor)
assert_raises(ValueError, pipeline.fit, X, y)
bagging_regressor = BaggingRegressor(pipeline)
assert_raises(ValueError, bagging_regressor.fit, X, y)
def test_bagging_classifier_with_missing_inputs():
# Check that BaggingClassifier can accept X with missing/infinite data
X = np.array([
[1, 3, 5],
[2, None, 6],
[2, np.nan, 6],
[2, np.inf, 6],
[2, np.NINF, 6],
])
y = np.array([3, 6, 6, 6, 6])
classifier = DecisionTreeClassifier()
pipeline = make_pipeline(
FunctionTransformer(replace), classifier
)
pipeline.fit(X, y).predict(X)
bagging_classifier = BaggingClassifier(pipeline)
bagging_classifier.fit(X, y)
y_hat = bagging_classifier.predict(X)
assert y.shape == y_hat.shape
bagging_classifier.predict_log_proba(X)
bagging_classifier.predict_proba(X)
# Verify that exceptions can be raised by wrapper classifier
classifier = DecisionTreeClassifier()
pipeline = make_pipeline(classifier)
assert_raises(ValueError, pipeline.fit, X, y)
bagging_classifier = BaggingClassifier(pipeline)
assert_raises(ValueError, bagging_classifier.fit, X, y)
def test_bagging_small_max_features():
# Check that Bagging estimator can accept low fractional max_features
X = np.array([[1, 2], [3, 4]])
y = np.array([1, 0])
bagging = BaggingClassifier(LogisticRegression(),
max_features=0.3, random_state=1)
bagging.fit(X, y)
def test_bagging_get_estimators_indices():
# Check that Bagging estimator can generate sample indices properly
# Non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/16436
rng = np.random.RandomState(0)
X = rng.randn(13, 4)
y = np.arange(13)
class MyEstimator(DecisionTreeRegressor):
"""An estimator which stores y indices information at fit."""
def fit(self, X, y):
self._sample_indices = y
clf = BaggingRegressor(base_estimator=MyEstimator(),
n_estimators=1, random_state=0)
clf.fit(X, y)
assert_array_equal(clf.estimators_[0]._sample_indices,
clf.estimators_samples_[0])

View file

@ -0,0 +1,127 @@
"""
Testing for the base module (sklearn.ensemble.base).
"""
# Authors: Gilles Louppe
# License: BSD 3 clause
import numpy as np
from sklearn.utils._testing import assert_raise_message
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble._base import _set_random_states
from sklearn.linear_model import Perceptron
from collections import OrderedDict
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
def test_base():
# Check BaseEnsemble methods.
ensemble = BaggingClassifier(
base_estimator=Perceptron(random_state=None), n_estimators=3)
iris = load_iris()
ensemble.fit(iris.data, iris.target)
ensemble.estimators_ = [] # empty the list and create estimators manually
ensemble._make_estimator()
random_state = np.random.RandomState(3)
ensemble._make_estimator(random_state=random_state)
ensemble._make_estimator(random_state=random_state)
ensemble._make_estimator(append=False)
assert 3 == len(ensemble)
assert 3 == len(ensemble.estimators_)
assert isinstance(ensemble[0], Perceptron)
assert ensemble[0].random_state is None
assert isinstance(ensemble[1].random_state, int)
assert isinstance(ensemble[2].random_state, int)
assert ensemble[1].random_state != ensemble[2].random_state
np_int_ensemble = BaggingClassifier(base_estimator=Perceptron(),
n_estimators=np.int32(3))
np_int_ensemble.fit(iris.data, iris.target)
def test_base_zero_n_estimators():
# Check that instantiating a BaseEnsemble with n_estimators<=0 raises
# a ValueError.
ensemble = BaggingClassifier(base_estimator=Perceptron(),
n_estimators=0)
iris = load_iris()
assert_raise_message(ValueError,
"n_estimators must be greater than zero, got 0.",
ensemble.fit, iris.data, iris.target)
def test_base_not_int_n_estimators():
# Check that instantiating a BaseEnsemble with a string as n_estimators
# raises a ValueError demanding n_estimators to be supplied as an integer.
string_ensemble = BaggingClassifier(base_estimator=Perceptron(),
n_estimators='3')
iris = load_iris()
assert_raise_message(ValueError,
"n_estimators must be an integer",
string_ensemble.fit, iris.data, iris.target)
float_ensemble = BaggingClassifier(base_estimator=Perceptron(),
n_estimators=3.0)
assert_raise_message(ValueError,
"n_estimators must be an integer",
float_ensemble.fit, iris.data, iris.target)
def test_set_random_states():
# Linear Discriminant Analysis doesn't have random state: smoke test
_set_random_states(LinearDiscriminantAnalysis(), random_state=17)
clf1 = Perceptron(random_state=None)
assert clf1.random_state is None
# check that random_state=None still sets an integer random_state
_set_random_states(clf1, None)
assert isinstance(clf1.random_state, int)
# check that fixing random_state results in consistent initialisation
_set_random_states(clf1, 3)
assert isinstance(clf1.random_state, int)
clf2 = Perceptron(random_state=None)
_set_random_states(clf2, 3)
assert clf1.random_state == clf2.random_state
# nested random_state
def make_steps():
return [('sel', SelectFromModel(Perceptron(random_state=None))),
('clf', Perceptron(random_state=None))]
est1 = Pipeline(make_steps())
_set_random_states(est1, 3)
assert isinstance(est1.steps[0][1].estimator.random_state, int)
assert isinstance(est1.steps[1][1].random_state, int)
assert (est1.get_params()['sel__estimator__random_state'] !=
est1.get_params()['clf__random_state'])
# ensure multiple random_state parameters are invariant to get_params()
# iteration order
class AlphaParamPipeline(Pipeline):
def get_params(self, *args, **kwargs):
params = Pipeline.get_params(self, *args, **kwargs).items()
return OrderedDict(sorted(params))
class RevParamPipeline(Pipeline):
def get_params(self, *args, **kwargs):
params = Pipeline.get_params(self, *args, **kwargs).items()
return OrderedDict(sorted(params, reverse=True))
for cls in [AlphaParamPipeline, RevParamPipeline]:
est2 = cls(make_steps())
_set_random_states(est2, 3)
assert (est1.get_params()['sel__estimator__random_state'] ==
est2.get_params()['sel__estimator__random_state'])
assert (est1.get_params()['clf__random_state'] ==
est2.get_params()['clf__random_state'])

View file

@ -0,0 +1,172 @@
import pytest
from sklearn.base import clone
from sklearn.base import ClassifierMixin
from sklearn.base import is_classifier
from sklearn.datasets import make_classification
from sklearn.datasets import make_regression
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor
@pytest.mark.parametrize(
"X, y, estimator",
[(*make_classification(n_samples=10),
StackingClassifier(estimators=[('lr', LogisticRegression()),
('svm', LinearSVC()),
('rf', RandomForestClassifier())])),
(*make_classification(n_samples=10),
VotingClassifier(estimators=[('lr', LogisticRegression()),
('svm', LinearSVC()),
('rf', RandomForestClassifier())])),
(*make_regression(n_samples=10),
StackingRegressor(estimators=[('lr', LinearRegression()),
('svm', LinearSVR()),
('rf', RandomForestRegressor())])),
(*make_regression(n_samples=10),
VotingRegressor(estimators=[('lr', LinearRegression()),
('svm', LinearSVR()),
('rf', RandomForestRegressor())]))],
ids=['stacking-classifier', 'voting-classifier',
'stacking-regressor', 'voting-regressor']
)
def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator):
# check that the behavior of `estimators`, `estimators_`,
# `named_estimators`, `named_estimators_` is consistent across all
# ensemble classes and when using `set_params()`.
# before fit
assert 'svm' in estimator.named_estimators
assert estimator.named_estimators.svm is estimator.estimators[1][1]
assert estimator.named_estimators.svm is estimator.named_estimators['svm']
# check fitted attributes
estimator.fit(X, y)
assert len(estimator.named_estimators) == 3
assert len(estimator.named_estimators_) == 3
assert (sorted(list(estimator.named_estimators_.keys())) ==
sorted(['lr', 'svm', 'rf']))
# check that set_params() does not add a new attribute
estimator_new_params = clone(estimator)
svm_estimator = SVC() if is_classifier(estimator) else SVR()
estimator_new_params.set_params(svm=svm_estimator).fit(X, y)
assert not hasattr(estimator_new_params, 'svm')
assert (estimator_new_params.named_estimators.lr.get_params() ==
estimator.named_estimators.lr.get_params())
assert (estimator_new_params.named_estimators.rf.get_params() ==
estimator.named_estimators.rf.get_params())
# check the behavior when setting and dropping an estimator
estimator_dropped = clone(estimator)
estimator_dropped.set_params(svm='drop')
estimator_dropped.fit(X, y)
assert len(estimator_dropped.named_estimators) == 3
assert estimator_dropped.named_estimators.svm == 'drop'
assert len(estimator_dropped.named_estimators_) == 3
assert (sorted(list(estimator_dropped.named_estimators_.keys())) ==
sorted(['lr', 'svm', 'rf']))
for sub_est in estimator_dropped.named_estimators_:
# check that the correspondence is correct
assert not isinstance(sub_est, type(estimator.named_estimators.svm))
# check that we can set the parameters of the underlying classifier
estimator.set_params(svm__C=10.0)
estimator.set_params(rf__max_depth=5)
assert (estimator.get_params()['svm__C'] ==
estimator.get_params()['svm'].get_params()['C'])
assert (estimator.get_params()['rf__max_depth'] ==
estimator.get_params()['rf'].get_params()['max_depth'])
@pytest.mark.parametrize(
"Ensemble",
[StackingClassifier, VotingClassifier, StackingRegressor, VotingRegressor]
)
def test_ensemble_heterogeneous_estimators_type(Ensemble):
# check that ensemble will fail during validation if the underlying
# estimators are not of the same type (i.e. classifier or regressor)
if issubclass(Ensemble, ClassifierMixin):
X, y = make_classification(n_samples=10)
estimators = [('lr', LinearRegression())]
ensemble_type = 'classifier'
else:
X, y = make_regression(n_samples=10)
estimators = [('lr', LogisticRegression())]
ensemble_type = 'regressor'
ensemble = Ensemble(estimators=estimators)
err_msg = "should be a {}".format(ensemble_type)
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
@pytest.mark.parametrize(
"X, y, Ensemble",
[(*make_classification(n_samples=10), StackingClassifier),
(*make_classification(n_samples=10), VotingClassifier),
(*make_regression(n_samples=10), StackingRegressor),
(*make_regression(n_samples=10), VotingRegressor)]
)
def test_ensemble_heterogeneous_estimators_name_validation(X, y, Ensemble):
# raise an error when the name contains dunder
if issubclass(Ensemble, ClassifierMixin):
estimators = [('lr__', LogisticRegression())]
else:
estimators = [('lr__', LinearRegression())]
ensemble = Ensemble(estimators=estimators)
err_msg = r"Estimator names must not contain __: got \['lr__'\]"
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
# raise an error when the name is not unique
if issubclass(Ensemble, ClassifierMixin):
estimators = [('lr', LogisticRegression()),
('lr', LogisticRegression())]
else:
estimators = [('lr', LinearRegression()),
('lr', LinearRegression())]
ensemble = Ensemble(estimators=estimators)
err_msg = r"Names provided are not unique: \['lr', 'lr'\]"
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
# raise an error when the name conflicts with the parameters
if issubclass(Ensemble, ClassifierMixin):
estimators = [('estimators', LogisticRegression())]
else:
estimators = [('estimators', LinearRegression())]
ensemble = Ensemble(estimators=estimators)
err_msg = "Estimator names conflict with constructor arguments"
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
@pytest.mark.parametrize(
"X, y, estimator",
[(*make_classification(n_samples=10),
StackingClassifier(estimators=[('lr', LogisticRegression())])),
(*make_classification(n_samples=10),
VotingClassifier(estimators=[('lr', LogisticRegression())])),
(*make_regression(n_samples=10),
StackingRegressor(estimators=[('lr', LinearRegression())])),
(*make_regression(n_samples=10),
VotingRegressor(estimators=[('lr', LinearRegression())]))],
ids=['stacking-classifier', 'voting-classifier',
'stacking-regressor', 'voting-regressor']
)
def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator):
# check that we raise a consistent error when all estimators are
# dropped
estimator.set_params(lr='drop')
with pytest.raises(ValueError, match="All estimators are dropped."):
estimator.fit(X, y)

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,343 @@
"""
Testing for the gradient boosting loss functions and initial estimators.
"""
import numpy as np
from numpy.testing import assert_almost_equal
from numpy.testing import assert_allclose
import pytest
from sklearn.utils import check_random_state
from sklearn.utils.stats import _weighted_percentile
from sklearn.ensemble._gb_losses import RegressionLossFunction
from sklearn.ensemble._gb_losses import LeastSquaresError
from sklearn.ensemble._gb_losses import LeastAbsoluteError
from sklearn.ensemble._gb_losses import HuberLossFunction
from sklearn.ensemble._gb_losses import QuantileLossFunction
from sklearn.ensemble._gb_losses import BinomialDeviance
from sklearn.ensemble._gb_losses import MultinomialDeviance
from sklearn.ensemble._gb_losses import ExponentialLoss
from sklearn.ensemble._gb_losses import LOSS_FUNCTIONS
def test_binomial_deviance():
# Check binomial deviance loss.
# Check against alternative definitions in ESLII.
bd = BinomialDeviance(2)
# pred has the same BD for y in {0, 1}
assert (bd(np.array([0.0]), np.array([0.0])) ==
bd(np.array([1.0]), np.array([0.0])))
assert_almost_equal(bd(np.array([1.0, 1.0, 1.0]),
np.array([100.0, 100.0, 100.0])),
0.0)
assert_almost_equal(bd(np.array([1.0, 0.0, 0.0]),
np.array([100.0, -100.0, -100.0])), 0)
# check if same results as alternative definition of deviance (from ESLII)
def alt_dev(y, pred):
return np.mean(np.logaddexp(0.0, -2.0 * (2.0 * y - 1) * pred))
test_data = [(np.array([1.0, 1.0, 1.0]), np.array([100.0, 100.0, 100.0])),
(np.array([0.0, 0.0, 0.0]), np.array([100.0, 100.0, 100.0])),
(np.array([0.0, 0.0, 0.0]),
np.array([-100.0, -100.0, -100.0])),
(np.array([1.0, 1.0, 1.0]),
np.array([-100.0, -100.0, -100.0]))]
for datum in test_data:
assert_almost_equal(bd(*datum), alt_dev(*datum))
    # check the negative gradient against the alternative formula below
def alt_ng(y, pred):
return (2 * y - 1) / (1 + np.exp(2 * (2 * y - 1) * pred))
for datum in test_data:
assert_almost_equal(bd.negative_gradient(*datum), alt_ng(*datum))
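# Note on the comparison above: sklearn's per-sample binomial deviance,
# 2 * (log(1 + exp(pred)) - y * pred), and the ESLII form
# log(1 + exp(-2 * (2*y - 1) * pred)) are not identical for moderate raw
# predictions, but both tend to 0 for a confidently correct prediction and to
# 2 * |pred| for a confidently wrong one, which is why only pred = +/-100 is
# compared. Quick numeric illustration of that reading (a sketch, not library code):
f = 100.0
# y = 1, correct and confident: both ~0
assert_almost_equal(2 * (np.logaddexp(0, f) - 1 * f),
                    np.logaddexp(0, -2 * (2 * 1 - 1) * f))
# y = 0, wrong and confident: both ~2 * |pred| = 200
assert_almost_equal(2 * (np.logaddexp(0, f) - 0 * f),
                    np.logaddexp(0, -2 * (2 * 0 - 1) * f))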
def test_sample_weight_smoke():
rng = check_random_state(13)
y = rng.rand(100)
pred = rng.rand(100)
# least squares
loss = LeastSquaresError(1)
loss_wo_sw = loss(y, pred)
loss_w_sw = loss(y, pred, np.ones(pred.shape[0], dtype=np.float32))
assert_almost_equal(loss_wo_sw, loss_w_sw)
def test_sample_weight_init_estimators():
# Smoke test for init estimators with sample weights.
rng = check_random_state(13)
X = rng.rand(100, 2)
sample_weight = np.ones(100)
reg_y = rng.rand(100)
clf_y = rng.randint(0, 2, size=100)
for Loss in LOSS_FUNCTIONS.values():
if Loss is None:
continue
if issubclass(Loss, RegressionLossFunction):
k = 1
y = reg_y
else:
k = 2
y = clf_y
if Loss.is_multi_class:
# skip multiclass
continue
loss = Loss(k)
init_est = loss.init_estimator()
init_est.fit(X, y)
out = loss.get_init_raw_predictions(X, init_est)
assert out.shape == (y.shape[0], 1)
sw_init_est = loss.init_estimator()
sw_init_est.fit(X, y, sample_weight=sample_weight)
sw_out = loss.get_init_raw_predictions(X, sw_init_est)
assert sw_out.shape == (y.shape[0], 1)
# check if predictions match
assert_allclose(out, sw_out, rtol=1e-2)
def test_weighted_percentile():
y = np.empty(102, dtype=np.float64)
y[:50] = 0
y[-51:] = 2
y[-1] = 100000
y[50] = 1
sw = np.ones(102, dtype=np.float64)
sw[-1] = 0.0
score = _weighted_percentile(y, sw, 50)
assert score == 1
def test_weighted_percentile_equal():
y = np.empty(102, dtype=np.float64)
y.fill(0.0)
sw = np.ones(102, dtype=np.float64)
sw[-1] = 0.0
score = _weighted_percentile(y, sw, 50)
assert score == 0
def test_weighted_percentile_zero_weight():
y = np.empty(102, dtype=np.float64)
y.fill(1.0)
sw = np.ones(102, dtype=np.float64)
sw.fill(0.0)
score = _weighted_percentile(y, sw, 50)
assert score == 1.0
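# For reference, a naive way to obtain the weighted percentiles checked above:
# sort the values, accumulate the sorted weights, and return the first value
# whose cumulative weight reaches the requested fraction of the total weight.
# This is only an illustrative sketch, not the _weighted_percentile helper,
# but it reproduces the three scores asserted above.
def naive_weighted_percentile(values, weights, percentile=50):
    values = np.asarray(values, dtype=np.float64)
    weights = np.asarray(weights, dtype=np.float64)
    order = np.argsort(values)
    cum_weights = np.cumsum(weights[order])
    threshold = (percentile / 100.0) * cum_weights[-1]
    return values[order][np.searchsorted(cum_weights, threshold)]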
def test_quantile_loss_function():
    # Non-regression test for the QuantileLossFunction object
# There was a sign problem when evaluating the function
# for negative values of 'ytrue - ypred'
x = np.asarray([-1.0, 0.0, 1.0])
y_found = QuantileLossFunction(1, 0.9)(x, np.zeros_like(x))
y_expected = np.asarray([0.1, 0.0, 0.9]).mean()
np.testing.assert_allclose(y_found, y_expected)
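# The expected value above is the mean pinball (quantile) loss at alpha=0.9:
# each residual r = y_true - y_pred contributes alpha * r when r > 0 and
# (1 - alpha) * (-r) otherwise, giving 0.1, 0.0 and 0.9 for r = -1, 0, 1.
# Standalone sketch of that formula (illustrative only, not the loss object):
def naive_pinball_loss(y_true, y_pred, alpha):
    r = y_true - y_pred
    return np.mean(np.where(r > 0, alpha * r, (alpha - 1) * r))

assert_allclose(naive_pinball_loss(np.asarray([-1.0, 0.0, 1.0]),
                                   np.zeros(3), alpha=0.9),
                np.asarray([0.1, 0.0, 0.9]).mean())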
def test_sample_weight_deviance():
# Test if deviance supports sample weights.
rng = check_random_state(13)
sample_weight = np.ones(100)
reg_y = rng.rand(100)
clf_y = rng.randint(0, 2, size=100)
mclf_y = rng.randint(0, 3, size=100)
for Loss in LOSS_FUNCTIONS.values():
if Loss is None:
continue
if issubclass(Loss, RegressionLossFunction):
k = 1
y = reg_y
p = reg_y
else:
k = 2
y = clf_y
p = clf_y
if Loss.is_multi_class:
k = 3
y = mclf_y
# one-hot encoding
p = np.zeros((y.shape[0], k), dtype=np.float64)
for i in range(k):
p[:, i] = y == i
loss = Loss(k)
deviance_w_w = loss(y, p, sample_weight)
deviance_wo_w = loss(y, p)
assert deviance_wo_w == deviance_w_w
@pytest.mark.parametrize(
'n_classes, n_samples', [(3, 100), (5, 57), (7, 13)]
)
def test_multinomial_deviance(n_classes, n_samples):
# Check multinomial deviance with and without sample weights.
rng = np.random.RandomState(13)
sample_weight = np.ones(n_samples)
y_true = rng.randint(0, n_classes, size=n_samples)
y_pred = np.zeros((n_samples, n_classes), dtype=np.float64)
for klass in range(y_pred.shape[1]):
y_pred[:, klass] = y_true == klass
loss = MultinomialDeviance(n_classes)
loss_wo_sw = loss(y_true, y_pred)
assert loss_wo_sw > 0
loss_w_sw = loss(y_true, y_pred, sample_weight=sample_weight)
assert loss_wo_sw == pytest.approx(loss_w_sw)
# Multinomial deviance uses weighted average loss rather than
# weighted sum loss, so we make sure that the value remains the same
    # when we divide the weights by 2.
loss_w_sw = loss(y_true, y_pred, sample_weight=0.5 * sample_weight)
assert loss_wo_sw == pytest.approx(loss_w_sw)
def test_mdl_computation_weighted():
raw_predictions = np.array([[1., -1., -.1], [-2., 1., 2.]])
y_true = np.array([0, 1])
weights = np.array([1, 3])
expected_loss = 1.0909323
# MultinomialDeviance loss computation with weights.
loss = MultinomialDeviance(3)
assert (loss(y_true, raw_predictions, weights)
== pytest.approx(expected_loss))
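# Where the 1.0909323 constant above comes from, assuming MultinomialDeviance
# is the weighted average of -log softmax(raw_predictions)[true class]
# (a by-hand sketch, not the library implementation):
raw = np.array([[1., -1., -.1], [-2., 1., 2.]])
proba = np.exp(raw) / np.exp(raw).sum(axis=1, keepdims=True)  # row-wise softmax
nll = -np.log(proba[np.arange(2), np.array([0, 1])])          # per-sample -log p_true
assert np.average(nll, weights=np.array([1, 3])) == pytest.approx(1.0909323)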
@pytest.mark.parametrize('n', [0, 1, 2])
def test_mdl_exception(n):
# Check that MultinomialDeviance throws an exception when n_classes <= 2
err_msg = 'MultinomialDeviance requires more than 2 classes.'
with pytest.raises(ValueError, match=err_msg):
MultinomialDeviance(n)
def test_init_raw_predictions_shapes():
# Make sure get_init_raw_predictions returns float64 arrays with shape
# (n_samples, K) where K is 1 for binary classification and regression, and
# K = n_classes for multiclass classification
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 5))
y = rng.normal(size=n_samples)
for loss in (LeastSquaresError(n_classes=1),
LeastAbsoluteError(n_classes=1),
QuantileLossFunction(n_classes=1),
HuberLossFunction(n_classes=1)):
init_estimator = loss.init_estimator().fit(X, y)
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
assert raw_predictions.shape == (n_samples, 1)
assert raw_predictions.dtype == np.float64
y = rng.randint(0, 2, size=n_samples)
for loss in (BinomialDeviance(n_classes=2),
ExponentialLoss(n_classes=2)):
init_estimator = loss.init_estimator().fit(X, y)
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
assert raw_predictions.shape == (n_samples, 1)
assert raw_predictions.dtype == np.float64
for n_classes in range(3, 5):
y = rng.randint(0, n_classes, size=n_samples)
loss = MultinomialDeviance(n_classes=n_classes)
init_estimator = loss.init_estimator().fit(X, y)
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
assert raw_predictions.shape == (n_samples, n_classes)
assert raw_predictions.dtype == np.float64
def test_init_raw_predictions_values():
# Make sure the get_init_raw_predictions() returns the expected values for
# each loss.
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 5))
y = rng.normal(size=n_samples)
# Least squares loss
loss = LeastSquaresError(n_classes=1)
init_estimator = loss.init_estimator().fit(X, y)
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
# Make sure baseline prediction is the mean of all targets
assert_almost_equal(raw_predictions, y.mean())
# Least absolute and huber loss
for Loss in (LeastAbsoluteError, HuberLossFunction):
loss = Loss(n_classes=1)
init_estimator = loss.init_estimator().fit(X, y)
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
# Make sure baseline prediction is the median of all targets
assert_almost_equal(raw_predictions, np.median(y))
# Quantile loss
for alpha in (.1, .5, .9):
loss = QuantileLossFunction(n_classes=1, alpha=alpha)
init_estimator = loss.init_estimator().fit(X, y)
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
# Make sure baseline prediction is the alpha-quantile of all targets
assert_almost_equal(raw_predictions, np.percentile(y, alpha * 100))
y = rng.randint(0, 2, size=n_samples)
# Binomial deviance
loss = BinomialDeviance(n_classes=2)
init_estimator = loss.init_estimator().fit(X, y)
# Make sure baseline prediction is equal to link_function(p), where p
# is the proba of the positive class. We want predict_proba() to return p,
# and by definition
# p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction)
# So we want raw_prediction = link_function(p) = log(p / (1 - p))
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
p = y.mean()
assert_almost_equal(raw_predictions, np.log(p / (1 - p)))
# Exponential loss
loss = ExponentialLoss(n_classes=2)
init_estimator = loss.init_estimator().fit(X, y)
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
p = y.mean()
assert_almost_equal(raw_predictions, .5 * np.log(p / (1 - p)))
# Multinomial deviance loss
for n_classes in range(3, 5):
y = rng.randint(0, n_classes, size=n_samples)
loss = MultinomialDeviance(n_classes=n_classes)
init_estimator = loss.init_estimator().fit(X, y)
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
for k in range(n_classes):
p = (y == k).mean()
assert_almost_equal(raw_predictions[:, k], np.log(p))
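# All of the checks above invert the loss's link function at the empirical
# class frequencies: for binomial deviance raw = log(p / (1 - p)) so that
# sigmoid(raw) recovers p, the exponential loss uses half that log-odds, and
# multinomial deviance uses raw_k = log(p_k). Tiny check of the binomial
# identity (illustrative only):
from scipy.special import expit  # the sigmoid / inverse link

p = 0.37
assert_almost_equal(expit(np.log(p / (1 - p))), p)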
@pytest.mark.parametrize('seed', range(5))
def test_lad_equals_quantile_50(seed):
# Make sure quantile loss with alpha = .5 is equivalent to LAD
lad = LeastAbsoluteError(n_classes=1)
ql = QuantileLossFunction(n_classes=1, alpha=0.5)
n_samples = 50
rng = np.random.RandomState(seed)
raw_predictions = rng.normal(size=(n_samples))
y_true = rng.normal(size=(n_samples))
lad_loss = lad(y_true, raw_predictions)
ql_loss = ql(y_true, raw_predictions)
assert_almost_equal(lad_loss, 2 * ql_loss)
weights = np.linspace(0, 1, n_samples) ** 2
lad_weighted_loss = lad(y_true, raw_predictions, sample_weight=weights)
ql_weighted_loss = ql(y_true, raw_predictions, sample_weight=weights)
assert_almost_equal(lad_weighted_loss, 2 * ql_weighted_loss)
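# Why the factor of 2 above: with alpha = 0.5 the pinball loss is
# 0.5 * |y_true - pred| for every sample, i.e. exactly half the absolute
# error used by LAD. Quick standalone check of that identity (illustrative only):
residuals = np.array([-3.0, -0.5, 0.0, 2.0])
pinball_05 = np.where(residuals > 0, 0.5 * residuals, -0.5 * residuals)
assert_allclose(np.abs(residuals), 2 * pinball_05)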

View file

@ -0,0 +1,358 @@
"""
Testing for Isolation Forest algorithm (sklearn.ensemble.iforest).
"""
# Authors: Nicolas Goix <nicolas.goix@telecom-paristech.fr>
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
# License: BSD 3 clause
import pytest
import numpy as np
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_raises
from sklearn.utils._testing import assert_warns_message
from sklearn.utils._testing import ignore_warnings
from sklearn.utils._testing import assert_allclose
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import IsolationForest
from sklearn.ensemble._iforest import _average_path_length
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes, load_iris
from sklearn.utils import check_random_state
from sklearn.metrics import roc_auc_score
from scipy.sparse import csc_matrix, csr_matrix
from unittest.mock import Mock, patch
rng = check_random_state(0)
# load the iris dataset
# and randomly permute it
iris = load_iris()
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]
# also load the diabetes dataset
# and randomly permute it
diabetes = load_diabetes()
perm = rng.permutation(diabetes.target.size)
diabetes.data = diabetes.data[perm]
diabetes.target = diabetes.target[perm]
def test_iforest():
"""Check Isolation Forest for various parameter settings."""
X_train = np.array([[0, 1], [1, 2]])
X_test = np.array([[2, 1], [1, 1]])
grid = ParameterGrid({"n_estimators": [3],
"max_samples": [0.5, 1.0, 3],
"bootstrap": [True, False]})
with ignore_warnings():
for params in grid:
IsolationForest(random_state=rng,
**params).fit(X_train).predict(X_test)
def test_iforest_sparse():
"""Check IForest for various parameter settings on sparse input."""
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50],
diabetes.target[:50],
random_state=rng)
grid = ParameterGrid({"max_samples": [0.5, 1.0],
"bootstrap": [True, False]})
for sparse_format in [csc_matrix, csr_matrix]:
X_train_sparse = sparse_format(X_train)
X_test_sparse = sparse_format(X_test)
for params in grid:
# Trained on sparse format
sparse_classifier = IsolationForest(
n_estimators=10, random_state=1, **params).fit(X_train_sparse)
sparse_results = sparse_classifier.predict(X_test_sparse)
# Trained on dense format
dense_classifier = IsolationForest(
n_estimators=10, random_state=1, **params).fit(X_train)
dense_results = dense_classifier.predict(X_test)
assert_array_equal(sparse_results, dense_results)
def test_iforest_error():
"""Test that it gives proper exception on deficient input."""
X = iris.data
# Test max_samples
assert_raises(ValueError,
IsolationForest(max_samples=-1).fit, X)
assert_raises(ValueError,
IsolationForest(max_samples=0.0).fit, X)
assert_raises(ValueError,
IsolationForest(max_samples=2.0).fit, X)
# The dataset has less than 256 samples, explicitly setting
# max_samples > n_samples should result in a warning. If not set
# explicitly there should be no warning
assert_warns_message(UserWarning,
"max_samples will be set to n_samples for estimation",
IsolationForest(max_samples=1000).fit, X)
# note that assert_no_warnings does not apply since it enables a
# PendingDeprecationWarning triggered by scipy.sparse's use of
# np.matrix. See issue #11251.
with pytest.warns(None) as record:
IsolationForest(max_samples='auto').fit(X)
user_warnings = [each for each in record
if issubclass(each.category, UserWarning)]
assert len(user_warnings) == 0
with pytest.warns(None) as record:
IsolationForest(max_samples=np.int64(2)).fit(X)
user_warnings = [each for each in record
if issubclass(each.category, UserWarning)]
assert len(user_warnings) == 0
assert_raises(ValueError, IsolationForest(max_samples='foobar').fit, X)
assert_raises(ValueError, IsolationForest(max_samples=1.5).fit, X)
# test X_test n_features match X_train one:
assert_raises(ValueError, IsolationForest().fit(X).predict, X[:, 1:])
# test that behaviour='old' will raise an error
msg = "The old behaviour of IsolationForest is not implemented anymore."
with pytest.raises(NotImplementedError, match=msg):
IsolationForest(behaviour='old').fit(X)
def test_recalculate_max_depth():
"""Check max_depth recalculation when max_samples is reset to n_samples"""
X = iris.data
clf = IsolationForest().fit(X)
for est in clf.estimators_:
assert est.max_depth == int(np.ceil(np.log2(X.shape[0])))
def test_max_samples_attribute():
X = iris.data
clf = IsolationForest().fit(X)
assert clf.max_samples_ == X.shape[0]
clf = IsolationForest(max_samples=500)
assert_warns_message(UserWarning,
"max_samples will be set to n_samples for estimation",
clf.fit, X)
assert clf.max_samples_ == X.shape[0]
clf = IsolationForest(max_samples=0.4).fit(X)
assert clf.max_samples_ == 0.4*X.shape[0]
def test_iforest_parallel_regression():
"""Check parallel regression."""
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
diabetes.target,
random_state=rng)
ensemble = IsolationForest(n_jobs=3,
random_state=0).fit(X_train)
ensemble.set_params(n_jobs=1)
y1 = ensemble.predict(X_test)
ensemble.set_params(n_jobs=2)
y2 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y2)
ensemble = IsolationForest(n_jobs=1,
random_state=0).fit(X_train)
y3 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y3)
def test_iforest_performance():
"""Test Isolation Forest performs well"""
# Generate train/test data
rng = check_random_state(2)
X = 0.3 * rng.randn(120, 2)
X_train = np.r_[X + 2, X - 2]
X_train = X[:100]
# Generate some abnormal novel observations
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
X_test = np.r_[X[100:], X_outliers]
y_test = np.array([0] * 20 + [1] * 20)
# fit the model
clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)
# predict scores (the lower, the more normal)
y_pred = - clf.decision_function(X_test)
    # check that outliers are ranked above inliers almost perfectly (ROC AUC > 0.98)
assert roc_auc_score(y_test, y_pred) > 0.98
@pytest.mark.parametrize("contamination", [0.25, "auto"])
def test_iforest_works(contamination):
# toy sample (the last two samples are outliers)
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]
# Test IsolationForest
clf = IsolationForest(random_state=rng, contamination=contamination)
clf.fit(X)
decision_func = -clf.decision_function(X)
pred = clf.predict(X)
# assert detect outliers:
assert np.min(decision_func[-2:]) > np.max(decision_func[:-2])
assert_array_equal(pred, 6 * [1] + 2 * [-1])
def test_max_samples_consistency():
# Make sure validated max_samples in iforest and BaseBagging are identical
X = iris.data
clf = IsolationForest().fit(X)
assert clf.max_samples_ == clf._max_samples
def test_iforest_subsampled_features():
    # Non-regression test for #5732, which failed at predict time.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50],
diabetes.target[:50],
random_state=rng)
clf = IsolationForest(max_features=0.8)
clf.fit(X_train, y_train)
clf.predict(X_test)
def test_iforest_average_path_length():
    # Non-regression test for #8549, which used the wrong average path length
    # formula, strictly for the integer case
# Updated to check average path length when input is <= 2 (issue #11839)
result_one = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0
result_two = 2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0
assert_allclose(_average_path_length([0]), [0.0])
assert_allclose(_average_path_length([1]), [0.0])
assert_allclose(_average_path_length([2]), [1.0])
assert_allclose(_average_path_length([5]), [result_one])
assert_allclose(_average_path_length([999]), [result_two])
assert_allclose(
_average_path_length(np.array([1, 2, 5, 999])),
[0.0, 1.0, result_one, result_two],
)
# _average_path_length is increasing
avg_path_length = _average_path_length(np.arange(5))
assert_array_equal(avg_path_length, np.sort(avg_path_length))
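# The constants above follow the expected path length of an unsuccessful
# binary-search-tree lookup used by Isolation Forest:
# c(n) = 2 * H(n - 1) - 2 * (n - 1) / n for n > 2, with the harmonic number
# H(i) approximated by log(i) + Euler's gamma. Minimal sketch of that formula
# (illustrative, not the private helper itself):
def c_path_length(n):
    return 2.0 * (np.log(n - 1.0) + np.euler_gamma) - 2.0 * (n - 1.0) / n

assert_allclose(_average_path_length(np.array([5, 999])),
                [c_path_length(5), c_path_length(999)])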
def test_score_samples():
X_train = [[1, 1], [1, 2], [2, 1]]
clf1 = IsolationForest(contamination=0.1).fit(X_train)
clf2 = IsolationForest().fit(X_train)
assert_array_equal(clf1.score_samples([[2., 2.]]),
clf1.decision_function([[2., 2.]]) + clf1.offset_)
assert_array_equal(clf2.score_samples([[2., 2.]]),
clf2.decision_function([[2., 2.]]) + clf2.offset_)
assert_array_equal(clf1.score_samples([[2., 2.]]),
clf2.score_samples([[2., 2.]]))
def test_iforest_warm_start():
"""Test iterative addition of iTrees to an iForest """
rng = check_random_state(0)
X = rng.randn(20, 2)
# fit first 10 trees
clf = IsolationForest(n_estimators=10, max_samples=20,
random_state=rng, warm_start=True)
clf.fit(X)
# remember the 1st tree
tree_1 = clf.estimators_[0]
# fit another 10 trees
clf.set_params(n_estimators=20)
clf.fit(X)
# expecting 20 fitted trees and no overwritten trees
assert len(clf.estimators_) == 20
assert clf.estimators_[0] is tree_1
# mock get_chunk_n_rows to actually test more than one chunk (here one
# chunk = 3 rows):
@patch(
"sklearn.ensemble._iforest.get_chunk_n_rows",
side_effect=Mock(**{"return_value": 3}),
)
@pytest.mark.parametrize(
"contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]
)
def test_iforest_chunks_works1(
mocked_get_chunk, contamination, n_predict_calls
):
test_iforest_works(contamination)
assert mocked_get_chunk.call_count == n_predict_calls
# idem with chunk_size = 10 rows
@patch(
"sklearn.ensemble._iforest.get_chunk_n_rows",
side_effect=Mock(**{"return_value": 10}),
)
@pytest.mark.parametrize(
"contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]
)
def test_iforest_chunks_works2(
mocked_get_chunk, contamination, n_predict_calls
):
test_iforest_works(contamination)
assert mocked_get_chunk.call_count == n_predict_calls
def test_iforest_deprecation():
iforest = IsolationForest(behaviour='new')
warn_msg = "'behaviour' is deprecated in 0.22 and will be removed in 0.24"
with pytest.warns(FutureWarning, match=warn_msg):
iforest.fit(iris.data)
def test_iforest_with_uniform_data():
"""Test whether iforest predicts inliers when using uniform data"""
# 2-d array of all 1s
X = np.ones((100, 10))
iforest = IsolationForest()
iforest.fit(X)
rng = np.random.RandomState(0)
assert all(iforest.predict(X) == 1)
assert all(iforest.predict(rng.randn(100, 10)) == 1)
assert all(iforest.predict(X + 1) == 1)
assert all(iforest.predict(X - 1) == 1)
# 2-d array where columns contain the same value across rows
X = np.repeat(rng.randn(1, 10), 100, 0)
iforest = IsolationForest()
iforest.fit(X)
assert all(iforest.predict(X) == 1)
assert all(iforest.predict(rng.randn(100, 10)) == 1)
assert all(iforest.predict(np.ones((100, 10))) == 1)
# Single row
X = rng.randn(1, 10)
iforest = IsolationForest()
iforest.fit(X)
assert all(iforest.predict(X) == 1)
assert all(iforest.predict(rng.randn(100, 10)) == 1)
assert all(iforest.predict(np.ones((100, 10))) == 1)

View file

@ -0,0 +1,524 @@
"""Test the stacking classifier and regressor."""
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: BSD 3 clause
import pytest
import numpy as np
import scipy.sparse as sparse
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import RegressorMixin
from sklearn.base import clone
from sklearn.exceptions import ConvergenceWarning
from sklearn.datasets import load_iris
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import make_regression
from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
from sklearn.svm import LinearSVR
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import scale
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.utils._mocking import CheckingClassifier
from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn.utils._testing import ignore_warnings
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
X_diabetes, y_diabetes = load_diabetes(return_X_y=True)
X_iris, y_iris = load_iris(return_X_y=True)
@pytest.mark.parametrize(
"cv", [3, StratifiedKFold(n_splits=3, shuffle=True, random_state=42)]
)
@pytest.mark.parametrize(
"final_estimator", [None, RandomForestClassifier(random_state=42)]
)
@pytest.mark.parametrize("passthrough", [False, True])
def test_stacking_classifier_iris(cv, final_estimator, passthrough):
# prescale the data to avoid convergence warning without using a pipeline
# for later assert
X_train, X_test, y_train, y_test = train_test_split(
scale(X_iris), y_iris, stratify=y_iris, random_state=42
)
estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
clf = StackingClassifier(
estimators=estimators, final_estimator=final_estimator, cv=cv,
passthrough=passthrough
)
clf.fit(X_train, y_train)
clf.predict(X_test)
clf.predict_proba(X_test)
assert clf.score(X_test, y_test) > 0.8
X_trans = clf.transform(X_test)
expected_column_count = 10 if passthrough else 6
assert X_trans.shape[1] == expected_column_count
if passthrough:
assert_allclose(X_test, X_trans[:, -4:])
clf.set_params(lr='drop')
clf.fit(X_train, y_train)
clf.predict(X_test)
clf.predict_proba(X_test)
if final_estimator is None:
# LogisticRegression has decision_function method
clf.decision_function(X_test)
X_trans = clf.transform(X_test)
expected_column_count_drop = 7 if passthrough else 3
assert X_trans.shape[1] == expected_column_count_drop
if passthrough:
assert_allclose(X_test, X_trans[:, -4:])
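# Where the widths checked above come from (illustrative arithmetic): with the
# 3 iris classes, each remaining base estimator contributes one column per
# class to `transform` (predict_proba for the logistic regression,
# decision_function for the LinearSVC), and passthrough appends the 4
# original features.
n_classes, n_features = 3, 4
assert 2 * n_classes == 6 and 2 * n_classes + n_features == 10  # both estimators kept
assert 1 * n_classes == 3 and 1 * n_classes + n_features == 7   # after 'lr' is dropped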
def test_stacking_classifier_drop_column_binary_classification():
# check that a column is dropped in binary classification
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, _ = train_test_split(
scale(X), y, stratify=y, random_state=42
)
# both classifiers implement 'predict_proba' and will both drop one column
estimators = [('lr', LogisticRegression()),
('rf', RandomForestClassifier(random_state=42))]
clf = StackingClassifier(estimators=estimators, cv=3)
clf.fit(X_train, y_train)
X_trans = clf.transform(X_test)
assert X_trans.shape[1] == 2
# LinearSVC does not implement 'predict_proba' and will not drop one column
estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
clf.set_params(estimators=estimators)
clf.fit(X_train, y_train)
X_trans = clf.transform(X_test)
assert X_trans.shape[1] == 2
def test_stacking_classifier_drop_estimator():
# prescale the data to avoid convergence warning without using a pipeline
# for later assert
X_train, X_test, y_train, _ = train_test_split(
scale(X_iris), y_iris, stratify=y_iris, random_state=42
)
estimators = [('lr', 'drop'), ('svc', LinearSVC(random_state=0))]
rf = RandomForestClassifier(n_estimators=10, random_state=42)
clf = StackingClassifier(
estimators=[('svc', LinearSVC(random_state=0))],
final_estimator=rf, cv=5
)
clf_drop = StackingClassifier(
estimators=estimators, final_estimator=rf, cv=5
)
clf.fit(X_train, y_train)
clf_drop.fit(X_train, y_train)
assert_allclose(clf.predict(X_test), clf_drop.predict(X_test))
assert_allclose(clf.predict_proba(X_test), clf_drop.predict_proba(X_test))
assert_allclose(clf.transform(X_test), clf_drop.transform(X_test))
def test_stacking_regressor_drop_estimator():
# prescale the data to avoid convergence warning without using a pipeline
# for later assert
X_train, X_test, y_train, _ = train_test_split(
scale(X_diabetes), y_diabetes, random_state=42
)
estimators = [('lr', 'drop'), ('svr', LinearSVR(random_state=0))]
rf = RandomForestRegressor(n_estimators=10, random_state=42)
reg = StackingRegressor(
estimators=[('svr', LinearSVR(random_state=0))],
final_estimator=rf, cv=5
)
reg_drop = StackingRegressor(
estimators=estimators, final_estimator=rf, cv=5
)
reg.fit(X_train, y_train)
reg_drop.fit(X_train, y_train)
assert_allclose(reg.predict(X_test), reg_drop.predict(X_test))
assert_allclose(reg.transform(X_test), reg_drop.transform(X_test))
@pytest.mark.parametrize(
"cv", [3, KFold(n_splits=3, shuffle=True, random_state=42)]
)
@pytest.mark.parametrize(
"final_estimator, predict_params",
[(None, {}),
(RandomForestRegressor(random_state=42), {}),
(DummyRegressor(), {'return_std': True})]
)
@pytest.mark.parametrize("passthrough", [False, True])
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params,
passthrough):
# prescale the data to avoid convergence warning without using a pipeline
# for later assert
X_train, X_test, y_train, _ = train_test_split(
scale(X_diabetes), y_diabetes, random_state=42
)
estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
reg = StackingRegressor(
estimators=estimators, final_estimator=final_estimator, cv=cv,
passthrough=passthrough
)
reg.fit(X_train, y_train)
result = reg.predict(X_test, **predict_params)
expected_result_length = 2 if predict_params else 1
if predict_params:
assert len(result) == expected_result_length
X_trans = reg.transform(X_test)
expected_column_count = 12 if passthrough else 2
assert X_trans.shape[1] == expected_column_count
if passthrough:
assert_allclose(X_test, X_trans[:, -10:])
reg.set_params(lr='drop')
reg.fit(X_train, y_train)
reg.predict(X_test)
X_trans = reg.transform(X_test)
expected_column_count_drop = 11 if passthrough else 1
assert X_trans.shape[1] == expected_column_count_drop
if passthrough:
assert_allclose(X_test, X_trans[:, -10:])
@pytest.mark.parametrize('fmt', ['csc', 'csr', 'coo'])
def test_stacking_regressor_sparse_passthrough(fmt):
# Check passthrough behavior on a sparse X matrix
X_train, X_test, y_train, _ = train_test_split(
sparse.coo_matrix(scale(X_diabetes)).asformat(fmt),
y_diabetes, random_state=42
)
estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
rf = RandomForestRegressor(n_estimators=10, random_state=42)
clf = StackingRegressor(
estimators=estimators, final_estimator=rf, cv=5, passthrough=True
)
clf.fit(X_train, y_train)
X_trans = clf.transform(X_test)
assert_allclose_dense_sparse(X_test, X_trans[:, -10:])
assert sparse.issparse(X_trans)
assert X_test.format == X_trans.format
@pytest.mark.parametrize('fmt', ['csc', 'csr', 'coo'])
def test_stacking_classifier_sparse_passthrough(fmt):
# Check passthrough behavior on a sparse X matrix
X_train, X_test, y_train, _ = train_test_split(
sparse.coo_matrix(scale(X_iris)).asformat(fmt),
y_iris, random_state=42
)
estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
rf = RandomForestClassifier(n_estimators=10, random_state=42)
clf = StackingClassifier(
estimators=estimators, final_estimator=rf, cv=5, passthrough=True
)
clf.fit(X_train, y_train)
X_trans = clf.transform(X_test)
assert_allclose_dense_sparse(X_test, X_trans[:, -4:])
assert sparse.issparse(X_trans)
assert X_test.format == X_trans.format
def test_stacking_classifier_drop_binary_prob():
    # check that the classifier drops one of the two probability columns for
    # a binary classification problem
# Select only the 2 first classes
X_, y_ = scale(X_iris[:100]), y_iris[:100]
estimators = [
('lr', LogisticRegression()), ('rf', RandomForestClassifier())
]
clf = StackingClassifier(estimators=estimators)
clf.fit(X_, y_)
X_meta = clf.transform(X_)
assert X_meta.shape[1] == 2
class NoWeightRegressor(BaseEstimator, RegressorMixin):
def fit(self, X, y):
self.reg = DummyRegressor()
return self.reg.fit(X, y)
def predict(self, X):
return np.ones(X.shape[0])
class NoWeightClassifier(BaseEstimator, ClassifierMixin):
def fit(self, X, y):
self.clf = DummyClassifier(strategy='stratified')
return self.clf.fit(X, y)
@pytest.mark.parametrize(
"y, params, type_err, msg_err",
[(y_iris,
{'estimators': None},
ValueError, "Invalid 'estimators' attribute,"),
(y_iris,
{'estimators': []},
ValueError, "Invalid 'estimators' attribute,"),
(y_iris,
{'estimators': [('lr', LogisticRegression()),
('svm', SVC(max_iter=5e4))],
'stack_method': 'predict_proba'},
ValueError, 'does not implement the method predict_proba'),
(y_iris,
{'estimators': [('lr', LogisticRegression()),
('cor', NoWeightClassifier())]},
TypeError, 'does not support sample weight'),
(y_iris,
{'estimators': [('lr', LogisticRegression()),
('cor', LinearSVC(max_iter=5e4))],
'final_estimator': NoWeightClassifier()},
TypeError, 'does not support sample weight')]
)
def test_stacking_classifier_error(y, params, type_err, msg_err):
with pytest.raises(type_err, match=msg_err):
clf = StackingClassifier(**params, cv=3)
clf.fit(
scale(X_iris), y, sample_weight=np.ones(X_iris.shape[0])
)
@pytest.mark.parametrize(
"y, params, type_err, msg_err",
[(y_diabetes,
{'estimators': None},
ValueError, "Invalid 'estimators' attribute,"),
(y_diabetes,
{'estimators': []},
ValueError, "Invalid 'estimators' attribute,"),
(y_diabetes,
{'estimators': [('lr', LinearRegression()),
('cor', NoWeightRegressor())]},
TypeError, 'does not support sample weight'),
(y_diabetes,
{'estimators': [('lr', LinearRegression()),
('cor', LinearSVR())],
'final_estimator': NoWeightRegressor()},
TypeError, 'does not support sample weight')]
)
def test_stacking_regressor_error(y, params, type_err, msg_err):
with pytest.raises(type_err, match=msg_err):
reg = StackingRegressor(**params, cv=3)
reg.fit(
scale(X_diabetes), y, sample_weight=np.ones(X_diabetes.shape[0])
)
@pytest.mark.parametrize(
"estimator, X, y",
[(StackingClassifier(
estimators=[('lr', LogisticRegression(random_state=0)),
('svm', LinearSVC(random_state=0))]),
X_iris[:100], y_iris[:100]), # keep only classes 0 and 1
(StackingRegressor(
estimators=[('lr', LinearRegression()),
('svm', LinearSVR(random_state=0))]),
X_diabetes, y_diabetes)],
ids=['StackingClassifier', 'StackingRegressor']
)
def test_stacking_randomness(estimator, X, y):
# checking that fixing the random state of the CV will lead to the same
# results
estimator_full = clone(estimator)
estimator_full.set_params(
cv=KFold(shuffle=True, random_state=np.random.RandomState(0))
)
estimator_drop = clone(estimator)
estimator_drop.set_params(lr='drop')
estimator_drop.set_params(
cv=KFold(shuffle=True, random_state=np.random.RandomState(0))
)
assert_allclose(
estimator_full.fit(X, y).transform(X)[:, 1:],
estimator_drop.fit(X, y).transform(X)
)
# These warnings are raised due to _BaseComposition
@pytest.mark.filterwarnings("ignore:TypeError occurred during set_params")
@pytest.mark.filterwarnings("ignore:Estimator's parameters changed after")
@pytest.mark.parametrize(
"estimator",
[StackingClassifier(
estimators=[('lr', LogisticRegression(random_state=0)),
('tree', DecisionTreeClassifier(random_state=0))]),
StackingRegressor(
estimators=[('lr', LinearRegression()),
('tree', DecisionTreeRegressor(random_state=0))])],
ids=['StackingClassifier', 'StackingRegressor']
)
def test_check_estimators_stacking_estimator(estimator):
check_estimator(estimator)
check_no_attributes_set_in_init(estimator.__class__.__name__, estimator)
def test_stacking_classifier_stratify_default():
# check that we stratify the classes for the default CV
clf = StackingClassifier(
estimators=[('lr', LogisticRegression(max_iter=1e4)),
('svm', LinearSVC(max_iter=1e4))]
)
# since iris is not shuffled, a simple k-fold would not contain the
# 3 classes during training
clf.fit(X_iris, y_iris)
@pytest.mark.parametrize(
"stacker, X, y",
[(StackingClassifier(
estimators=[('lr', LogisticRegression()),
('svm', LinearSVC(random_state=42))],
final_estimator=LogisticRegression(),
cv=KFold(shuffle=True, random_state=42)),
*load_breast_cancer(return_X_y=True)),
(StackingRegressor(
estimators=[('lr', LinearRegression()),
('svm', LinearSVR(random_state=42))],
final_estimator=LinearRegression(),
cv=KFold(shuffle=True, random_state=42)),
X_diabetes, y_diabetes)],
ids=['StackingClassifier', 'StackingRegressor']
)
def test_stacking_with_sample_weight(stacker, X, y):
    # check that sample weights have an influence on the fit
    # note: ConvergenceWarnings are caught since we are not worried about
    # convergence here
n_half_samples = len(y) // 2
total_sample_weight = np.array(
[0.1] * n_half_samples + [0.9] * (len(y) - n_half_samples)
)
X_train, X_test, y_train, _, sample_weight_train, _ = train_test_split(
X, y, total_sample_weight, random_state=42
)
with ignore_warnings(category=ConvergenceWarning):
stacker.fit(X_train, y_train)
y_pred_no_weight = stacker.predict(X_test)
with ignore_warnings(category=ConvergenceWarning):
stacker.fit(X_train, y_train, sample_weight=np.ones(y_train.shape))
y_pred_unit_weight = stacker.predict(X_test)
assert_allclose(y_pred_no_weight, y_pred_unit_weight)
with ignore_warnings(category=ConvergenceWarning):
stacker.fit(X_train, y_train, sample_weight=sample_weight_train)
y_pred_biased = stacker.predict(X_test)
assert np.abs(y_pred_no_weight - y_pred_biased).sum() > 0
def test_stacking_classifier_sample_weight_fit_param():
# check sample_weight is passed to all invocations of fit
stacker = StackingClassifier(
estimators=[
('lr', CheckingClassifier(expected_fit_params=['sample_weight']))
],
final_estimator=CheckingClassifier(
expected_fit_params=['sample_weight']
)
)
stacker.fit(X_iris, y_iris, sample_weight=np.ones(X_iris.shape[0]))
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize(
"stacker, X, y",
[(StackingClassifier(
estimators=[('lr', LogisticRegression()),
('svm', LinearSVC(random_state=42))],
final_estimator=LogisticRegression()),
*load_breast_cancer(return_X_y=True)),
(StackingRegressor(
estimators=[('lr', LinearRegression()),
('svm', LinearSVR(random_state=42))],
final_estimator=LinearRegression()),
X_diabetes, y_diabetes)],
ids=['StackingClassifier', 'StackingRegressor']
)
def test_stacking_cv_influence(stacker, X, y):
# check that the stacking affects the fit of the final estimator but not
# the fit of the base estimators
    # note: ConvergenceWarnings are caught since we are not worried about
    # convergence here
stacker_cv_3 = clone(stacker)
stacker_cv_5 = clone(stacker)
stacker_cv_3.set_params(cv=3)
stacker_cv_5.set_params(cv=5)
stacker_cv_3.fit(X, y)
stacker_cv_5.fit(X, y)
# the base estimators should be identical
for est_cv_3, est_cv_5 in zip(stacker_cv_3.estimators_,
stacker_cv_5.estimators_):
assert_allclose(est_cv_3.coef_, est_cv_5.coef_)
# the final estimator should be different
with pytest.raises(AssertionError, match='Not equal'):
assert_allclose(stacker_cv_3.final_estimator_.coef_,
stacker_cv_5.final_estimator_.coef_)
@pytest.mark.parametrize("make_dataset, Stacking, Estimator", [
(make_classification, StackingClassifier, LogisticRegression),
(make_regression, StackingRegressor, LinearRegression)
])
def test_stacking_without_n_features_in(make_dataset, Stacking, Estimator):
# Stacking supports estimators without `n_features_in_`. Regression test
# for #17353
class MyEstimator(Estimator):
"""Estimator without n_features_in_"""
def fit(self, X, y):
super().fit(X, y)
del self.n_features_in_
X, y = make_dataset(random_state=0, n_samples=100)
stacker = Stacking(estimators=[('lr', MyEstimator())])
msg = f"{Stacking.__name__} object has no attribute n_features_in_"
with pytest.raises(AttributeError, match=msg):
stacker.n_features_in_
# Does not raise
stacker.fit(X, y)
msg = "'MyEstimator' object has no attribute 'n_features_in_'"
with pytest.raises(AttributeError, match=msg):
stacker.n_features_in_

View file

@ -0,0 +1,574 @@
"""Testing for the VotingClassifier and VotingRegressor"""
import pytest
import re
import numpy as np
from sklearn.utils._testing import assert_almost_equal, assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_raise_message
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.datasets import make_multilabel_classification
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.dummy import DummyRegressor
# Load datasets
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target
X_r, y_r = datasets.load_diabetes(return_X_y=True)
@pytest.mark.parametrize(
"params, err_msg",
[({'estimators': []},
"Invalid 'estimators' attribute, 'estimators' should be a list of"),
({'estimators': [('lr', LogisticRegression())], 'voting': 'error'},
r"Voting must be 'soft' or 'hard'; got \(voting='error'\)"),
({'estimators': [('lr', LogisticRegression())], 'weights': [1, 2]},
"Number of `estimators` and weights must be equal")]
)
def test_voting_classifier_estimator_init(params, err_msg):
ensemble = VotingClassifier(**params)
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
def test_predictproba_hardvoting():
eclf = VotingClassifier(estimators=[('lr1', LogisticRegression()),
('lr2', LogisticRegression())],
voting='hard')
msg = "predict_proba is not available when voting='hard'"
with pytest.raises(AttributeError, match=msg):
eclf.predict_proba
assert not hasattr(eclf, "predict_proba")
eclf.fit(X, y)
assert not hasattr(eclf, "predict_proba")
def test_notfitted():
eclf = VotingClassifier(estimators=[('lr1', LogisticRegression()),
('lr2', LogisticRegression())],
voting='soft')
ereg = VotingRegressor([('dr', DummyRegressor())])
msg = ("This %s instance is not fitted yet. Call \'fit\'"
" with appropriate arguments before using this estimator.")
assert_raise_message(NotFittedError, msg % 'VotingClassifier',
eclf.predict, X)
assert_raise_message(NotFittedError, msg % 'VotingClassifier',
eclf.predict_proba, X)
assert_raise_message(NotFittedError, msg % 'VotingClassifier',
eclf.transform, X)
assert_raise_message(NotFittedError, msg % 'VotingRegressor',
ereg.predict, X_r)
assert_raise_message(NotFittedError, msg % 'VotingRegressor',
ereg.transform, X_r)
def test_majority_label_iris():
"""Check classification by majority label on dataset iris."""
clf1 = LogisticRegression(solver='liblinear', random_state=123)
clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
clf3 = GaussianNB()
eclf = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='hard')
scores = cross_val_score(eclf, X, y, scoring='accuracy')
assert_almost_equal(scores.mean(), 0.95, decimal=2)
def test_tie_situation():
"""Check voting classifier selects smaller class label in tie situation."""
clf1 = LogisticRegression(random_state=123, solver='liblinear')
clf2 = RandomForestClassifier(random_state=123)
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)],
voting='hard')
assert clf1.fit(X, y).predict(X)[73] == 2
assert clf2.fit(X, y).predict(X)[73] == 1
assert eclf.fit(X, y).predict(X)[73] == 1
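# Why the smaller label wins the tie above: with voting='hard' the ensemble
# takes a per-sample majority via a bincount followed by argmax, and numpy's
# argmax returns the first (lowest) index when counts are tied.
# Tiny illustration of that tie-break (a sketch, not the estimator's own code):
tied_votes = np.array([2, 1])  # one vote for class 2, one for class 1
assert np.argmax(np.bincount(tied_votes)) == 1  # the smaller label wins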
def test_weights_iris():
"""Check classification by average probabilities on dataset iris."""
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
clf3 = GaussianNB()
eclf = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft',
weights=[1, 2, 10])
scores = cross_val_score(eclf, X, y, scoring='accuracy')
assert_almost_equal(scores.mean(), 0.93, decimal=2)
def test_weights_regressor():
"""Check weighted average regression prediction on diabetes dataset."""
reg1 = DummyRegressor(strategy='mean')
reg2 = DummyRegressor(strategy='median')
reg3 = DummyRegressor(strategy='quantile', quantile=.2)
ereg = VotingRegressor([('mean', reg1), ('median', reg2),
('quantile', reg3)], weights=[1, 2, 10])
X_r_train, X_r_test, y_r_train, y_r_test = \
train_test_split(X_r, y_r, test_size=.25)
reg1_pred = reg1.fit(X_r_train, y_r_train).predict(X_r_test)
reg2_pred = reg2.fit(X_r_train, y_r_train).predict(X_r_test)
reg3_pred = reg3.fit(X_r_train, y_r_train).predict(X_r_test)
ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test)
avg = np.average(np.asarray([reg1_pred, reg2_pred, reg3_pred]), axis=0,
weights=[1, 2, 10])
assert_almost_equal(ereg_pred, avg, decimal=2)
ereg_weights_none = VotingRegressor([('mean', reg1), ('median', reg2),
('quantile', reg3)], weights=None)
ereg_weights_equal = VotingRegressor([('mean', reg1), ('median', reg2),
('quantile', reg3)],
weights=[1, 1, 1])
ereg_weights_none.fit(X_r_train, y_r_train)
ereg_weights_equal.fit(X_r_train, y_r_train)
ereg_none_pred = ereg_weights_none.predict(X_r_test)
ereg_equal_pred = ereg_weights_equal.predict(X_r_test)
assert_almost_equal(ereg_none_pred, ereg_equal_pred, decimal=2)
def test_predict_on_toy_problem():
"""Manually check predicted class labels for toy dataset."""
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
clf3 = GaussianNB()
X = np.array([[-1.1, -1.5],
[-1.2, -1.4],
[-3.4, -2.2],
[1.1, 1.2],
[2.1, 1.4],
[3.1, 2.3]])
y = np.array([1, 1, 1, 2, 2, 2])
assert_array_equal(clf1.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
assert_array_equal(clf2.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
assert_array_equal(clf3.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
eclf = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='hard',
weights=[1, 1, 1])
assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
eclf = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft',
weights=[1, 1, 1])
assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
def test_predict_proba_on_toy_problem():
"""Calculate predicted probabilities on toy dataset."""
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
clf3 = GaussianNB()
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
clf1_res = np.array([[0.59790391, 0.40209609],
[0.57622162, 0.42377838],
[0.50728456, 0.49271544],
[0.40241774, 0.59758226]])
clf2_res = np.array([[0.8, 0.2],
[0.8, 0.2],
[0.2, 0.8],
[0.3, 0.7]])
clf3_res = np.array([[0.9985082, 0.0014918],
[0.99845843, 0.00154157],
[0., 1.],
[0., 1.]])
t00 = (2*clf1_res[0][0] + clf2_res[0][0] + clf3_res[0][0]) / 4
t11 = (2*clf1_res[1][1] + clf2_res[1][1] + clf3_res[1][1]) / 4
t21 = (2*clf1_res[2][1] + clf2_res[2][1] + clf3_res[2][1]) / 4
t31 = (2*clf1_res[3][1] + clf2_res[3][1] + clf3_res[3][1]) / 4
eclf = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft',
weights=[2, 1, 1])
eclf_res = eclf.fit(X, y).predict_proba(X)
assert_almost_equal(t00, eclf_res[0][0], decimal=1)
assert_almost_equal(t11, eclf_res[1][1], decimal=1)
assert_almost_equal(t21, eclf_res[2][1], decimal=1)
assert_almost_equal(t31, eclf_res[3][1], decimal=1)
with pytest.raises(
AttributeError,
match="predict_proba is not available when voting='hard'"):
eclf = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='hard')
eclf.fit(X, y).predict_proba(X)
def test_multilabel():
"""Check if error is raised for multilabel classification."""
X, y = make_multilabel_classification(n_classes=2, n_labels=1,
allow_unlabeled=False,
random_state=123)
clf = OneVsRestClassifier(SVC(kernel='linear'))
eclf = VotingClassifier(estimators=[('ovr', clf)], voting='hard')
try:
eclf.fit(X, y)
except NotImplementedError:
return
def test_gridsearch():
"""Check GridSearch support."""
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
eclf = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft')
params = {'lr__C': [1.0, 100.0],
'voting': ['soft', 'hard'],
'weights': [[0.5, 0.5, 0.5], [1.0, 0.5, 0.5]]}
grid = GridSearchCV(estimator=eclf, param_grid=params)
grid.fit(iris.data, iris.target)
def test_parallel_fit():
"""Check parallel backend of VotingClassifier on toy dataset."""
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
clf3 = GaussianNB()
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
eclf1 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft',
n_jobs=1).fit(X, y)
eclf2 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft',
n_jobs=2).fit(X, y)
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
def test_sample_weight():
"""Tests sample_weight parameter of VotingClassifier"""
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
clf3 = SVC(probability=True, random_state=123)
eclf1 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('svc', clf3)],
voting='soft').fit(X, y, sample_weight=np.ones((len(y),)))
eclf2 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('svc', clf3)],
voting='soft').fit(X, y)
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
sample_weight = np.random.RandomState(123).uniform(size=(len(y),))
eclf3 = VotingClassifier(estimators=[('lr', clf1)], voting='soft')
eclf3.fit(X, y, sample_weight)
clf1.fit(X, y, sample_weight)
assert_array_equal(eclf3.predict(X), clf1.predict(X))
assert_array_almost_equal(eclf3.predict_proba(X), clf1.predict_proba(X))
    # check that an informative error is raised if sample_weight is not
    # supported.
clf4 = KNeighborsClassifier()
eclf3 = VotingClassifier(estimators=[
('lr', clf1), ('svc', clf3), ('knn', clf4)],
voting='soft')
msg = ('Underlying estimator KNeighborsClassifier does not support '
'sample weights.')
with pytest.raises(TypeError, match=msg):
eclf3.fit(X, y, sample_weight)
    # check that _fit_single_estimator will raise the right error:
    # it should re-raise the original error when it is unrelated to sample_weight
class ClassifierErrorFit(ClassifierMixin, BaseEstimator):
def fit(self, X, y, sample_weight):
raise TypeError('Error unrelated to sample_weight.')
clf = ClassifierErrorFit()
with pytest.raises(TypeError, match='Error unrelated to sample_weight'):
clf.fit(X, y, sample_weight=sample_weight)
def test_sample_weight_kwargs():
"""Check that VotingClassifier passes sample_weight as kwargs"""
class MockClassifier(ClassifierMixin, BaseEstimator):
"""Mock Classifier to check that sample_weight is received as kwargs"""
def fit(self, X, y, *args, **sample_weight):
assert 'sample_weight' in sample_weight
clf = MockClassifier()
eclf = VotingClassifier(estimators=[('mock', clf)], voting='soft')
# Should not raise an error.
eclf.fit(X, y, sample_weight=np.ones((len(y),)))
def test_voting_classifier_set_params():
# check equivalence in the output when setting underlying estimators
clf1 = LogisticRegression(random_state=123, C=1.0)
clf2 = RandomForestClassifier(random_state=123, max_depth=None)
clf3 = GaussianNB()
eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)], voting='soft',
weights=[1, 2]).fit(X, y)
eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)], voting='soft',
weights=[1, 2])
eclf2.set_params(nb=clf2).fit(X, y)
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
assert eclf2.estimators[0][1].get_params() == clf1.get_params()
assert eclf2.estimators[1][1].get_params() == clf2.get_params()
# TODO: Remove parametrization in 0.24 when None is removed in Voting*
@pytest.mark.parametrize("drop", [None, 'drop'])
def test_set_estimator_none(drop):
"""VotingClassifier set_params should be able to set estimators as None or
drop"""
# Test predict
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
clf3 = GaussianNB()
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
('nb', clf3)],
voting='hard', weights=[1, 0, 0.5]).fit(X, y)
eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
('nb', clf3)],
voting='hard', weights=[1, 1, 0.5])
with pytest.warns(None) as record:
eclf2.set_params(rf=drop).fit(X, y)
assert record if drop is None else not record
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert dict(eclf2.estimators)["rf"] is drop
assert len(eclf2.estimators_) == 2
assert all(isinstance(est, (LogisticRegression, GaussianNB))
for est in eclf2.estimators_)
assert eclf2.get_params()["rf"] is drop
eclf1.set_params(voting='soft').fit(X, y)
with pytest.warns(None) as record:
eclf2.set_params(voting='soft').fit(X, y)
assert record if drop is None else not record
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
msg = 'All estimators are dropped. At least one is required'
with pytest.warns(None) as record:
with pytest.raises(ValueError, match=msg):
eclf2.set_params(lr=drop, rf=drop, nb=drop).fit(X, y)
assert record if drop is None else not record
# Test soft voting transform
X1 = np.array([[1], [2]])
y1 = np.array([1, 2])
eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
voting='soft', weights=[0, 0.5],
flatten_transform=False).fit(X1, y1)
eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
voting='soft', weights=[1, 0.5],
flatten_transform=False)
with pytest.warns(None) as record:
eclf2.set_params(rf=drop).fit(X1, y1)
assert record if drop is None else not record
assert_array_almost_equal(eclf1.transform(X1),
np.array([[[0.7, 0.3], [0.3, 0.7]],
[[1., 0.], [0., 1.]]]))
assert_array_almost_equal(eclf2.transform(X1),
np.array([[[1., 0.],
[0., 1.]]]))
eclf1.set_params(voting='hard')
eclf2.set_params(voting='hard')
assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
def test_estimator_weights_format():
# Test estimator weights inputs as list and array
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
eclf1 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2)],
weights=[1, 2],
voting='soft')
eclf2 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2)],
weights=np.array((1, 2)),
voting='soft')
eclf1.fit(X, y)
eclf2.fit(X, y)
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
def test_transform():
"""Check transform method of VotingClassifier on toy dataset."""
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
clf3 = GaussianNB()
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
eclf1 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft').fit(X, y)
eclf2 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft',
flatten_transform=True).fit(X, y)
eclf3 = VotingClassifier(estimators=[
('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft',
flatten_transform=False).fit(X, y)
assert_array_equal(eclf1.transform(X).shape, (4, 6))
assert_array_equal(eclf2.transform(X).shape, (4, 6))
assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
assert_array_almost_equal(eclf1.transform(X),
eclf2.transform(X))
assert_array_almost_equal(
eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)),
eclf2.transform(X)
)
# TODO: Remove drop=None in 0.24 when None is removed in Voting*
@pytest.mark.parametrize(
"X, y, voter",
[(X, y, VotingClassifier(
[('lr', LogisticRegression()),
('rf', RandomForestClassifier(n_estimators=5))])),
(X_r, y_r, VotingRegressor(
[('lr', LinearRegression()),
('rf', RandomForestRegressor(n_estimators=5))]))]
)
@pytest.mark.parametrize("drop", [None, 'drop'])
def test_none_estimator_with_weights(X, y, voter, drop):
# TODO: remove the parametrization on 'drop' when support for None is
# removed.
# check that an estimator can be set to 'drop' while sample weights are passed
# regression test for
# https://github.com/scikit-learn/scikit-learn/issues/13777
voter = clone(voter)
voter.fit(X, y, sample_weight=np.ones(y.shape))
voter.set_params(lr=drop)
with pytest.warns(None) as record:
voter.fit(X, y, sample_weight=np.ones(y.shape))
assert record if drop is None else not record
y_pred = voter.predict(X)
assert y_pred.shape == y.shape
@pytest.mark.parametrize(
"estimator",
[VotingRegressor(
estimators=[('lr', LinearRegression()),
('tree', DecisionTreeRegressor(random_state=0))]),
VotingClassifier(
estimators=[('lr', LogisticRegression(random_state=0)),
('tree', DecisionTreeClassifier(random_state=0))])],
ids=['VotingRegressor', 'VotingClassifier']
)
def test_check_estimators_voting_estimator(estimator):
# FIXME: to be removed when meta-estimators can specify their testing
# parameters themselves (for required parameters).
check_estimator(estimator)
check_no_attributes_set_in_init(estimator.__class__.__name__, estimator)
@pytest.mark.parametrize(
"est",
[VotingRegressor(
estimators=[('lr', LinearRegression()),
('tree', DecisionTreeRegressor(random_state=0))]),
VotingClassifier(
estimators=[('lr', LogisticRegression(random_state=0)),
('tree', DecisionTreeClassifier(random_state=0))])],
ids=['VotingRegressor', 'VotingClassifier']
)
def test_n_features_in(est):
X = [[1, 2], [3, 4], [5, 6]]
y = [0, 1, 2]
assert not hasattr(est, 'n_features_in_')
est.fit(X, y)
assert est.n_features_in_ == 2
@pytest.mark.parametrize(
"estimator",
[VotingRegressor(
estimators=[('lr', LinearRegression()),
('rf', RandomForestRegressor(random_state=123))],
verbose=True),
VotingClassifier(
estimators=[('lr', LogisticRegression(random_state=123)),
('rf', RandomForestClassifier(random_state=123))],
verbose=True)]
)
def test_voting_verbose(estimator, capsys):
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
pattern = (r'\[Voting\].*\(1 of 2\) Processing lr, total=.*\n'
r'\[Voting\].*\(2 of 2\) Processing rf, total=.*\n$')
estimator.fit(X, y)
assert re.match(pattern, capsys.readouterr()[0])
# TODO: Remove in 0.24 when None is removed in Voting*
@pytest.mark.parametrize(
"Voter, BaseEstimator",
[(VotingClassifier, DecisionTreeClassifier),
(VotingRegressor, DecisionTreeRegressor)]
)
def test_deprecate_none_transformer(Voter, BaseEstimator):
est = Voter(estimators=[('lr', None),
('tree', BaseEstimator(random_state=0))])
msg = ("Using 'None' to drop an estimator from the ensemble is "
"deprecated in 0.22 and support will be dropped in 0.24. "
"Use the string 'drop' instead.")
with pytest.warns(FutureWarning, match=msg):
est.fit(X, y)

View file

@@ -0,0 +1,582 @@
"""Testing for the boost module (sklearn.ensemble.boost)."""
import numpy as np
import pytest
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
from scipy.sparse import dok_matrix
from scipy.sparse import lil_matrix
from sklearn.utils._testing import assert_array_equal, assert_array_less
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_raises, assert_raises_regexp
from sklearn.utils._testing import ignore_warnings
from sklearn.base import BaseEstimator
from sklearn.base import clone
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble._weight_boosting import _samme_proba
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import shuffle
from sklearn.utils._mocking import NoSampleWeightWrapper
from sklearn import datasets
# Common random state
rng = np.random.RandomState(0)
# Toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y_class = ["foo", "foo", "foo", 1, 1, 1] # test string class labels
y_regr = [-1, -1, -1, 1, 1, 1]
T = [[-1, -1], [2, 2], [3, 2]]
y_t_class = ["foo", 1, 1]
y_t_regr = [-1, 1, 1]
# Load the iris dataset and randomly permute it
iris = datasets.load_iris()
perm = rng.permutation(iris.target.size)
iris.data, iris.target = shuffle(iris.data, iris.target, random_state=rng)
# Load the boston dataset and randomly permute it
boston = datasets.load_boston()
boston.data, boston.target = shuffle(boston.data, boston.target,
random_state=rng)
def test_samme_proba():
# Test the `_samme_proba` helper function.
# Define some example (bad) `predict_proba` output.
probs = np.array([[1, 1e-6, 0],
[0.19, 0.6, 0.2],
[-999, 0.51, 0.5],
[1e-6, 1, 1e-9]])
probs /= np.abs(probs.sum(axis=1))[:, np.newaxis]
# _samme_proba calls estimator.predict_proba.
# Make a mock object so I can control what gets returned.
class MockEstimator:
def predict_proba(self, X):
assert_array_equal(X.shape, probs.shape)
return probs
mock = MockEstimator()
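# _samme_proba applies the SAMME.R transform, roughly
# (n_classes - 1) * (log p - mean(log p)) per row, after clipping the
# probabilities away from zero, so the output is finite and order-preserving.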
samme_proba = _samme_proba(mock, 3, np.ones_like(probs))
assert_array_equal(samme_proba.shape, probs.shape)
assert np.isfinite(samme_proba).all()
# Make sure that the correct elements come out as smallest --
# `_samme_proba` should preserve the ordering in each example.
assert_array_equal(np.argmin(samme_proba, axis=1), [2, 0, 0, 2])
assert_array_equal(np.argmax(samme_proba, axis=1), [0, 1, 1, 1])
def test_oneclass_adaboost_proba():
# Test predict_proba robustness for one class label input.
# In response to issue #7501
# https://github.com/scikit-learn/scikit-learn/issues/7501
y_t = np.ones(len(X))
clf = AdaBoostClassifier().fit(X, y_t)
assert_array_almost_equal(clf.predict_proba(X), np.ones((len(X), 1)))
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_classification_toy(algorithm):
# Check classification on a toy dataset.
clf = AdaBoostClassifier(algorithm=algorithm, random_state=0)
clf.fit(X, y_class)
assert_array_equal(clf.predict(T), y_t_class)
assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_)
assert clf.predict_proba(T).shape == (len(T), 2)
assert clf.decision_function(T).shape == (len(T),)
def test_regression_toy():
# Check regression on a toy dataset.
clf = AdaBoostRegressor(random_state=0)
clf.fit(X, y_regr)
assert_array_equal(clf.predict(T), y_t_regr)
def test_iris():
# Check consistency on the iris dataset.
classes = np.unique(iris.target)
clf_samme = prob_samme = None
for alg in ['SAMME', 'SAMME.R']:
clf = AdaBoostClassifier(algorithm=alg)
clf.fit(iris.data, iris.target)
assert_array_equal(classes, clf.classes_)
proba = clf.predict_proba(iris.data)
if alg == "SAMME":
clf_samme = clf
prob_samme = proba
assert proba.shape[1] == len(classes)
assert clf.decision_function(iris.data).shape[1] == len(classes)
score = clf.score(iris.data, iris.target)
assert score > 0.9, "Failed with algorithm %s and score = %f" % \
(alg, score)
# Check we used multiple estimators
assert len(clf.estimators_) > 1
# Check for distinct random states (see issue #7408)
assert (len(set(est.random_state for est in clf.estimators_)) ==
len(clf.estimators_))
# Somewhat hacky regression test: prior to
# ae7adc880d624615a34bafdb1d75ef67051b8200,
# predict_proba returned SAMME.R values for SAMME.
clf_samme.algorithm = "SAMME.R"
assert_array_less(0,
np.abs(clf_samme.predict_proba(iris.data) - prob_samme))
@pytest.mark.parametrize('loss', ['linear', 'square', 'exponential'])
def test_boston(loss):
# Check consistency on the Boston house prices dataset.
reg = AdaBoostRegressor(loss=loss, random_state=0)
reg.fit(boston.data, boston.target)
score = reg.score(boston.data, boston.target)
assert score > 0.85
# Check we used multiple estimators
assert len(reg.estimators_) > 1
# Check for distinct random states (see issue #7408)
assert (len(set(est.random_state for est in reg.estimators_)) ==
len(reg.estimators_))
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_staged_predict(algorithm):
# Check staged predictions.
rng = np.random.RandomState(0)
iris_weights = rng.randint(10, size=iris.target.shape)
boston_weights = rng.randint(10, size=boston.target.shape)
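# The staged_* methods yield one result per boosting iteration, so each
# staged list below is expected to have n_estimators (here 10) entries.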
clf = AdaBoostClassifier(algorithm=algorithm, n_estimators=10)
clf.fit(iris.data, iris.target, sample_weight=iris_weights)
predictions = clf.predict(iris.data)
staged_predictions = [p for p in clf.staged_predict(iris.data)]
proba = clf.predict_proba(iris.data)
staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
score = clf.score(iris.data, iris.target, sample_weight=iris_weights)
staged_scores = [
s for s in clf.staged_score(
iris.data, iris.target, sample_weight=iris_weights)]
assert len(staged_predictions) == 10
assert_array_almost_equal(predictions, staged_predictions[-1])
assert len(staged_probas) == 10
assert_array_almost_equal(proba, staged_probas[-1])
assert len(staged_scores) == 10
assert_array_almost_equal(score, staged_scores[-1])
# AdaBoost regression
clf = AdaBoostRegressor(n_estimators=10, random_state=0)
clf.fit(boston.data, boston.target, sample_weight=boston_weights)
predictions = clf.predict(boston.data)
staged_predictions = [p for p in clf.staged_predict(boston.data)]
score = clf.score(boston.data, boston.target, sample_weight=boston_weights)
staged_scores = [
s for s in clf.staged_score(
boston.data, boston.target, sample_weight=boston_weights)]
assert len(staged_predictions) == 10
assert_array_almost_equal(predictions, staged_predictions[-1])
assert len(staged_scores) == 10
assert_array_almost_equal(score, staged_scores[-1])
def test_gridsearch():
# Check that base trees can be grid-searched.
# AdaBoost classification
boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
parameters = {'n_estimators': (1, 2),
'base_estimator__max_depth': (1, 2),
'algorithm': ('SAMME', 'SAMME.R')}
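# Parameters of the nested base estimator are addressed with the usual
# double-underscore convention (e.g. 'base_estimator__max_depth').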
clf = GridSearchCV(boost, parameters)
clf.fit(iris.data, iris.target)
# AdaBoost regression
boost = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
random_state=0)
parameters = {'n_estimators': (1, 2),
'base_estimator__max_depth': (1, 2)}
clf = GridSearchCV(boost, parameters)
clf.fit(boston.data, boston.target)
def test_pickle():
# Check picklability.
import pickle
# Adaboost classifier
for alg in ['SAMME', 'SAMME.R']:
obj = AdaBoostClassifier(algorithm=alg)
obj.fit(iris.data, iris.target)
score = obj.score(iris.data, iris.target)
s = pickle.dumps(obj)
obj2 = pickle.loads(s)
assert type(obj2) == obj.__class__
score2 = obj2.score(iris.data, iris.target)
assert score == score2
# Adaboost regressor
obj = AdaBoostRegressor(random_state=0)
obj.fit(boston.data, boston.target)
score = obj.score(boston.data, boston.target)
s = pickle.dumps(obj)
obj2 = pickle.loads(s)
assert type(obj2) == obj.__class__
score2 = obj2.score(boston.data, boston.target)
assert score == score2
def test_importances():
# Check variable importances.
X, y = datasets.make_classification(n_samples=2000,
n_features=10,
n_informative=3,
n_redundant=0,
n_repeated=0,
shuffle=False,
random_state=1)
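# With shuffle=False the 3 informative features occupy the first columns,
# so their importances are expected to dominate the remaining features.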
for alg in ['SAMME', 'SAMME.R']:
clf = AdaBoostClassifier(algorithm=alg)
clf.fit(X, y)
importances = clf.feature_importances_
assert importances.shape[0] == 10
assert (importances[:3, np.newaxis] >= importances[3:]).all()
def test_error():
# Test that a proper exception is raised on invalid input.
assert_raises(ValueError,
AdaBoostClassifier(learning_rate=-1).fit,
X, y_class)
assert_raises(ValueError,
AdaBoostClassifier(algorithm="foo").fit,
X, y_class)
assert_raises(ValueError,
AdaBoostClassifier().fit,
X, y_class, sample_weight=np.asarray([-1]))
def test_base_estimator():
# Test different base estimators.
from sklearn.ensemble import RandomForestClassifier
# XXX doesn't work with y_class because RF doesn't support classes_
# Shouldn't AdaBoost run a LabelBinarizer?
clf = AdaBoostClassifier(RandomForestClassifier())
clf.fit(X, y_regr)
clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
clf.fit(X, y_class)
from sklearn.ensemble import RandomForestRegressor
clf = AdaBoostRegressor(RandomForestRegressor(), random_state=0)
clf.fit(X, y_regr)
clf = AdaBoostRegressor(SVR(), random_state=0)
clf.fit(X, y_regr)
# Check that an empty discrete ensemble fails in fit, not predict.
X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]]
y_fail = ["foo", "bar", 1, 2]
clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
assert_raises_regexp(ValueError, "worse than random",
clf.fit, X_fail, y_fail)
def test_sparse_classification():
# Check classification with sparse input.
class CustomSVC(SVC):
"""SVC variant that records the nature of the training set."""
def fit(self, X, y, sample_weight=None):
"""Modification on fit caries data type for later verification."""
super().fit(X, y, sample_weight=sample_weight)
self.data_type_ = type(X)
return self
X, y = datasets.make_multilabel_classification(n_classes=1, n_samples=15,
n_features=5,
random_state=42)
# Flatten y to a 1d array
y = np.ravel(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
dok_matrix]:
X_train_sparse = sparse_format(X_train)
X_test_sparse = sparse_format(X_test)
# Trained on sparse format
sparse_classifier = AdaBoostClassifier(
base_estimator=CustomSVC(probability=True),
random_state=1,
algorithm="SAMME"
).fit(X_train_sparse, y_train)
# Trained on dense format
dense_classifier = AdaBoostClassifier(
base_estimator=CustomSVC(probability=True),
random_state=1,
algorithm="SAMME"
).fit(X_train, y_train)
# predict
sparse_results = sparse_classifier.predict(X_test_sparse)
dense_results = dense_classifier.predict(X_test)
assert_array_equal(sparse_results, dense_results)
# decision_function
sparse_results = sparse_classifier.decision_function(X_test_sparse)
dense_results = dense_classifier.decision_function(X_test)
assert_array_almost_equal(sparse_results, dense_results)
# predict_log_proba
sparse_results = sparse_classifier.predict_log_proba(X_test_sparse)
dense_results = dense_classifier.predict_log_proba(X_test)
assert_array_almost_equal(sparse_results, dense_results)
# predict_proba
sparse_results = sparse_classifier.predict_proba(X_test_sparse)
dense_results = dense_classifier.predict_proba(X_test)
assert_array_almost_equal(sparse_results, dense_results)
# score
sparse_results = sparse_classifier.score(X_test_sparse, y_test)
dense_results = dense_classifier.score(X_test, y_test)
assert_array_almost_equal(sparse_results, dense_results)
# staged_decision_function
sparse_results = sparse_classifier.staged_decision_function(
X_test_sparse)
dense_results = dense_classifier.staged_decision_function(X_test)
for sparse_res, dense_res in zip(sparse_results, dense_results):
assert_array_almost_equal(sparse_res, dense_res)
# staged_predict
sparse_results = sparse_classifier.staged_predict(X_test_sparse)
dense_results = dense_classifier.staged_predict(X_test)
for sparse_res, dense_res in zip(sparse_results, dense_results):
assert_array_equal(sparse_res, dense_res)
# staged_predict_proba
sparse_results = sparse_classifier.staged_predict_proba(X_test_sparse)
dense_results = dense_classifier.staged_predict_proba(X_test)
for sparse_res, dense_res in zip(sparse_results, dense_results):
assert_array_almost_equal(sparse_res, dense_res)
# staged_score
sparse_results = sparse_classifier.staged_score(X_test_sparse,
y_test)
dense_results = dense_classifier.staged_score(X_test, y_test)
for sparse_res, dense_res in zip(sparse_results, dense_results):
assert_array_equal(sparse_res, dense_res)
# Verify sparsity of data is maintained during training
types = [i.data_type_ for i in sparse_classifier.estimators_]
assert all([(t == csc_matrix or t == csr_matrix)
for t in types])
def test_sparse_regression():
# Check regression with sparse input.
class CustomSVR(SVR):
"""SVR variant that records the nature of the training set."""
def fit(self, X, y, sample_weight=None):
"""Modification on fit caries data type for later verification."""
super().fit(X, y, sample_weight=sample_weight)
self.data_type_ = type(X)
return self
X, y = datasets.make_regression(n_samples=15, n_features=50, n_targets=1,
random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
dok_matrix]:
X_train_sparse = sparse_format(X_train)
X_test_sparse = sparse_format(X_test)
# Trained on sparse format
sparse_classifier = AdaBoostRegressor(
base_estimator=CustomSVR(),
random_state=1
).fit(X_train_sparse, y_train)
# Trained on dense format
dense_classifier = AdaBoostRegressor(
base_estimator=CustomSVR(),
random_state=1
).fit(X_train, y_train)
# predict
sparse_results = sparse_classifier.predict(X_test_sparse)
dense_results = dense_classifier.predict(X_test)
assert_array_almost_equal(sparse_results, dense_results)
# staged_predict
sparse_results = sparse_classifier.staged_predict(X_test_sparse)
dense_results = dense_classifier.staged_predict(X_test)
for sparse_res, dense_res in zip(sparse_results, dense_results):
assert_array_almost_equal(sparse_res, dense_res)
types = [i.data_type_ for i in sparse_classifier.estimators_]
assert all([(t == csc_matrix or t == csr_matrix)
for t in types])
def test_sample_weight_adaboost_regressor():
"""
AdaBoostRegressor should work without sample_weight support in the base
estimator. The random weighted sampling is done internally in the _boost
method of AdaBoostRegressor.
"""
class DummyEstimator(BaseEstimator):
def fit(self, X, y):
pass
def predict(self, X):
return np.zeros(X.shape[0])
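# DummyEstimator.fit deliberately accepts no sample_weight, forcing
# AdaBoostRegressor to fall back on its internal weighted resampling.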
boost = AdaBoostRegressor(DummyEstimator(), n_estimators=3)
boost.fit(X, y_regr)
assert len(boost.estimator_weights_) == len(boost.estimator_errors_)
def test_multidimensional_X():
"""
Check that the AdaBoost estimators can work with an n-dimensional
data matrix.
"""
rng = np.random.RandomState(0)
X = rng.randn(50, 3, 3)
yc = rng.choice([0, 1], 50)
yr = rng.randn(50)
boost = AdaBoostClassifier(DummyClassifier(strategy='most_frequent'))
boost.fit(X, yc)
boost.predict(X)
boost.predict_proba(X)
boost = AdaBoostRegressor(DummyRegressor())
boost.fit(X, yr)
boost.predict(X)
# TODO: Remove in 0.24 when DummyClassifier's `strategy` default changes
@ignore_warnings
@pytest.mark.parametrize("algorithm", ['SAMME', 'SAMME.R'])
def test_adaboostclassifier_without_sample_weight(algorithm):
X, y = iris.data, iris.target
base_estimator = NoSampleWeightWrapper(DummyClassifier())
clf = AdaBoostClassifier(
base_estimator=base_estimator, algorithm=algorithm
)
err_msg = ("{} doesn't support sample_weight"
.format(base_estimator.__class__.__name__))
with pytest.raises(ValueError, match=err_msg):
clf.fit(X, y)
def test_adaboostregressor_sample_weight():
# check that sample weights have an influence on the error computed
# for a weak learner
rng = np.random.RandomState(42)
X = np.linspace(0, 100, num=1000)
y = (.8 * X + 0.2) + (rng.rand(X.shape[0]) * 0.0001)
X = X.reshape(-1, 1)
# add an arbitrary outlier
X[-1] *= 10
y[-1] = 10000
# random_state=0 ensures that the underlying bootstrap will use the outlier
regr_no_outlier = AdaBoostRegressor(
base_estimator=LinearRegression(), n_estimators=1, random_state=0
)
regr_with_weight = clone(regr_no_outlier)
regr_with_outlier = clone(regr_no_outlier)
# fit 3 models:
# - a model containing the outlier
# - a model without the outlier
# - a model containing the outlier but with a null sample-weight
regr_with_outlier.fit(X, y)
regr_no_outlier.fit(X[:-1], y[:-1])
sample_weight = np.ones_like(y)
sample_weight[-1] = 0
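# A zero weight should make the outlier effectively invisible, so the
# weighted fit is expected to score like the fit that never saw the outlier.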
regr_with_weight.fit(X, y, sample_weight=sample_weight)
score_with_outlier = regr_with_outlier.score(X[:-1], y[:-1])
score_no_outlier = regr_no_outlier.score(X[:-1], y[:-1])
score_with_weight = regr_with_weight.score(X[:-1], y[:-1])
assert score_with_outlier < score_no_outlier
assert score_with_outlier < score_with_weight
assert score_no_outlier == pytest.approx(score_with_weight)
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_adaboost_consistent_predict(algorithm):
# check that predict_proba and predict give consistent results
# regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/14084
X_train, X_test, y_train, y_test = train_test_split(
*datasets.load_digits(return_X_y=True), random_state=42
)
model = AdaBoostClassifier(algorithm=algorithm, random_state=42)
model.fit(X_train, y_train)
assert_array_equal(
np.argmax(model.predict_proba(X_test), axis=1),
model.predict(X_test)
)
@pytest.mark.parametrize(
'model, X, y',
[(AdaBoostClassifier(), iris.data, iris.target),
(AdaBoostRegressor(), boston.data, boston.target)]
)
def test_adaboost_negative_weight_error(model, X, y):
sample_weight = np.ones_like(y)
sample_weight[-1] = -10
err_msg = "sample_weight cannot contain negative weight"
with pytest.raises(ValueError, match=err_msg):
model.fit(X, y, sample_weight=sample_weight)

Some files were not shown because too many files have changed in this diff.