Uploaded Test files

Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions


@@ -0,0 +1,5 @@
"""This module implements histogram-based gradient boosting estimators.
The implementation is a port from pygbm which is itself strongly inspired
from LightGBM.
"""


@@ -0,0 +1,204 @@
"""
This module contains the BinMapper class.
BinMapper is used for mapping a real-valued dataset into integer-valued bins.
Bin thresholds are computed with the quantiles so that each bin contains
approximately the same number of samples.
"""
# Author: Nicolas Hug
import numpy as np
from ...utils import check_random_state, check_array
from ...base import BaseEstimator, TransformerMixin
from ...utils.validation import check_is_fitted
from ._binning import _map_to_bins
from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF
def _find_binning_thresholds(data, max_bins, subsample, random_state):
"""Extract feature-wise quantiles from numerical data.
Missing values are ignored for finding the thresholds.
Parameters
----------
data : array-like, shape (n_samples, n_features)
The data to bin.
max_bins: int
The maximum number of bins to use for non-missing values. If for a
given feature the number of unique values is less than ``max_bins``,
then those unique values will be used to compute the bin thresholds,
instead of the quantiles.
subsample : int or None
        If ``n_samples > subsample``, then ``subsample`` samples will be
randomly chosen to compute the quantiles. If ``None``, the whole data
is used.
random_state: int, RandomState instance or None
Pseudo-random number generator to control the random sub-sampling.
Pass an int for reproducible output across multiple
function calls.
        See :term:`Glossary <random_state>`.
    Returns
    -------
binning_thresholds: list of arrays
For each feature, stores the increasing numeric values that can
be used to separate the bins. Thus ``len(binning_thresholds) ==
n_features``.
"""
rng = check_random_state(random_state)
if subsample is not None and data.shape[0] > subsample:
subset = rng.choice(data.shape[0], subsample, replace=False)
data = data.take(subset, axis=0)
binning_thresholds = []
for f_idx in range(data.shape[1]):
col_data = data[:, f_idx]
# ignore missing values when computing bin thresholds
missing_mask = np.isnan(col_data)
if missing_mask.any():
col_data = col_data[~missing_mask]
col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)
distinct_values = np.unique(col_data)
if len(distinct_values) <= max_bins:
midpoints = distinct_values[:-1] + distinct_values[1:]
midpoints *= .5
else:
# We sort again the data in this case. We could compute
# approximate midpoint percentiles using the output of
# np.unique(col_data, return_counts) instead but this is more
# work and the performance benefit will be limited because we
# work on a fixed-size subsample of the full data.
percentiles = np.linspace(0, 100, num=max_bins + 1)
percentiles = percentiles[1:-1]
midpoints = np.percentile(col_data, percentiles,
interpolation='midpoint').astype(X_DTYPE)
assert midpoints.shape[0] == max_bins - 1
# We avoid having +inf thresholds: +inf thresholds are only allowed in
# a "split on nan" situation.
np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints)
binning_thresholds.append(midpoints)
return binning_thresholds
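# A small illustration of the two branches above (added for reference, not
# part of the original module): with ten distinct values 0..9 and
# ``max_bins=4``, the number of distinct values exceeds ``max_bins``, so the
# thresholds are the 25th/50th/75th percentiles with midpoint interpolation:
#
#     >>> col = np.arange(10, dtype=X_DTYPE)
#     >>> np.percentile(col, [25, 50, 75], interpolation='midpoint')
#     array([2.5, 4.5, 6.5])
#
# whereas with only 3 distinct values (say 0, 1, 2), the thresholds are the
# midpoints between consecutive distinct values, i.e. [0.5, 1.5].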
class _BinMapper(TransformerMixin, BaseEstimator):
"""Transformer that maps a dataset into integer-valued bins.
The bins are created in a feature-wise fashion, using quantiles so that
    each bin contains approximately the same number of samples.
For large datasets, quantiles are computed on a subset of the data to
speed-up the binning, but the quantiles should remain stable.
    Features with a small number of values may be binned into fewer than
``n_bins`` bins. The last bin (at index ``n_bins - 1``) is always reserved
for missing values.
Parameters
----------
n_bins : int, optional (default=256)
The maximum number of bins to use (including the bin for missing
values). Non-missing values are binned on ``max_bins = n_bins - 1``
bins. The last bin is always reserved for missing values. If for a
given feature the number of unique values is less than ``max_bins``,
then those unique values will be used to compute the bin thresholds,
instead of the quantiles.
subsample : int or None, optional (default=2e5)
        If ``n_samples > subsample``, then ``subsample`` samples will be
randomly chosen to compute the quantiles. If ``None``, the whole data
is used.
random_state: int, RandomState instance or None
Pseudo-random number generator to control the random sub-sampling.
Pass an int for reproducible output across multiple
function calls.
        See :term:`Glossary <random_state>`.
Attributes
----------
bin_thresholds_ : list of arrays
        For each feature, gives the real-valued bin thresholds. There are
``max_bins - 1`` thresholds, where ``max_bins = n_bins - 1`` is the
number of bins used for non-missing values.
n_bins_non_missing_ : array of uint32
For each feature, gives the number of bins actually used for
non-missing values. For features with a lot of unique values, this is
equal to ``n_bins - 1``.
missing_values_bin_idx_ : uint8
The index of the bin where missing values are mapped. This is a
constant across all features. This corresponds to the last bin, and
        it is always equal to ``n_bins - 1``. Note that if ``n_bins_non_missing_``
is less than ``n_bins - 1`` for a given feature, then there are
empty (and unused) bins.
"""
def __init__(self, n_bins=256, subsample=int(2e5), random_state=None):
self.n_bins = n_bins
self.subsample = subsample
self.random_state = random_state
def fit(self, X, y=None):
"""Fit data X by computing the binning thresholds.
The last bin is reserved for missing values, whether missing values
are present in the data or not.
Parameters
----------
X : array-like, shape (n_samples, n_features)
The data to bin.
y: None
Ignored.
Returns
-------
self : object
"""
if not (3 <= self.n_bins <= 256):
# min is 3: at least 2 distinct bins and a missing values bin
raise ValueError('n_bins={} should be no smaller than 3 '
'and no larger than 256.'.format(self.n_bins))
X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
max_bins = self.n_bins - 1
self.bin_thresholds_ = _find_binning_thresholds(
X, max_bins, subsample=self.subsample,
random_state=self.random_state)
self.n_bins_non_missing_ = np.array(
[thresholds.shape[0] + 1 for thresholds in self.bin_thresholds_],
dtype=np.uint32)
self.missing_values_bin_idx_ = self.n_bins - 1
return self
def transform(self, X):
"""Bin data X.
Missing values will be mapped to the last bin.
Parameters
----------
X : array-like, shape (n_samples, n_features)
The data to bin.
Returns
-------
X_binned : array-like, shape (n_samples, n_features)
The binned data (fortran-aligned).
"""
X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
check_is_fitted(self)
if X.shape[1] != self.n_bins_non_missing_.shape[0]:
raise ValueError(
'This estimator was fitted with {} features but {} got passed '
'to transform()'.format(self.n_bins_non_missing_.shape[0],
X.shape[1])
)
binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F')
_map_to_bins(X, self.bin_thresholds_, self.missing_values_bin_idx_,
binned)
return binned
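# A minimal usage sketch of the (private) _BinMapper, assuming it is run from
# within scikit-learn's _hist_gradient_boosting package. Missing values are
# mapped to the last bin (index n_bins - 1), other values to quantile bins.
if __name__ == '__main__':
    X = np.array([[1.0], [2.0], [np.nan], [3.0], [4.0]])
    mapper = _BinMapper(n_bins=256)
    X_binned = mapper.fit_transform(X)
    # 4 distinct non-missing values -> thresholds [1.5, 2.5, 3.5] and
    # bins [0, 1, 255, 2, 3], where 255 is the missing-values bin.
    print(X_binned.ravel())
    print(mapper.bin_thresholds_[0], mapper.missing_values_bin_idx_)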


@@ -0,0 +1,40 @@
# cython: language_level=3
import numpy as np
cimport numpy as np
np.import_array()
ctypedef np.npy_float64 X_DTYPE_C
ctypedef np.npy_uint8 X_BINNED_DTYPE_C
ctypedef np.npy_float64 Y_DTYPE_C
ctypedef np.npy_float32 G_H_DTYPE_C
cdef packed struct hist_struct:
# Same as histogram dtype but we need a struct to declare views. It needs
# to be packed since by default numpy dtypes aren't aligned
Y_DTYPE_C sum_gradients
Y_DTYPE_C sum_hessians
unsigned int count
cdef packed struct node_struct:
# Equivalent struct to PREDICTOR_RECORD_DTYPE to use in memory views. It
# needs to be packed since by default numpy dtypes aren't aligned
Y_DTYPE_C value
unsigned int count
unsigned int feature_idx
X_DTYPE_C threshold
unsigned char missing_go_to_left
unsigned int left
unsigned int right
Y_DTYPE_C gain
unsigned int depth
unsigned char is_leaf
X_BINNED_DTYPE_C bin_threshold
cpdef enum MonotonicConstraint:
NO_CST = 0
POS = 1
NEG = -1
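# For reference (an illustration, not part of this file): the packed structs
# above mirror numpy structured dtypes defined on the Python side, roughly of
# the form
#
#     HISTOGRAM_DTYPE = np.dtype([
#         ('sum_gradients', Y_DTYPE),  # float64
#         ('sum_hessians', Y_DTYPE),   # float64
#         ('count', np.uint32),
#     ])
#
# so that histogram arrays allocated with numpy can be viewed as
# ``hist_struct [:, ::1]`` memoryviews without copying.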


@@ -0,0 +1,571 @@
"""
This module contains the TreeGrower class.
TreeGrower builds a regression tree fitting a Newton-Raphson step, based on
the gradients and hessians of the training data.
"""
# Author: Nicolas Hug
from heapq import heappush, heappop
import numpy as np
from timeit import default_timer as time
import numbers
from .splitting import Splitter
from .histogram import HistogramBuilder
from .predictor import TreePredictor
from .utils import sum_parallel
from .common import PREDICTOR_RECORD_DTYPE
from .common import Y_DTYPE
from .common import MonotonicConstraint
EPS = np.finfo(Y_DTYPE).eps # to avoid zero division errors
class TreeNode:
"""Tree Node class used in TreeGrower.
This isn't used for prediction purposes, only for training (see
TreePredictor).
Parameters
----------
depth : int
The depth of the node, i.e. its distance from the root.
sample_indices : ndarray of unsigned int, shape (n_samples_at_node,)
The indices of the samples at the node.
sum_gradients : float
The sum of the gradients of the samples at the node.
sum_hessians : float
The sum of the hessians of the samples at the node.
parent : TreeNode or None, optional (default=None)
The parent of the node. None for root.
Attributes
----------
depth : int
The depth of the node, i.e. its distance from the root.
sample_indices : ndarray of unsigned int, shape (n_samples_at_node,)
The indices of the samples at the node.
sum_gradients : float
The sum of the gradients of the samples at the node.
sum_hessians : float
The sum of the hessians of the samples at the node.
parent : TreeNode or None
The parent of the node. None for root.
split_info : SplitInfo or None
The result of the split evaluation.
left_child : TreeNode or None
The left child of the node. None for leaves.
right_child : TreeNode or None
The right child of the node. None for leaves.
value : float or None
The value of the leaf, as computed in finalize_leaf(). None for
non-leaf nodes.
partition_start : int
start position of the node's sample_indices in splitter.partition.
partition_stop : int
stop position of the node's sample_indices in splitter.partition.
"""
split_info = None
left_child = None
right_child = None
histograms = None
sibling = None
parent = None
# start and stop indices of the node in the splitter.partition
# array. Concretely,
# self.sample_indices = view(self.splitter.partition[start:stop])
# Please see the comments about splitter.partition and
# splitter.split_indices for more info about this design.
# These 2 attributes are only used in _update_raw_prediction, because we
# need to iterate over the leaves and I don't know how to efficiently
# store the sample_indices views because they're all of different sizes.
partition_start = 0
partition_stop = 0
def __init__(self, depth, sample_indices, sum_gradients,
sum_hessians, parent=None, value=None):
self.depth = depth
self.sample_indices = sample_indices
self.n_samples = sample_indices.shape[0]
self.sum_gradients = sum_gradients
self.sum_hessians = sum_hessians
self.parent = parent
self.value = value
self.is_leaf = False
self.set_children_bounds(float('-inf'), float('+inf'))
def set_children_bounds(self, lower, upper):
"""Set children values bounds to respect monotonic constraints."""
# These are bounds for the node's *children* values, not the node's
# value. The bounds are used in the splitter when considering potential
# left and right child.
self.children_lower_bound = lower
self.children_upper_bound = upper
def __lt__(self, other_node):
"""Comparison for priority queue.
Nodes with high gain are higher priority than nodes with low gain.
        heapq.heappush only needs the '<' operator.
        heapq.heappop takes the smallest item first (smaller is higher
priority).
Parameters
----------
other_node : TreeNode
The node to compare with.
"""
return self.split_info.gain > other_node.split_info.gain
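# Illustration (added for clarity, not part of the original module): because
# __lt__ compares gains in reverse, pushing TreeNodes onto a heapq min-heap
# makes heappop() return the node with the *largest* split gain first. For
# example, with nodes whose gains are 3, 7 and 5 pushed in that order,
# heappop() yields the node with gain 7.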
class TreeGrower:
"""Tree grower class used to build a tree.
The tree is fitted to predict the values of a Newton-Raphson step. The
splits are considered in a best-first fashion, and the quality of a
split is defined in splitting._split_gain.
Parameters
----------
X_binned : ndarray of int, shape (n_samples, n_features)
The binned input samples. Must be Fortran-aligned.
gradients : ndarray, shape (n_samples,)
The gradients of each training sample. Those are the gradients of the
loss w.r.t the predictions, evaluated at iteration ``i - 1``.
hessians : ndarray, shape (n_samples,)
The hessians of each training sample. Those are the hessians of the
loss w.r.t the predictions, evaluated at iteration ``i - 1``.
max_leaf_nodes : int or None, optional (default=None)
The maximum number of leaves for each tree. If None, there is no
maximum limit.
max_depth : int or None, optional (default=None)
The maximum depth of each tree. The depth of a tree is the number of
edges to go from the root to the deepest leaf.
Depth isn't constrained by default.
min_samples_leaf : int, optional (default=20)
The minimum number of samples per leaf.
min_gain_to_split : float, optional (default=0.)
The minimum gain needed to split a node. Splits with lower gain will
be ignored.
n_bins : int, optional (default=256)
The total number of bins, including the bin for missing values. Used
to define the shape of the histograms.
    n_bins_non_missing : ndarray of uint32, int or None, optional (default=None)
For each feature, gives the number of bins actually used for
non-missing values. For features with a lot of unique values, this
is equal to ``n_bins - 1``. If it's an int, all features are
considered to have the same number of bins. If None, all features
are considered to have ``n_bins - 1`` bins.
has_missing_values : ndarray of bool or bool, optional (default=False)
Whether each feature contains missing values (in the training data).
If it's a bool, the same value is used for all features.
l2_regularization : float, optional (default=0)
The L2 regularization parameter.
min_hessian_to_split : float, optional (default=1e-3)
The minimum sum of hessians needed in each node. Splits that result in
at least one child having a sum of hessians less than
``min_hessian_to_split`` are discarded.
shrinkage : float, optional (default=1)
The shrinkage parameter to apply to the leaves values, also known as
learning rate.
"""
def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None,
max_depth=None, min_samples_leaf=20, min_gain_to_split=0.,
n_bins=256, n_bins_non_missing=None, has_missing_values=False,
monotonic_cst=None, l2_regularization=0.,
min_hessian_to_split=1e-3, shrinkage=1.):
self._validate_parameters(X_binned, max_leaf_nodes, max_depth,
min_samples_leaf, min_gain_to_split,
l2_regularization, min_hessian_to_split)
if n_bins_non_missing is None:
n_bins_non_missing = n_bins - 1
if isinstance(n_bins_non_missing, numbers.Integral):
n_bins_non_missing = np.array(
[n_bins_non_missing] * X_binned.shape[1],
dtype=np.uint32)
else:
n_bins_non_missing = np.asarray(n_bins_non_missing,
dtype=np.uint32)
if isinstance(has_missing_values, bool):
has_missing_values = [has_missing_values] * X_binned.shape[1]
has_missing_values = np.asarray(has_missing_values, dtype=np.uint8)
if monotonic_cst is None:
self.with_monotonic_cst = False
monotonic_cst = np.full(shape=X_binned.shape[1],
fill_value=MonotonicConstraint.NO_CST,
dtype=np.int8)
else:
self.with_monotonic_cst = True
monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8)
if monotonic_cst.shape[0] != X_binned.shape[1]:
raise ValueError(
"monotonic_cst has shape {} but the input data "
"X has {} features.".format(
monotonic_cst.shape[0], X_binned.shape[1]
)
)
if np.any(monotonic_cst < -1) or np.any(monotonic_cst > 1):
raise ValueError(
"monotonic_cst must be None or an array-like of "
"-1, 0 or 1."
)
hessians_are_constant = hessians.shape[0] == 1
self.histogram_builder = HistogramBuilder(
X_binned, n_bins, gradients, hessians, hessians_are_constant)
missing_values_bin_idx = n_bins - 1
self.splitter = Splitter(
X_binned, n_bins_non_missing, missing_values_bin_idx,
has_missing_values, monotonic_cst,
l2_regularization, min_hessian_to_split,
min_samples_leaf, min_gain_to_split, hessians_are_constant)
self.n_bins_non_missing = n_bins_non_missing
self.max_leaf_nodes = max_leaf_nodes
self.has_missing_values = has_missing_values
self.monotonic_cst = monotonic_cst
self.l2_regularization = l2_regularization
self.n_features = X_binned.shape[1]
self.max_depth = max_depth
self.min_samples_leaf = min_samples_leaf
self.X_binned = X_binned
self.min_gain_to_split = min_gain_to_split
self.shrinkage = shrinkage
self.splittable_nodes = []
self.finalized_leaves = []
self.total_find_split_time = 0. # time spent finding the best splits
self.total_compute_hist_time = 0. # time spent computing histograms
self.total_apply_split_time = 0. # time spent splitting nodes
        self._initialize_root(gradients, hessians, hessians_are_constant)
self.n_nodes = 1
def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth,
min_samples_leaf, min_gain_to_split,
l2_regularization, min_hessian_to_split):
"""Validate parameters passed to __init__.
Also validate parameters passed to splitter.
"""
if X_binned.dtype != np.uint8:
raise NotImplementedError(
"X_binned must be of type uint8.")
if not X_binned.flags.f_contiguous:
raise ValueError(
"X_binned should be passed as Fortran contiguous "
"array for maximum efficiency.")
if max_leaf_nodes is not None and max_leaf_nodes <= 1:
raise ValueError('max_leaf_nodes={} should not be'
' smaller than 2'.format(max_leaf_nodes))
if max_depth is not None and max_depth < 1:
raise ValueError('max_depth={} should not be'
' smaller than 1'.format(max_depth))
if min_samples_leaf < 1:
raise ValueError('min_samples_leaf={} should '
'not be smaller than 1'.format(min_samples_leaf))
if min_gain_to_split < 0:
raise ValueError('min_gain_to_split={} '
'must be positive.'.format(min_gain_to_split))
if l2_regularization < 0:
raise ValueError('l2_regularization={} must be '
'positive.'.format(l2_regularization))
if min_hessian_to_split < 0:
raise ValueError('min_hessian_to_split={} '
'must be positive.'.format(min_hessian_to_split))
def grow(self):
"""Grow the tree, from root to leaves."""
while self.splittable_nodes:
self.split_next()
self._apply_shrinkage()
def _apply_shrinkage(self):
"""Multiply leaves values by shrinkage parameter.
This must be done at the very end of the growing process. If this were
done during the growing process e.g. in finalize_leaf(), then a leaf
would be shrunk but its sibling would potentially not be (if it's a
non-leaf), which would lead to a wrong computation of the 'middle'
value needed to enforce the monotonic constraints.
"""
for leaf in self.finalized_leaves:
leaf.value *= self.shrinkage
    def _initialize_root(self, gradients, hessians, hessians_are_constant):
"""Initialize root node and finalize it if needed."""
n_samples = self.X_binned.shape[0]
depth = 0
sum_gradients = sum_parallel(gradients)
if self.histogram_builder.hessians_are_constant:
sum_hessians = hessians[0] * n_samples
else:
sum_hessians = sum_parallel(hessians)
self.root = TreeNode(
depth=depth,
sample_indices=self.splitter.partition,
sum_gradients=sum_gradients,
sum_hessians=sum_hessians,
value=0
)
self.root.partition_start = 0
self.root.partition_stop = n_samples
if self.root.n_samples < 2 * self.min_samples_leaf:
# Do not even bother computing any splitting statistics.
self._finalize_leaf(self.root)
return
if sum_hessians < self.splitter.min_hessian_to_split:
self._finalize_leaf(self.root)
return
self.root.histograms = self.histogram_builder.compute_histograms_brute(
self.root.sample_indices)
self._compute_best_split_and_push(self.root)
def _compute_best_split_and_push(self, node):
"""Compute the best possible split (SplitInfo) of a given node.
Also push it in the heap of splittable nodes if gain isn't zero.
The gain of a node is 0 if either all the leaves are pure
        (best gain = 0), or if no split would satisfy the constraints
        (min_hessian_to_split, min_gain_to_split, min_samples_leaf).
"""
node.split_info = self.splitter.find_node_split(
node.n_samples, node.histograms, node.sum_gradients,
node.sum_hessians, node.value, node.children_lower_bound,
node.children_upper_bound)
if node.split_info.gain <= 0: # no valid split
self._finalize_leaf(node)
else:
heappush(self.splittable_nodes, node)
def split_next(self):
"""Split the node with highest potential gain.
Returns
-------
left : TreeNode
The resulting left child.
right : TreeNode
The resulting right child.
"""
# Consider the node with the highest loss reduction (a.k.a. gain)
node = heappop(self.splittable_nodes)
tic = time()
(sample_indices_left,
sample_indices_right,
right_child_pos) = self.splitter.split_indices(node.split_info,
node.sample_indices)
self.total_apply_split_time += time() - tic
depth = node.depth + 1
n_leaf_nodes = len(self.finalized_leaves) + len(self.splittable_nodes)
n_leaf_nodes += 2
left_child_node = TreeNode(depth,
sample_indices_left,
node.split_info.sum_gradient_left,
node.split_info.sum_hessian_left,
parent=node,
value=node.split_info.value_left,
)
right_child_node = TreeNode(depth,
sample_indices_right,
node.split_info.sum_gradient_right,
node.split_info.sum_hessian_right,
parent=node,
value=node.split_info.value_right,
)
left_child_node.sibling = right_child_node
right_child_node.sibling = left_child_node
node.right_child = right_child_node
node.left_child = left_child_node
# set start and stop indices
left_child_node.partition_start = node.partition_start
left_child_node.partition_stop = node.partition_start + right_child_pos
right_child_node.partition_start = left_child_node.partition_stop
right_child_node.partition_stop = node.partition_stop
if not self.has_missing_values[node.split_info.feature_idx]:
# If no missing values are encountered at fit time, then samples
# with missing values during predict() will go to whichever child
# has the most samples.
node.split_info.missing_go_to_left = (
left_child_node.n_samples > right_child_node.n_samples)
self.n_nodes += 2
if (self.max_leaf_nodes is not None
and n_leaf_nodes == self.max_leaf_nodes):
self._finalize_leaf(left_child_node)
self._finalize_leaf(right_child_node)
self._finalize_splittable_nodes()
return left_child_node, right_child_node
if self.max_depth is not None and depth == self.max_depth:
self._finalize_leaf(left_child_node)
self._finalize_leaf(right_child_node)
return left_child_node, right_child_node
if left_child_node.n_samples < self.min_samples_leaf * 2:
self._finalize_leaf(left_child_node)
if right_child_node.n_samples < self.min_samples_leaf * 2:
self._finalize_leaf(right_child_node)
if self.with_monotonic_cst:
# Set value bounds for respecting monotonic constraints
# See test_nodes_values() for details
if (self.monotonic_cst[node.split_info.feature_idx] ==
MonotonicConstraint.NO_CST):
lower_left = lower_right = node.children_lower_bound
upper_left = upper_right = node.children_upper_bound
else:
mid = (left_child_node.value + right_child_node.value) / 2
if (self.monotonic_cst[node.split_info.feature_idx] ==
MonotonicConstraint.POS):
lower_left, upper_left = node.children_lower_bound, mid
lower_right, upper_right = mid, node.children_upper_bound
else: # NEG
lower_left, upper_left = mid, node.children_upper_bound
lower_right, upper_right = node.children_lower_bound, mid
left_child_node.set_children_bounds(lower_left, upper_left)
right_child_node.set_children_bounds(lower_right, upper_right)
# Compute histograms of children, and compute their best possible split
# (if needed)
should_split_left = not left_child_node.is_leaf
should_split_right = not right_child_node.is_leaf
if should_split_left or should_split_right:
# We will compute the histograms of both nodes even if one of them
# is a leaf, since computing the second histogram is very cheap
# (using histogram subtraction).
n_samples_left = left_child_node.sample_indices.shape[0]
n_samples_right = right_child_node.sample_indices.shape[0]
if n_samples_left < n_samples_right:
smallest_child = left_child_node
largest_child = right_child_node
else:
smallest_child = right_child_node
largest_child = left_child_node
# We use the brute O(n_samples) method on the child that has the
# smallest number of samples, and the subtraction trick O(n_bins)
# on the other one.
tic = time()
smallest_child.histograms = \
self.histogram_builder.compute_histograms_brute(
smallest_child.sample_indices)
largest_child.histograms = \
self.histogram_builder.compute_histograms_subtraction(
node.histograms, smallest_child.histograms)
self.total_compute_hist_time += time() - tic
tic = time()
if should_split_left:
self._compute_best_split_and_push(left_child_node)
if should_split_right:
self._compute_best_split_and_push(right_child_node)
self.total_find_split_time += time() - tic
return left_child_node, right_child_node
def _finalize_leaf(self, node):
"""Make node a leaf of the tree being grown."""
node.is_leaf = True
self.finalized_leaves.append(node)
def _finalize_splittable_nodes(self):
"""Transform all splittable nodes into leaves.
Used when some constraint is met e.g. maximum number of leaves or
maximum depth."""
while len(self.splittable_nodes) > 0:
node = self.splittable_nodes.pop()
self._finalize_leaf(node)
def make_predictor(self, bin_thresholds=None):
"""Make a TreePredictor object out of the current tree.
Parameters
----------
bin_thresholds : array-like of floats, optional (default=None)
            The actual threshold values of each bin.
Returns
-------
A TreePredictor object.
"""
predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE)
_fill_predictor_node_array(predictor_nodes, self.root,
bin_thresholds, self.n_bins_non_missing)
return TreePredictor(predictor_nodes)
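# Note (added for clarity, not part of the original module): the helper below
# serializes the grown tree into the flat ``predictor_nodes`` array in
# pre-order: a node is written at ``next_free_idx``, then its whole left
# subtree, then its whole right subtree, so ``node['left']`` is always the
# index right after the parent and ``node['right']`` is the first index after
# the left subtree.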
def _fill_predictor_node_array(predictor_nodes, grower_node,
bin_thresholds, n_bins_non_missing,
next_free_idx=0):
"""Helper used in make_predictor to set the TreePredictor fields."""
node = predictor_nodes[next_free_idx]
node['count'] = grower_node.n_samples
node['depth'] = grower_node.depth
if grower_node.split_info is not None:
node['gain'] = grower_node.split_info.gain
else:
node['gain'] = -1
node['value'] = grower_node.value
if grower_node.is_leaf:
# Leaf node
node['is_leaf'] = True
return next_free_idx + 1
else:
# Decision node
split_info = grower_node.split_info
feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx
node['feature_idx'] = feature_idx
node['bin_threshold'] = bin_idx
node['missing_go_to_left'] = split_info.missing_go_to_left
if split_info.bin_idx == n_bins_non_missing[feature_idx] - 1:
# Split is on the last non-missing bin: it's a "split on nans". All
# nans go to the right, the rest go to the left.
node['threshold'] = np.inf
elif bin_thresholds is not None:
node['threshold'] = bin_thresholds[feature_idx][bin_idx]
next_free_idx += 1
node['left'] = next_free_idx
next_free_idx = _fill_predictor_node_array(
predictor_nodes, grower_node.left_child,
bin_thresholds=bin_thresholds,
n_bins_non_missing=n_bins_non_missing,
next_free_idx=next_free_idx)
node['right'] = next_free_idx
return _fill_predictor_node_array(
predictor_nodes, grower_node.right_child,
bin_thresholds=bin_thresholds,
n_bins_non_missing=n_bins_non_missing,
next_free_idx=next_free_idx)
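# A minimal end-to-end sketch of how a boosting loop would drive the grower
# (illustration only; shapes and dtypes follow the docstrings above, and the
# gradients correspond to a half least-squares loss with zero initial
# predictions).
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    n_samples = 500
    X_binned = np.asfortranarray(
        rng.randint(0, 255, size=(n_samples, 3)), dtype=np.uint8)
    y = X_binned[:, 0].astype(Y_DTYPE)
    # Half least squares with raw_predictions = 0: gradient = -y, and the
    # hessians are constant (shape (1,)).
    gradients = (-y).astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)
    grower = TreeGrower(X_binned, gradients, hessians,
                        max_leaf_nodes=8, shrinkage=0.1)
    grower.grow()
    predictor = grower.make_predictor()
    print(predictor.get_n_leaf_nodes(), predictor.get_max_depth())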


@@ -0,0 +1,426 @@
"""
This module contains the loss classes.
Specific losses are used for regression, binary classification or multiclass
classification.
"""
# Author: Nicolas Hug
from abc import ABC, abstractmethod
import numpy as np
from scipy.special import expit, logsumexp, xlogy
from .common import Y_DTYPE
from .common import G_H_DTYPE
from ._loss import _update_gradients_least_squares
from ._loss import _update_gradients_hessians_least_squares
from ._loss import _update_gradients_least_absolute_deviation
from ._loss import _update_gradients_hessians_least_absolute_deviation
from ._loss import _update_gradients_hessians_binary_crossentropy
from ._loss import _update_gradients_hessians_categorical_crossentropy
from ._loss import _update_gradients_hessians_poisson
from ...utils.stats import _weighted_percentile
class BaseLoss(ABC):
"""Base class for a loss."""
def __init__(self, hessians_are_constant):
self.hessians_are_constant = hessians_are_constant
def __call__(self, y_true, raw_predictions, sample_weight):
"""Return the weighted average loss"""
return np.average(self.pointwise_loss(y_true, raw_predictions),
weights=sample_weight)
@abstractmethod
def pointwise_loss(self, y_true, raw_predictions):
"""Return loss value for each input"""
# This variable indicates whether the loss requires the leaves values to
# be updated once the tree has been trained. The trees are trained to
# predict a Newton-Raphson step (see grower._finalize_leaf()). But for
# some losses (e.g. least absolute deviation) we need to adjust the tree
# values to account for the "line search" of the gradient descent
# procedure. See the original paper Greedy Function Approximation: A
# Gradient Boosting Machine by Friedman
# (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory.
need_update_leaves_values = False
def init_gradients_and_hessians(self, n_samples, prediction_dim,
sample_weight):
"""Return initial gradients and hessians.
Unless hessians are constant, arrays are initialized with undefined
values.
Parameters
----------
n_samples : int
The number of samples passed to `fit()`.
prediction_dim : int
The dimension of a raw prediction, i.e. the number of trees
built at each iteration. Equals 1 for regression and binary
classification, or K where K is the number of classes for
multiclass classification.
sample_weight : array-like of shape(n_samples,) default=None
Weights of training data.
Returns
-------
gradients : ndarray, shape (prediction_dim, n_samples)
The initial gradients. The array is not initialized.
hessians : ndarray, shape (prediction_dim, n_samples)
            If hessians are constant (e.g. for `LeastSquares` loss), the
            array is initialized to ``1``. Otherwise, the array is allocated
without being initialized.
"""
shape = (prediction_dim, n_samples)
gradients = np.empty(shape=shape, dtype=G_H_DTYPE)
if self.hessians_are_constant:
# If the hessians are constant, we consider they are equal to 1.
# - This is correct for the half LS loss
# - For LAD loss, hessians are actually 0, but they are always
# ignored anyway.
hessians = np.ones(shape=(1, 1), dtype=G_H_DTYPE)
else:
hessians = np.empty(shape=shape, dtype=G_H_DTYPE)
return gradients, hessians
@abstractmethod
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
"""Return initial predictions (before the first iteration).
Parameters
----------
y_train : ndarray, shape (n_samples,)
The target training values.
sample_weight : array-like of shape(n_samples,) default=None
Weights of training data.
prediction_dim : int
The dimension of one prediction: 1 for binary classification and
regression, n_classes for multiclass classification.
Returns
-------
baseline_prediction : float or ndarray, shape (1, prediction_dim)
The baseline prediction.
"""
@abstractmethod
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
"""Update gradients and hessians arrays, inplace.
The gradients (resp. hessians) are the first (resp. second) order
derivatives of the loss for each sample with respect to the
predictions of model, evaluated at iteration ``i - 1``.
Parameters
----------
gradients : ndarray, shape (prediction_dim, n_samples)
The gradients (treated as OUT array).
hessians : ndarray, shape (prediction_dim, n_samples) or \
(1,)
The hessians (treated as OUT array).
y_true : ndarray, shape (n_samples,)
            The true target values of each training sample.
raw_predictions : ndarray, shape (prediction_dim, n_samples)
The raw_predictions (i.e. values from the trees) of the tree
ensemble at iteration ``i - 1``.
sample_weight : array-like of shape(n_samples,) default=None
Weights of training data.
"""
class LeastSquares(BaseLoss):
"""Least squares loss, for regression.
For a given sample x_i, least squares loss is defined as::
loss(x_i) = 0.5 * (y_true_i - raw_pred_i)**2
This actually computes the half least squares loss to simplify
the computation of the gradients and get a unit hessian (and be consistent
with what is done in LightGBM).
"""
def __init__(self, sample_weight):
# If sample weights are provided, the hessians and gradients
# are multiplied by sample_weight, which means the hessians are
# equal to sample weights.
super().__init__(hessians_are_constant=sample_weight is None)
def pointwise_loss(self, y_true, raw_predictions):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
loss = 0.5 * np.power(y_true - raw_predictions, 2)
return loss
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
return np.average(y_train, weights=sample_weight)
@staticmethod
def inverse_link_function(raw_predictions):
return raw_predictions
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
gradients = gradients.reshape(-1)
if sample_weight is None:
_update_gradients_least_squares(gradients, y_true, raw_predictions)
else:
hessians = hessians.reshape(-1)
_update_gradients_hessians_least_squares(gradients, hessians,
y_true, raw_predictions,
sample_weight)
class LeastAbsoluteDeviation(BaseLoss):
"""Least absolute deviation, for regression.
For a given sample x_i, the loss is defined as::
loss(x_i) = |y_true_i - raw_pred_i|
"""
def __init__(self, sample_weight):
# If sample weights are provided, the hessians and gradients
# are multiplied by sample_weight, which means the hessians are
# equal to sample weights.
super().__init__(hessians_are_constant=sample_weight is None)
# This variable indicates whether the loss requires the leaves values to
# be updated once the tree has been trained. The trees are trained to
# predict a Newton-Raphson step (see grower._finalize_leaf()). But for
# some losses (e.g. least absolute deviation) we need to adjust the tree
# values to account for the "line search" of the gradient descent
# procedure. See the original paper Greedy Function Approximation: A
# Gradient Boosting Machine by Friedman
# (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory.
need_update_leaves_values = True
def pointwise_loss(self, y_true, raw_predictions):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
loss = np.abs(y_true - raw_predictions)
return loss
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
if sample_weight is None:
return np.median(y_train)
else:
return _weighted_percentile(y_train, sample_weight, 50)
@staticmethod
def inverse_link_function(raw_predictions):
return raw_predictions
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
gradients = gradients.reshape(-1)
if sample_weight is None:
_update_gradients_least_absolute_deviation(gradients, y_true,
raw_predictions)
else:
hessians = hessians.reshape(-1)
_update_gradients_hessians_least_absolute_deviation(
gradients, hessians, y_true, raw_predictions, sample_weight)
def update_leaves_values(self, grower, y_true, raw_predictions,
sample_weight):
# Update the values predicted by the tree with
# median(y_true - raw_predictions).
# See note about need_update_leaves_values in BaseLoss.
# TODO: ideally this should be computed in parallel over the leaves
# using something similar to _update_raw_predictions(), but this
# requires a cython version of median()
for leaf in grower.finalized_leaves:
indices = leaf.sample_indices
if sample_weight is None:
median_res = np.median(y_true[indices]
- raw_predictions[indices])
else:
median_res = _weighted_percentile(y_true[indices]
- raw_predictions[indices],
sample_weight=sample_weight,
percentile=50)
leaf.value = grower.shrinkage * median_res
# Note that the regularization is ignored here
class Poisson(BaseLoss):
"""Poisson deviance loss with log-link, for regression.
For a given sample x_i, Poisson deviance loss is defined as::
        loss(x_i) = y_true_i * log(y_true_i/exp(raw_pred_i))
                    - y_true_i + exp(raw_pred_i)
This actually computes half the Poisson deviance to simplify
the computation of the gradients.
"""
def __init__(self, sample_weight):
super().__init__(hessians_are_constant=False)
inverse_link_function = staticmethod(np.exp)
def pointwise_loss(self, y_true, raw_predictions):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
# TODO: For speed, we could remove the constant xlogy(y_true, y_true)
# Advantage of this form: minimum of zero at raw_predictions = y_true.
loss = (xlogy(y_true, y_true) - y_true * (raw_predictions + 1)
+ np.exp(raw_predictions))
return loss
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
y_pred = np.average(y_train, weights=sample_weight)
eps = np.finfo(y_train.dtype).eps
y_pred = np.clip(y_pred, eps, None)
return np.log(y_pred)
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
gradients = gradients.reshape(-1)
hessians = hessians.reshape(-1)
_update_gradients_hessians_poisson(gradients, hessians,
y_true, raw_predictions,
sample_weight)
class BinaryCrossEntropy(BaseLoss):
"""Binary cross-entropy loss, for binary classification.
For a given sample x_i, the binary cross-entropy loss is defined as the
negative log-likelihood of the model which can be expressed as::
loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i
See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,
section 4.4.1 (about logistic regression).
"""
def __init__(self, sample_weight):
super().__init__(hessians_are_constant=False)
inverse_link_function = staticmethod(expit)
def pointwise_loss(self, y_true, raw_predictions):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
# logaddexp(0, x) = log(1 + exp(x))
loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions
return loss
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
if prediction_dim > 2:
raise ValueError(
"loss='binary_crossentropy' is not defined for multiclass"
" classification with n_classes=%d, use"
" loss='categorical_crossentropy' instead" % prediction_dim)
proba_positive_class = np.average(y_train, weights=sample_weight)
eps = np.finfo(y_train.dtype).eps
proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps)
        # log(x / (1 - x)) is the inverse of the sigmoid (the logit), i.e. the
        # link function of the Binomial model.
return np.log(proba_positive_class / (1 - proba_positive_class))
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
gradients = gradients.reshape(-1)
hessians = hessians.reshape(-1)
_update_gradients_hessians_binary_crossentropy(
gradients, hessians, y_true, raw_predictions, sample_weight)
def predict_proba(self, raw_predictions):
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
# return a view.
raw_predictions = raw_predictions.reshape(-1)
proba = np.empty((raw_predictions.shape[0], 2), dtype=Y_DTYPE)
proba[:, 1] = expit(raw_predictions)
proba[:, 0] = 1 - proba[:, 1]
return proba
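# For reference (added for clarity, not part of the original module): for the
# binary cross-entropy above, the Cython helper effectively computes, per
# sample,
#     gradient_i = expit(raw_pred_i) - y_true_i
#     hessian_i  = expit(raw_pred_i) * (1 - expit(raw_pred_i))
# both multiplied by the sample weight when one is given.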
class CategoricalCrossEntropy(BaseLoss):
"""Categorical cross-entropy loss, for multiclass classification.
For a given sample x_i, the categorical cross-entropy loss is defined as
the negative log-likelihood of the model and generalizes the binary
cross-entropy to more than 2 classes.
"""
def __init__(self, sample_weight):
super().__init__(hessians_are_constant=False)
def pointwise_loss(self, y_true, raw_predictions):
one_hot_true = np.zeros_like(raw_predictions)
prediction_dim = raw_predictions.shape[0]
for k in range(prediction_dim):
one_hot_true[k, :] = (y_true == k)
loss = (logsumexp(raw_predictions, axis=0) -
(one_hot_true * raw_predictions).sum(axis=0))
return loss
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
init_value = np.zeros(shape=(prediction_dim, 1), dtype=Y_DTYPE)
eps = np.finfo(y_train.dtype).eps
for k in range(prediction_dim):
proba_kth_class = np.average(y_train == k,
weights=sample_weight)
proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps)
init_value[k, :] += np.log(proba_kth_class)
return init_value
def update_gradients_and_hessians(self, gradients, hessians, y_true,
raw_predictions, sample_weight):
_update_gradients_hessians_categorical_crossentropy(
gradients, hessians, y_true, raw_predictions, sample_weight)
def predict_proba(self, raw_predictions):
# TODO: This could be done in parallel
# compute softmax (using exp(log(softmax)))
proba = np.exp(raw_predictions -
logsumexp(raw_predictions, axis=0)[np.newaxis, :])
return proba.T
_LOSSES = {
'least_squares': LeastSquares,
'least_absolute_deviation': LeastAbsoluteDeviation,
'binary_crossentropy': BinaryCrossEntropy,
'categorical_crossentropy': CategoricalCrossEntropy,
'poisson': Poisson,
}
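# A minimal sketch of the loss API as a boosting loop would use it
# (illustration only, private API): one gradient update for the half
# least-squares loss.
if __name__ == '__main__':
    y_train = np.array([0.0, 1.0, 2.0, 3.0], dtype=Y_DTYPE)
    loss = _LOSSES['least_squares'](sample_weight=None)
    baseline = loss.get_baseline_prediction(y_train, None, 1)
    raw_predictions = np.full((1, y_train.shape[0]), baseline, dtype=Y_DTYPE)
    gradients, hessians = loss.init_gradients_and_hessians(
        n_samples=y_train.shape[0], prediction_dim=1, sample_weight=None)
    loss.update_gradients_and_hessians(gradients, hessians, y_train,
                                       raw_predictions, None)
    # For half least squares, gradient_i = raw_pred_i - y_true_i, i.e.
    # [1.5, 0.5, -0.5, -1.5] here (the hessians stay constant at 1).
    print(gradients)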


@@ -0,0 +1,86 @@
"""
This module contains the TreePredictor class which is used for prediction.
"""
# Author: Nicolas Hug
import numpy as np
from .common import Y_DTYPE
from ._predictor import _predict_from_numeric_data
from ._predictor import _predict_from_binned_data
from ._predictor import _compute_partial_dependence
class TreePredictor:
"""Tree class used for predictions.
Parameters
----------
nodes : ndarray of PREDICTOR_RECORD_DTYPE
The nodes of the tree.
"""
def __init__(self, nodes):
self.nodes = nodes
def get_n_leaf_nodes(self):
"""Return number of leaves."""
return int(self.nodes['is_leaf'].sum())
def get_max_depth(self):
"""Return maximum depth among all leaves."""
return int(self.nodes['depth'].max())
def predict(self, X):
"""Predict raw values for non-binned data.
Parameters
----------
X : ndarray, shape (n_samples, n_features)
The input samples.
Returns
-------
y : ndarray, shape (n_samples,)
The raw predicted values.
"""
out = np.empty(X.shape[0], dtype=Y_DTYPE)
_predict_from_numeric_data(self.nodes, X, out)
return out
def predict_binned(self, X, missing_values_bin_idx):
"""Predict raw values for binned data.
Parameters
----------
X : ndarray, shape (n_samples, n_features)
The input samples.
missing_values_bin_idx : uint8
Index of the bin that is used for missing values. This is the
index of the last bin and is always equal to max_bins (as passed
to the GBDT classes), or equivalently to n_bins - 1.
Returns
-------
y : ndarray, shape (n_samples,)
The raw predicted values.
"""
out = np.empty(X.shape[0], dtype=Y_DTYPE)
_predict_from_binned_data(self.nodes, X, missing_values_bin_idx, out)
return out
def compute_partial_dependence(self, grid, target_features, out):
"""Fast partial dependence computation.
Parameters
----------
grid : ndarray, shape (n_samples, n_target_features)
The grid points on which the partial dependence should be
evaluated.
target_features : ndarray, shape (n_target_features)
The set of target features for which the partial dependence
should be evaluated.
out : ndarray, shape (n_samples)
The value of the partial dependence function on each grid
point.
"""
_compute_partial_dependence(self.nodes, grid, target_features, out)
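# Note (added for clarity, not part of the original module): prediction walks
# the flat ``nodes`` array starting at index 0. At each decision node the
# feature value is compared to ``node['threshold']`` (or to
# ``node['bin_threshold']`` for binned data); values below or equal go to
# ``node['left']``, larger values to ``node['right']``, and missing values
# follow ``node['missing_go_to_left']``. The walk stops at a node with
# ``node['is_leaf']`` set and returns its ``node['value']``.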


@@ -0,0 +1,314 @@
import numpy as np
from numpy.testing import assert_array_equal, assert_allclose
import pytest
from sklearn.ensemble._hist_gradient_boosting.binning import (
_BinMapper,
_find_binning_thresholds as _find_binning_thresholds_orig,
_map_to_bins
)
from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import ALMOST_INF
DATA = np.random.RandomState(42).normal(
loc=[0, 10], scale=[1, 0.01], size=(int(1e6), 2)
).astype(X_DTYPE)
def _find_binning_thresholds(data, max_bins=255, subsample=int(2e5),
random_state=None):
# Just a redef to avoid having to pass arguments all the time (as the
# function is private we don't use default values for parameters)
return _find_binning_thresholds_orig(data, max_bins, subsample,
random_state)
def test_find_binning_thresholds_regular_data():
data = np.linspace(0, 10, 1001).reshape(-1, 1)
bin_thresholds = _find_binning_thresholds(data, max_bins=10)
assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9])
assert len(bin_thresholds) == 1
bin_thresholds = _find_binning_thresholds(data, max_bins=5)
assert_allclose(bin_thresholds[0], [2, 4, 6, 8])
assert len(bin_thresholds) == 1
def test_find_binning_thresholds_small_regular_data():
data = np.linspace(0, 10, 11).reshape(-1, 1)
bin_thresholds = _find_binning_thresholds(data, max_bins=5)
assert_allclose(bin_thresholds[0], [2, 4, 6, 8])
bin_thresholds = _find_binning_thresholds(data, max_bins=10)
assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9])
bin_thresholds = _find_binning_thresholds(data, max_bins=11)
assert_allclose(bin_thresholds[0], np.arange(10) + .5)
bin_thresholds = _find_binning_thresholds(data, max_bins=255)
assert_allclose(bin_thresholds[0], np.arange(10) + .5)
def test_find_binning_thresholds_random_data():
bin_thresholds = _find_binning_thresholds(DATA, max_bins=255,
random_state=0)
assert len(bin_thresholds) == 2
for i in range(len(bin_thresholds)):
assert bin_thresholds[i].shape == (254,) # 255 - 1
assert bin_thresholds[i].dtype == DATA.dtype
assert_allclose(bin_thresholds[0][[64, 128, 192]],
np.array([-0.7, 0.0, 0.7]), atol=1e-1)
assert_allclose(bin_thresholds[1][[64, 128, 192]],
np.array([9.99, 10.00, 10.01]), atol=1e-2)
def test_find_binning_thresholds_low_n_bins():
bin_thresholds = _find_binning_thresholds(DATA, max_bins=128,
random_state=0)
assert len(bin_thresholds) == 2
for i in range(len(bin_thresholds)):
assert bin_thresholds[i].shape == (127,) # 128 - 1
assert bin_thresholds[i].dtype == DATA.dtype
@pytest.mark.parametrize('n_bins', (2, 257))
def test_invalid_n_bins(n_bins):
err_msg = (
'n_bins={} should be no smaller than 3 and no larger than 256'
.format(n_bins))
with pytest.raises(ValueError, match=err_msg):
_BinMapper(n_bins=n_bins).fit(DATA)
def test_bin_mapper_n_features_transform():
mapper = _BinMapper(n_bins=42, random_state=42).fit(DATA)
err_msg = 'This estimator was fitted with 2 features but 4 got passed'
with pytest.raises(ValueError, match=err_msg):
mapper.transform(np.repeat(DATA, 2, axis=1))
@pytest.mark.parametrize('max_bins', [16, 128, 255])
def test_map_to_bins(max_bins):
bin_thresholds = _find_binning_thresholds(DATA, max_bins=max_bins,
random_state=0)
binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order='F')
last_bin_idx = max_bins
_map_to_bins(DATA, bin_thresholds, last_bin_idx, binned)
assert binned.shape == DATA.shape
assert binned.dtype == np.uint8
assert binned.flags.f_contiguous
min_indices = DATA.argmin(axis=0)
max_indices = DATA.argmax(axis=0)
for feature_idx, min_idx in enumerate(min_indices):
assert binned[min_idx, feature_idx] == 0
for feature_idx, max_idx in enumerate(max_indices):
assert binned[max_idx, feature_idx] == max_bins - 1
@pytest.mark.parametrize("max_bins", [5, 10, 42])
def test_bin_mapper_random_data(max_bins):
n_samples, n_features = DATA.shape
expected_count_per_bin = n_samples // max_bins
tol = int(0.05 * expected_count_per_bin)
# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
mapper = _BinMapper(n_bins=n_bins, random_state=42).fit(DATA)
binned = mapper.transform(DATA)
assert binned.shape == (n_samples, n_features)
assert binned.dtype == np.uint8
assert_array_equal(binned.min(axis=0), np.array([0, 0]))
assert_array_equal(binned.max(axis=0),
np.array([max_bins - 1, max_bins - 1]))
assert len(mapper.bin_thresholds_) == n_features
for bin_thresholds_feature in mapper.bin_thresholds_:
assert bin_thresholds_feature.shape == (max_bins - 1,)
assert bin_thresholds_feature.dtype == DATA.dtype
assert np.all(mapper.n_bins_non_missing_ == max_bins)
# Check that the binned data is approximately balanced across bins.
for feature_idx in range(n_features):
for bin_idx in range(max_bins):
count = (binned[:, feature_idx] == bin_idx).sum()
assert abs(count - expected_count_per_bin) < tol
@pytest.mark.parametrize("n_samples, max_bins", [
(5, 5),
(5, 10),
(5, 11),
(42, 255)
])
def test_bin_mapper_small_random_data(n_samples, max_bins):
data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1)
assert len(np.unique(data)) == n_samples
# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
mapper = _BinMapper(n_bins=n_bins, random_state=42)
binned = mapper.fit_transform(data)
assert binned.shape == data.shape
assert binned.dtype == np.uint8
assert_array_equal(binned.ravel()[np.argsort(data.ravel())],
np.arange(n_samples))
@pytest.mark.parametrize("max_bins, n_distinct, multiplier", [
(5, 5, 1),
(5, 5, 3),
(255, 12, 42),
])
def test_bin_mapper_identity_repeated_values(max_bins, n_distinct, multiplier):
data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1)
# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
binned = _BinMapper(n_bins=n_bins).fit_transform(data)
assert_array_equal(data, binned)
@pytest.mark.parametrize('n_distinct', [2, 7, 42])
def test_bin_mapper_repeated_values_invariance(n_distinct):
rng = np.random.RandomState(42)
distinct_values = rng.normal(size=n_distinct)
assert len(np.unique(distinct_values)) == n_distinct
repeated_indices = rng.randint(low=0, high=n_distinct, size=1000)
data = distinct_values[repeated_indices]
rng.shuffle(data)
assert_array_equal(np.unique(data), np.sort(distinct_values))
data = data.reshape(-1, 1)
mapper_1 = _BinMapper(n_bins=n_distinct + 1)
binned_1 = mapper_1.fit_transform(data)
assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct))
# Adding more bins to the mapper yields the same results (same thresholds)
mapper_2 = _BinMapper(n_bins=min(256, n_distinct * 3) + 1)
binned_2 = mapper_2.fit_transform(data)
assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0])
assert_array_equal(binned_1, binned_2)
@pytest.mark.parametrize("max_bins, scale, offset", [
(3, 2, -1),
(42, 1, 0),
(255, 0.3, 42),
])
def test_bin_mapper_identity_small(max_bins, scale, offset):
data = np.arange(max_bins).reshape(-1, 1) * scale + offset
# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
binned = _BinMapper(n_bins=n_bins).fit_transform(data)
assert_array_equal(binned, np.arange(max_bins).reshape(-1, 1))
@pytest.mark.parametrize('max_bins_small, max_bins_large', [
(2, 2),
(3, 3),
(4, 4),
(42, 42),
(255, 255),
(5, 17),
(42, 255),
])
def test_bin_mapper_idempotence(max_bins_small, max_bins_large):
assert max_bins_large >= max_bins_small
data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1)
mapper_small = _BinMapper(n_bins=max_bins_small + 1)
    mapper_large = _BinMapper(n_bins=max_bins_large + 1)
binned_small = mapper_small.fit_transform(data)
binned_large = mapper_large.fit_transform(binned_small)
assert_array_equal(binned_small, binned_large)
@pytest.mark.parametrize('n_bins', [10, 100, 256])
@pytest.mark.parametrize('diff', [-5, 0, 5])
def test_n_bins_non_missing(n_bins, diff):
# Check that n_bins_non_missing is n_unique_values when
# there are not a lot of unique values, else n_bins - 1.
n_unique_values = n_bins + diff
X = list(range(n_unique_values)) * 2
X = np.array(X).reshape(-1, 1)
mapper = _BinMapper(n_bins=n_bins).fit(X)
assert np.all(mapper.n_bins_non_missing_ == min(
n_bins - 1, n_unique_values))
def test_subsample():
# Make sure bin thresholds are different when applying subsampling
mapper_no_subsample = _BinMapper(subsample=None, random_state=0).fit(DATA)
mapper_subsample = _BinMapper(subsample=256, random_state=0).fit(DATA)
for feature in range(DATA.shape[1]):
assert not np.allclose(mapper_no_subsample.bin_thresholds_[feature],
mapper_subsample.bin_thresholds_[feature],
rtol=1e-4)
@pytest.mark.parametrize(
'n_bins, n_bins_non_missing, X_trans_expected', [
(256, [4, 2, 2], [[0, 0, 0], # 255 <=> missing value
[255, 255, 0],
[1, 0, 0],
[255, 1, 1],
[2, 1, 1],
[3, 0, 0]]),
(3, [2, 2, 2], [[0, 0, 0], # 2 <=> missing value
[2, 2, 0],
[0, 0, 0],
[2, 1, 1],
[1, 1, 1],
[1, 0, 0]])])
def test_missing_values_support(n_bins, n_bins_non_missing, X_trans_expected):
# check for missing values: make sure nans are mapped to the last bin
# and that the _BinMapper attributes are correct
X = [[1, 1, 0],
[np.NaN, np.NaN, 0],
[2, 1, 0],
[np.NaN, 2, 1],
[3, 2, 1],
[4, 1, 0]]
X = np.array(X)
mapper = _BinMapper(n_bins=n_bins)
mapper.fit(X)
assert_array_equal(mapper.n_bins_non_missing_, n_bins_non_missing)
for feature_idx in range(X.shape[1]):
assert len(mapper.bin_thresholds_[feature_idx]) == \
n_bins_non_missing[feature_idx] - 1
assert mapper.missing_values_bin_idx_ == n_bins - 1
X_trans = mapper.transform(X)
assert_array_equal(X_trans, X_trans_expected)
def test_infinite_values():
# Make sure infinite values are properly handled.
bin_mapper = _BinMapper()
X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1)
bin_mapper.fit(X)
assert_allclose(bin_mapper.bin_thresholds_[0], [-np.inf, .5, ALMOST_INF])
assert bin_mapper.n_bins_non_missing_ == [4]
expected_binned_X = np.array([0, 1, 2, 3]).reshape(-1, 1)
assert_array_equal(bin_mapper.transform(X), expected_binned_X)


@@ -0,0 +1,223 @@
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification, make_regression
import numpy as np
import pytest
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.utils import (
get_equivalent_estimator)
@pytest.mark.parametrize('seed', range(5))
@pytest.mark.parametrize('min_samples_leaf', (1, 20))
@pytest.mark.parametrize('n_samples, max_leaf_nodes', [
(255, 4096),
(1000, 8),
])
def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
max_leaf_nodes):
# Make sure sklearn has the same predictions as lightgbm for easy targets.
#
    # In particular, when the size of the trees is bounded and the number of
# samples is large enough, the structure of the prediction trees found by
# LightGBM and sklearn should be exactly identical.
#
# Notes:
# - Several candidate splits may have equal gains when the number of
# samples in a node is low (and because of float errors). Therefore the
# predictions on the test set might differ if the structure of the tree
# is not exactly the same. To avoid this issue we only compare the
# predictions on the test set when the number of samples is large enough
# and max_leaf_nodes is low enough.
    # - To ignore discrepancies caused by small differences in the binning
# strategy, data is pre-binned if n_samples > 255.
# - We don't check the least_absolute_deviation loss here. This is because
# LightGBM's computation of the median (used for the initial value of
# raw_prediction) is a bit off (they'll e.g. return midpoints when there
    #   is no need to). Since these tests only run 1 iteration, the
# discrepancy between the initial values leads to biggish differences in
# the predictions. These differences are much smaller with more
# iterations.
pytest.importorskip("lightgbm")
rng = np.random.RandomState(seed=seed)
n_samples = n_samples
max_iter = 1
max_bins = 255
X, y = make_regression(n_samples=n_samples, n_features=5,
n_informative=5, random_state=0)
if n_samples > 255:
# bin data and convert it to float32 so that the estimator doesn't
# treat it as pre-binned
X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
est_sklearn = HistGradientBoostingRegressor(
max_iter=max_iter,
max_bins=max_bins,
learning_rate=1,
early_stopping=False,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes)
est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')
est_lightgbm.fit(X_train, y_train)
est_sklearn.fit(X_train, y_train)
    # We need X to be treated as numerical data, not pre-binned data.
X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)
pred_lightgbm = est_lightgbm.predict(X_train)
pred_sklearn = est_sklearn.predict(X_train)
# less than 1% of the predictions are different up to the 3rd decimal
assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-3) < .011
if max_leaf_nodes < 10 and n_samples >= 1000:
pred_lightgbm = est_lightgbm.predict(X_test)
pred_sklearn = est_sklearn.predict(X_test)
# less than 1% of the predictions are different up to the 4th decimal
assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-4) < .01
@pytest.mark.parametrize('seed', range(5))
@pytest.mark.parametrize('min_samples_leaf', (1, 20))
@pytest.mark.parametrize('n_samples, max_leaf_nodes', [
(255, 4096),
(1000, 8),
])
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
max_leaf_nodes):
# Same as test_same_predictions_regression but for classification
pytest.importorskip("lightgbm")
rng = np.random.RandomState(seed=seed)
n_samples = n_samples
max_iter = 1
max_bins = 255
X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
n_informative=5, n_redundant=0, random_state=0)
if n_samples > 255:
# bin data and convert it to float32 so that the estimator doesn't
# treat it as pre-binned
X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
est_sklearn = HistGradientBoostingClassifier(
loss='binary_crossentropy',
max_iter=max_iter,
max_bins=max_bins,
learning_rate=1,
early_stopping=False,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes)
est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')
est_lightgbm.fit(X_train, y_train)
est_sklearn.fit(X_train, y_train)
    # We need X to be treated as numerical data, not pre-binned data.
X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)
pred_lightgbm = est_lightgbm.predict(X_train)
pred_sklearn = est_sklearn.predict(X_train)
assert np.mean(pred_sklearn == pred_lightgbm) > .89
acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
acc_sklearn = accuracy_score(y_train, pred_sklearn)
np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)
if max_leaf_nodes < 10 and n_samples >= 1000:
pred_lightgbm = est_lightgbm.predict(X_test)
pred_sklearn = est_sklearn.predict(X_test)
assert np.mean(pred_sklearn == pred_lightgbm) > .89
acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
acc_sklearn = accuracy_score(y_test, pred_sklearn)
np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
@pytest.mark.parametrize('seed', range(5))
@pytest.mark.parametrize('min_samples_leaf', (1, 20))
@pytest.mark.parametrize('n_samples, max_leaf_nodes', [
(255, 4096),
(10000, 8),
])
def test_same_predictions_multiclass_classification(
seed, min_samples_leaf, n_samples, max_leaf_nodes):
    # Same as test_same_predictions_regression but for multiclass classification
pytest.importorskip("lightgbm")
rng = np.random.RandomState(seed=seed)
n_samples = n_samples
max_iter = 1
max_bins = 255
lr = 1
X, y = make_classification(n_samples=n_samples, n_classes=3, n_features=5,
n_informative=5, n_redundant=0,
n_clusters_per_class=1, random_state=0)
if n_samples > 255:
# bin data and convert it to float32 so that the estimator doesn't
# treat it as pre-binned
X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
est_sklearn = HistGradientBoostingClassifier(
loss='categorical_crossentropy',
max_iter=max_iter,
max_bins=max_bins,
learning_rate=lr,
early_stopping=False,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes)
est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')
est_lightgbm.fit(X_train, y_train)
est_sklearn.fit(X_train, y_train)
    # We need X to be treated as numerical data, not pre-binned data.
X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)
pred_lightgbm = est_lightgbm.predict(X_train)
pred_sklearn = est_sklearn.predict(X_train)
assert np.mean(pred_sklearn == pred_lightgbm) > .89
proba_lightgbm = est_lightgbm.predict_proba(X_train)
proba_sklearn = est_sklearn.predict_proba(X_train)
# assert more than 75% of the predicted probabilities are the same up to
# the second decimal
assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75
acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
acc_sklearn = accuracy_score(y_train, pred_sklearn)
np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
if max_leaf_nodes < 10 and n_samples >= 1000:
pred_lightgbm = est_lightgbm.predict(X_test)
pred_sklearn = est_sklearn.predict(X_test)
assert np.mean(pred_sklearn == pred_lightgbm) > .89
        proba_lightgbm = est_lightgbm.predict_proba(X_test)
        proba_sklearn = est_sklearn.predict_proba(X_test)
# assert more than 75% of the predicted probabilities are the same up
# to the second decimal
assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75
acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
acc_sklearn = accuracy_score(y_test, pred_sklearn)
np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)

View file

@ -0,0 +1,746 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from sklearn.datasets import make_classification, make_regression
from sklearn.datasets import make_low_rank_matrix
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.base import clone, BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_poisson_deviance
from sklearn.dummy import DummyRegressor
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES
from sklearn.ensemble._hist_gradient_boosting.loss import LeastSquares
from sklearn.ensemble._hist_gradient_boosting.loss import BinaryCrossEntropy
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.utils import shuffle
X_classification, y_classification = make_classification(random_state=0)
X_regression, y_regression = make_regression(random_state=0)
def _make_dumb_dataset(n_samples):
"""Make a dumb dataset to test early stopping."""
rng = np.random.RandomState(42)
X_dumb = rng.randn(n_samples, 1)
y_dumb = (X_dumb[:, 0] > 0).astype('int64')
return X_dumb, y_dumb
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
@pytest.mark.parametrize(
'params, err_msg',
[({'loss': 'blah'}, 'Loss blah is not supported for'),
({'learning_rate': 0}, 'learning_rate=0 must be strictly positive'),
({'learning_rate': -1}, 'learning_rate=-1 must be strictly positive'),
({'max_iter': 0}, 'max_iter=0 must not be smaller than 1'),
({'max_leaf_nodes': 0}, 'max_leaf_nodes=0 should not be smaller than 2'),
({'max_leaf_nodes': 1}, 'max_leaf_nodes=1 should not be smaller than 2'),
({'max_depth': 0}, 'max_depth=0 should not be smaller than 1'),
({'min_samples_leaf': 0}, 'min_samples_leaf=0 should not be smaller'),
({'l2_regularization': -1}, 'l2_regularization=-1 must be positive'),
({'max_bins': 1}, 'max_bins=1 should be no smaller than 2 and no larger'),
({'max_bins': 256}, 'max_bins=256 should be no smaller than 2 and no'),
({'n_iter_no_change': -1}, 'n_iter_no_change=-1 must be positive'),
({'validation_fraction': -1}, 'validation_fraction=-1 must be strictly'),
({'validation_fraction': 0}, 'validation_fraction=0 must be strictly'),
({'tol': -1}, 'tol=-1 must not be smaller than 0')]
)
def test_init_parameters_validation(GradientBoosting, X, y, params, err_msg):
with pytest.raises(ValueError, match=err_msg):
GradientBoosting(**params).fit(X, y)
def test_invalid_classification_loss():
binary_clf = HistGradientBoostingClassifier(loss="binary_crossentropy")
err_msg = ("loss='binary_crossentropy' is not defined for multiclass "
"classification with n_classes=3, use "
"loss='categorical_crossentropy' instead")
with pytest.raises(ValueError, match=err_msg):
binary_clf.fit(np.zeros(shape=(3, 2)), np.arange(3))
@pytest.mark.parametrize(
'scoring, validation_fraction, early_stopping, n_iter_no_change, tol', [
('neg_mean_squared_error', .1, True, 5, 1e-7), # use scorer
('neg_mean_squared_error', None, True, 5, 1e-1), # use scorer on train
(None, .1, True, 5, 1e-7), # same with default scorer
(None, None, True, 5, 1e-1),
('loss', .1, True, 5, 1e-7), # use loss
('loss', None, True, 5, 1e-1), # use loss on training data
(None, None, False, 5, None), # no early stopping
])
def test_early_stopping_regression(scoring, validation_fraction,
early_stopping, n_iter_no_change, tol):
max_iter = 200
X, y = make_regression(n_samples=50, random_state=0)
gb = HistGradientBoostingRegressor(
verbose=1, # just for coverage
min_samples_leaf=5, # easier to overfit fast
scoring=scoring,
tol=tol,
early_stopping=early_stopping,
validation_fraction=validation_fraction,
max_iter=max_iter,
n_iter_no_change=n_iter_no_change,
random_state=0
)
gb.fit(X, y)
if early_stopping:
assert n_iter_no_change <= gb.n_iter_ < max_iter
else:
assert gb.n_iter_ == max_iter
@pytest.mark.parametrize('data', (
make_classification(n_samples=30, random_state=0),
make_classification(n_samples=30, n_classes=3, n_clusters_per_class=1,
random_state=0)
))
@pytest.mark.parametrize(
'scoring, validation_fraction, early_stopping, n_iter_no_change, tol', [
('accuracy', .1, True, 5, 1e-7), # use scorer
('accuracy', None, True, 5, 1e-1), # use scorer on training data
(None, .1, True, 5, 1e-7), # same with default scorer
(None, None, True, 5, 1e-1),
('loss', .1, True, 5, 1e-7), # use loss
('loss', None, True, 5, 1e-1), # use loss on training data
(None, None, False, 5, None), # no early stopping
])
def test_early_stopping_classification(data, scoring, validation_fraction,
early_stopping, n_iter_no_change, tol):
max_iter = 50
X, y = data
gb = HistGradientBoostingClassifier(
verbose=1, # just for coverage
min_samples_leaf=5, # easier to overfit fast
scoring=scoring,
tol=tol,
early_stopping=early_stopping,
validation_fraction=validation_fraction,
max_iter=max_iter,
n_iter_no_change=n_iter_no_change,
random_state=0
)
gb.fit(X, y)
if early_stopping is True:
assert n_iter_no_change <= gb.n_iter_ < max_iter
else:
assert gb.n_iter_ == max_iter
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, *_make_dumb_dataset(10000)),
(HistGradientBoostingClassifier, *_make_dumb_dataset(10001)),
(HistGradientBoostingRegressor, *_make_dumb_dataset(10000)),
(HistGradientBoostingRegressor, *_make_dumb_dataset(10001))
])
def test_early_stopping_default(GradientBoosting, X, y):
# Test that early stopping is enabled by default if and only if there
# are more than 10000 samples
gb = GradientBoosting(max_iter=10, n_iter_no_change=2, tol=1e-1)
gb.fit(X, y)
if X.shape[0] > 10000:
assert gb.n_iter_ < gb.max_iter
else:
assert gb.n_iter_ == gb.max_iter
@pytest.mark.parametrize(
'scores, n_iter_no_change, tol, stopping',
[
([], 1, 0.001, False), # not enough iterations
([1, 1, 1], 5, 0.001, False), # not enough iterations
([1, 1, 1, 1, 1], 5, 0.001, False), # not enough iterations
([1, 2, 3, 4, 5, 6], 5, 0.001, False), # significant improvement
([1, 2, 3, 4, 5, 6], 5, 0., False), # significant improvement
([1, 2, 3, 4, 5, 6], 5, 0.999, False), # significant improvement
([1, 2, 3, 4, 5, 6], 5, 5 - 1e-5, False), # significant improvement
([1] * 6, 5, 0., True), # no significant improvement
([1] * 6, 5, 0.001, True), # no significant improvement
([1] * 6, 5, 5, True), # no significant improvement
]
)
def test_should_stop(scores, n_iter_no_change, tol, stopping):
gbdt = HistGradientBoostingClassifier(
n_iter_no_change=n_iter_no_change, tol=tol
)
assert gbdt._should_stop(scores) == stopping
def test_least_absolute_deviation():
# For coverage only.
X, y = make_regression(n_samples=500, random_state=0)
gbdt = HistGradientBoostingRegressor(loss='least_absolute_deviation',
random_state=0)
gbdt.fit(X, y)
assert gbdt.score(X, y) > .9
@pytest.mark.parametrize('y', [([1., -2., 0.]), ([0., 0., 0.])])
def test_poisson_y_positive(y):
# Test that ValueError is raised if either one y_i < 0 or sum(y_i) <= 0.
err_msg = r"loss='poisson' requires non-negative y and sum\(y\) > 0."
gbdt = HistGradientBoostingRegressor(loss='poisson', random_state=0)
with pytest.raises(ValueError, match=err_msg):
gbdt.fit(np.zeros(shape=(len(y), 1)), y)
def test_poisson():
# For Poisson distributed target, Poisson loss should give better results
# than least squares measured in Poisson deviance as metric.
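    # The Poisson loss uses a log link (predictions are exp(raw_prediction)),
    # so its predictions are always strictly positive, whereas least_squares
    # can output non-positive values, hence the clipping before computing the
    # deviance below.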
rng = np.random.RandomState(42)
n_train, n_test, n_features = 500, 100, 100
X = make_low_rank_matrix(n_samples=n_train+n_test, n_features=n_features,
random_state=rng)
# We create a log-linear Poisson model and downscale coef as it will get
# exponentiated.
coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0)
y = rng.poisson(lam=np.exp(X @ coef))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test,
random_state=rng)
gbdt_pois = HistGradientBoostingRegressor(loss='poisson', random_state=rng)
gbdt_ls = HistGradientBoostingRegressor(loss='least_squares',
random_state=rng)
gbdt_pois.fit(X_train, y_train)
gbdt_ls.fit(X_train, y_train)
dummy = DummyRegressor(strategy="mean").fit(X_train, y_train)
for X, y in [(X_train, y_train), (X_test, y_test)]:
metric_pois = mean_poisson_deviance(y, gbdt_pois.predict(X))
# least_squares might produce non-positive predictions => clip
metric_ls = mean_poisson_deviance(y, np.clip(gbdt_ls.predict(X), 1e-15,
None))
metric_dummy = mean_poisson_deviance(y, dummy.predict(X))
assert metric_pois < metric_ls
assert metric_pois < metric_dummy
def test_binning_train_validation_are_separated():
# Make sure training and validation data are binned separately.
# See issue 13926
rng = np.random.RandomState(0)
validation_fraction = .2
gb = HistGradientBoostingClassifier(
early_stopping=True,
validation_fraction=validation_fraction,
random_state=rng
)
gb.fit(X_classification, y_classification)
mapper_training_data = gb.bin_mapper_
# Note that since the data is small there is no subsampling and the
# random_state doesn't matter
mapper_whole_data = _BinMapper(random_state=0)
mapper_whole_data.fit(X_classification)
n_samples = X_classification.shape[0]
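    # Note on the assertion below: X_classification has 100 samples of
    # continuous features, so all values are distinct and the number of
    # non-missing bins equals the number of binned samples. Binning only the
    # training split therefore gives int(.8 * 100) == 80 bins per feature,
    # while binning the whole data would give 100.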
assert np.all(mapper_training_data.n_bins_non_missing_ ==
int((1 - validation_fraction) * n_samples))
assert np.all(mapper_training_data.n_bins_non_missing_ !=
mapper_whole_data.n_bins_non_missing_)
def test_missing_values_trivial():
# sanity check for missing values support. With only one feature and
# y == isnan(X), the gbdt is supposed to reach perfect accuracy on the
# training set.
n_samples = 100
n_features = 1
rng = np.random.RandomState(0)
X = rng.normal(size=(n_samples, n_features))
    mask = rng.binomial(1, .5, size=X.shape).astype(bool)
X[mask] = np.nan
y = mask.ravel()
gb = HistGradientBoostingClassifier()
gb.fit(X, y)
assert gb.score(X, y) == pytest.approx(1)
@pytest.mark.parametrize('problem', ('classification', 'regression'))
@pytest.mark.parametrize(
'missing_proportion, expected_min_score_classification, '
'expected_min_score_regression', [
(.1, .97, .89),
(.2, .93, .81),
(.5, .79, .52)])
def test_missing_values_resilience(problem, missing_proportion,
expected_min_score_classification,
expected_min_score_regression):
# Make sure the estimators can deal with missing values and still yield
# decent predictions
rng = np.random.RandomState(0)
n_samples = 1000
n_features = 2
if problem == 'regression':
X, y = make_regression(n_samples=n_samples, n_features=n_features,
n_informative=n_features, random_state=rng)
gb = HistGradientBoostingRegressor()
expected_min_score = expected_min_score_regression
else:
X, y = make_classification(n_samples=n_samples, n_features=n_features,
n_informative=n_features, n_redundant=0,
n_repeated=0, random_state=rng)
gb = HistGradientBoostingClassifier()
expected_min_score = expected_min_score_classification
    mask = rng.binomial(1, missing_proportion, size=X.shape).astype(bool)
X[mask] = np.nan
gb.fit(X, y)
assert gb.score(X, y) > expected_min_score
@pytest.mark.parametrize('data', [
make_classification(random_state=0, n_classes=2),
make_classification(random_state=0, n_classes=3, n_informative=3)
], ids=['binary_crossentropy', 'categorical_crossentropy'])
def test_zero_division_hessians(data):
    # Non-regression test for issue #14018
# make sure we avoid zero division errors when computing the leaves values.
# If the learning rate is too high, the raw predictions are bad and will
# saturate the softmax (or sigmoid in binary classif). This leads to
# probabilities being exactly 0 or 1, gradients being constant, and
# hessians being zero.
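    # For the sigmoid/softmax, the hessian of a sample is p * (1 - p) (per
    # class), which underflows to exactly 0 once p saturates to 0 or 1 in
    # float precision; the leaf value computation must not divide by it.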
X, y = data
gb = HistGradientBoostingClassifier(learning_rate=100, max_iter=10)
gb.fit(X, y)
def test_small_trainset():
# Make sure that the small trainset is stratified and has the expected
# length (10k samples)
n_samples = 20000
original_distrib = {0: 0.1, 1: 0.2, 2: 0.3, 3: 0.4}
rng = np.random.RandomState(42)
X = rng.randn(n_samples).reshape(n_samples, 1)
y = [[class_] * int(prop * n_samples) for (class_, prop)
in original_distrib.items()]
y = shuffle(np.concatenate(y))
gb = HistGradientBoostingClassifier()
# Compute the small training set
X_small, y_small, _ = gb._get_small_trainset(X, y, seed=42,
sample_weight_train=None)
# Compute the class distribution in the small training set
unique, counts = np.unique(y_small, return_counts=True)
small_distrib = {class_: count / 10000 for (class_, count)
in zip(unique, counts)}
# Test that the small training set has the expected length
assert X_small.shape[0] == 10000
assert y_small.shape[0] == 10000
# Test that the class distributions in the whole dataset and in the small
# training set are identical
assert small_distrib == pytest.approx(original_distrib)
def test_missing_values_minmax_imputation():
    # Compare the built-in missing value handling of the histogram GBDT with an
# a-priori missing value imputation strategy that should yield the same
# results in terms of decision function.
#
# Each feature (containing NaNs) is replaced by 2 features:
# - one where the nans are replaced by min(feature) - 1
# - one where the nans are replaced by max(feature) + 1
# A split where nans go to the left has an equivalent split in the
# first (min) feature, and a split where nans go to the right has an
# equivalent split in the second (max) feature.
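    # For instance, a learned split "feature <= t, NaNs go left" is mimicked
    # on the min-imputed copy by "feature_min <= t", since the NaNs were
    # replaced by data_min_ - 1, which falls below any candidate threshold t.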
#
# Assuming the data is such that there is never a tie to select the best
# feature to split on during training, the learned decision trees should be
# strictly equivalent (learn a sequence of splits that encode the same
# decision function).
#
# The MinMaxImputer transformer is meant to be a toy implementation of the
# "Missing In Attributes" (MIA) missing value handling for decision trees
# https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305
# The implementation of MIA as an imputation transformer was suggested by
# "Remark 3" in https://arxiv.org/abs/1902.06931
class MinMaxImputer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
mm = MinMaxScaler().fit(X)
self.data_min_ = mm.data_min_
self.data_max_ = mm.data_max_
return self
def transform(self, X):
X_min, X_max = X.copy(), X.copy()
for feature_idx in range(X.shape[1]):
nan_mask = np.isnan(X[:, feature_idx])
X_min[nan_mask, feature_idx] = self.data_min_[feature_idx] - 1
X_max[nan_mask, feature_idx] = self.data_max_[feature_idx] + 1
return np.concatenate([X_min, X_max], axis=1)
def make_missing_value_data(n_samples=int(1e4), seed=0):
rng = np.random.RandomState(seed)
X, y = make_regression(n_samples=n_samples, n_features=4,
random_state=rng)
# Pre-bin the data to ensure a deterministic handling by the 2
# strategies and also make it easier to insert np.nan in a structured
# way:
X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X)
# First feature has missing values completely at random:
rnd_mask = rng.rand(X.shape[0]) > 0.9
X[rnd_mask, 0] = np.nan
# Second and third features have missing values for extreme values
# (censoring missingness):
low_mask = X[:, 1] == 0
X[low_mask, 1] = np.nan
high_mask = X[:, 2] == X[:, 2].max()
X[high_mask, 2] = np.nan
# Make the last feature nan pattern very informative:
y_max = np.percentile(y, 70)
y_max_mask = y >= y_max
y[y_max_mask] = y_max
X[y_max_mask, 3] = np.nan
# Check that there is at least one missing value in each feature:
for feature_idx in range(X.shape[1]):
assert any(np.isnan(X[:, feature_idx]))
# Let's use a test set to check that the learned decision function is
# the same as evaluated on unseen data. Otherwise it could just be the
# case that we find two independent ways to overfit the training set.
return train_test_split(X, y, random_state=rng)
    # n_samples needs to be large enough to minimize the likelihood of having
# several candidate splits with the same gain value in a given tree.
X_train, X_test, y_train, y_test = make_missing_value_data(
n_samples=int(1e4), seed=0)
    # Use a small number of leaf nodes and iterations so as to keep the models
    # under-fitting, which minimizes the likelihood of ties when training the
    # model.
gbm1 = HistGradientBoostingRegressor(max_iter=100,
max_leaf_nodes=5,
random_state=0)
gbm1.fit(X_train, y_train)
gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1))
gbm2.fit(X_train, y_train)
    # Check that the models reach the same score:
assert gbm1.score(X_train, y_train) == \
pytest.approx(gbm2.score(X_train, y_train))
assert gbm1.score(X_test, y_test) == \
pytest.approx(gbm2.score(X_test, y_test))
    # Check that the individual predictions match, as a finer-grained
    # decision function check.
assert_allclose(gbm1.predict(X_train), gbm2.predict(X_train))
assert_allclose(gbm1.predict(X_test), gbm2.predict(X_test))
def test_infinite_values():
# Basic test for infinite values
X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1)
y = np.array([0, 0, 1, 1])
gbdt = HistGradientBoostingRegressor(min_samples_leaf=1)
gbdt.fit(X, y)
np.testing.assert_allclose(gbdt.predict(X), y, atol=1e-4)
def test_consistent_lengths():
X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1)
y = np.array([0, 0, 1, 1])
sample_weight = np.array([.1, .3, .1])
gbdt = HistGradientBoostingRegressor()
with pytest.raises(ValueError,
match=r"sample_weight.shape == \(3,\), expected"):
gbdt.fit(X, y, sample_weight)
with pytest.raises(ValueError,
match="Found input variables with inconsistent number"):
gbdt.fit(X, y[1:])
def test_infinite_values_missing_values():
# High level test making sure that inf and nan values are properly handled
# when both are present. This is similar to
# test_split_on_nan_with_infinite_values() in test_grower.py, though we
# cannot check the predictions for binned values here.
X = np.asarray([-np.inf, 0, 1, np.inf, np.nan]).reshape(-1, 1)
y_isnan = np.isnan(X.ravel())
y_isinf = X.ravel() == np.inf
stump_clf = HistGradientBoostingClassifier(min_samples_leaf=1, max_iter=1,
learning_rate=1, max_depth=2)
assert stump_clf.fit(X, y_isinf).score(X, y_isinf) == 1
assert stump_clf.fit(X, y_isnan).score(X, y_isnan) == 1
def test_crossentropy_binary_problem():
# categorical_crossentropy should only be used if there are more than two
# classes present. PR #14869
X = [[1], [0]]
y = [0, 1]
gbrt = HistGradientBoostingClassifier(loss='categorical_crossentropy')
with pytest.raises(ValueError,
match="'categorical_crossentropy' is not suitable for"):
gbrt.fit(X, y)
@pytest.mark.parametrize("scoring", [None, 'loss'])
def test_string_target_early_stopping(scoring):
    # Non-regression tests for #14709: the targets need to be encoded before
    # computing the score.
rng = np.random.RandomState(42)
X = rng.randn(100, 10)
y = np.array(['x'] * 50 + ['y'] * 50, dtype=object)
gbrt = HistGradientBoostingClassifier(n_iter_no_change=10, scoring=scoring)
gbrt.fit(X, y)
def test_zero_sample_weights_regression():
# Make sure setting a SW to zero amounts to ignoring the corresponding
# sample
X = [[1, 0],
[1, 0],
[1, 0],
[0, 1]]
y = [0, 0, 1, 0]
# ignore the first 2 training samples by setting their weight to 0
sample_weight = [0, 0, 1, 1]
gb = HistGradientBoostingRegressor(min_samples_leaf=1)
gb.fit(X, y, sample_weight=sample_weight)
assert gb.predict([[1, 0]])[0] > 0.5
def test_zero_sample_weights_classification():
# Make sure setting a SW to zero amounts to ignoring the corresponding
# sample
X = [[1, 0],
[1, 0],
[1, 0],
[0, 1]]
y = [0, 0, 1, 0]
# ignore the first 2 training samples by setting their weight to 0
sample_weight = [0, 0, 1, 1]
gb = HistGradientBoostingClassifier(loss='binary_crossentropy',
min_samples_leaf=1)
gb.fit(X, y, sample_weight=sample_weight)
assert_array_equal(gb.predict([[1, 0]]), [1])
X = [[1, 0],
[1, 0],
[1, 0],
[0, 1],
[1, 1]]
y = [0, 0, 1, 0, 2]
# ignore the first 2 training samples by setting their weight to 0
sample_weight = [0, 0, 1, 1, 1]
gb = HistGradientBoostingClassifier(loss='categorical_crossentropy',
min_samples_leaf=1)
gb.fit(X, y, sample_weight=sample_weight)
assert_array_equal(gb.predict([[1, 0]]), [1])
@pytest.mark.parametrize('problem', (
'regression',
'binary_classification',
'multiclass_classification'
))
@pytest.mark.parametrize('duplication', ('half', 'all'))
def test_sample_weight_effect(problem, duplication):
# High level test to make sure that duplicating a sample is equivalent to
# giving it weight of 2.
    # The test would fail for n_samples > 255 because binning does not take
    # sample weights into account. Keeping n_samples <= 255 makes sure only
    # unique values are used, so sample weights have no effect on binning.
n_samples = 255
n_features = 2
if problem == 'regression':
X, y = make_regression(n_samples=n_samples, n_features=n_features,
n_informative=n_features, random_state=0)
Klass = HistGradientBoostingRegressor
else:
n_classes = 2 if problem == 'binary_classification' else 3
X, y = make_classification(n_samples=n_samples, n_features=n_features,
n_informative=n_features, n_redundant=0,
n_clusters_per_class=1,
n_classes=n_classes, random_state=0)
Klass = HistGradientBoostingClassifier
# This test can't pass if min_samples_leaf > 1 because that would force 2
# samples to be in the same node in est_sw, while these samples would be
# free to be separate in est_dup: est_dup would just group together the
# duplicated samples.
est = Klass(min_samples_leaf=1)
# Create dataset with duplicate and corresponding sample weights
if duplication == 'half':
lim = n_samples // 2
else:
lim = n_samples
X_dup = np.r_[X, X[:lim]]
y_dup = np.r_[y, y[:lim]]
sample_weight = np.ones(shape=(n_samples))
sample_weight[:lim] = 2
est_sw = clone(est).fit(X, y, sample_weight=sample_weight)
est_dup = clone(est).fit(X_dup, y_dup)
# checking raw_predict is stricter than just predict for classification
assert np.allclose(est_sw._raw_predict(X_dup),
est_dup._raw_predict(X_dup))
@pytest.mark.parametrize('loss_name', ('least_squares',
'least_absolute_deviation'))
def test_sum_hessians_are_sample_weight(loss_name):
# For losses with constant hessians, the sum_hessians field of the
# histograms must be equal to the sum of the sample weight of samples at
# the corresponding bin.
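    # For these constant-hessian losses the per-sample hessian reduces to the
    # sample weight, so summing hessians over the samples falling into a bin
    # gives that bin's total sample weight, which is what we check below.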
rng = np.random.RandomState(0)
n_samples = 1000
n_features = 2
X, y = make_regression(n_samples=n_samples, n_features=n_features,
random_state=rng)
bin_mapper = _BinMapper()
X_binned = bin_mapper.fit_transform(X)
sample_weight = rng.normal(size=n_samples)
loss = _LOSSES[loss_name](sample_weight=sample_weight)
gradients, hessians = loss.init_gradients_and_hessians(
n_samples=n_samples, prediction_dim=1, sample_weight=sample_weight)
raw_predictions = rng.normal(size=(1, n_samples))
loss.update_gradients_and_hessians(gradients, hessians, y,
raw_predictions, sample_weight)
# build sum_sample_weight which contains the sum of the sample weights at
# each bin (for each feature). This must be equal to the sum_hessians
# field of the corresponding histogram
sum_sw = np.zeros(shape=(n_features, bin_mapper.n_bins))
for feature_idx in range(n_features):
for sample_idx in range(n_samples):
sum_sw[feature_idx, X_binned[sample_idx, feature_idx]] += (
sample_weight[sample_idx])
# Build histogram
grower = TreeGrower(X_binned, gradients[0], hessians[0],
n_bins=bin_mapper.n_bins)
histograms = grower.histogram_builder.compute_histograms_brute(
grower.root.sample_indices)
for feature_idx in range(n_features):
for bin_idx in range(bin_mapper.n_bins):
assert histograms[feature_idx, bin_idx]['sum_hessians'] == (
pytest.approx(sum_sw[feature_idx, bin_idx], rel=1e-5))
def test_max_depth_max_leaf_nodes():
    # Non-regression test for
# https://github.com/scikit-learn/scikit-learn/issues/16179
# there was a bug when the max_depth and the max_leaf_nodes criteria were
# met at the same time, which would lead to max_leaf_nodes not being
# respected.
X, y = make_classification(random_state=0)
est = HistGradientBoostingClassifier(max_depth=2, max_leaf_nodes=3,
max_iter=1).fit(X, y)
tree = est._predictors[0][0]
assert tree.get_max_depth() == 2
assert tree.get_n_leaf_nodes() == 3 # would be 4 prior to bug fix
def test_early_stopping_on_test_set_with_warm_start():
    # Non-regression test for #16661 where the second fit fails with
# warm_start=True, early_stopping is on, and no validation set
X, y = make_classification(random_state=0)
gb = HistGradientBoostingClassifier(
max_iter=1, scoring='loss', warm_start=True, early_stopping=True,
n_iter_no_change=1, validation_fraction=None)
gb.fit(X, y)
# does not raise on second call
gb.set_params(max_iter=2)
gb.fit(X, y)
@pytest.mark.parametrize('Est', (HistGradientBoostingClassifier,
HistGradientBoostingRegressor))
def test_single_node_trees(Est):
# Make sure it's still possible to build single-node trees. In that case
# the value of the root is set to 0. That's a correct value: if the tree is
# single-node that's because min_gain_to_split is not respected right from
# the root, so we don't want the tree to have any impact on the
# predictions.
X, y = make_classification(random_state=0)
y[:] = 1 # constant target will lead to a single root node
est = Est(max_iter=20)
est.fit(X, y)
assert all(len(predictor[0].nodes) == 1 for predictor in est._predictors)
assert all(predictor[0].nodes[0]['value'] == 0
for predictor in est._predictors)
# Still gives correct predictions thanks to the baseline prediction
assert_allclose(est.predict(X), y)
@pytest.mark.parametrize('Est, loss, X, y', [
(
HistGradientBoostingClassifier,
BinaryCrossEntropy(sample_weight=None),
X_classification,
y_classification
),
(
HistGradientBoostingRegressor,
LeastSquares(sample_weight=None),
X_regression,
y_regression
)
])
def test_custom_loss(Est, loss, X, y):
est = Est(loss=loss, max_iter=20)
est.fit(X, y)

View file

@ -0,0 +1,399 @@
import numpy as np
import pytest
from pytest import approx
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
def _make_training_data(n_bins=256, constant_hessian=True):
rng = np.random.RandomState(42)
n_samples = 10000
# Generate some test data directly binned so as to test the grower code
# independently of the binning logic.
X_binned = rng.randint(0, n_bins - 1, size=(n_samples, 2),
dtype=X_BINNED_DTYPE)
X_binned = np.asfortranarray(X_binned)
def true_decision_function(input_features):
"""Ground truth decision function
This is a very simple yet asymmetric decision tree. Therefore the
grower code should have no trouble recovering the decision function
from 10000 training samples.
"""
if input_features[0] <= n_bins // 2:
return -1
else:
return -1 if input_features[1] <= n_bins // 3 else 1
target = np.array([true_decision_function(x) for x in X_binned],
dtype=Y_DTYPE)
# Assume a square loss applied to an initial model that always predicts 0
# (hardcoded for this test):
all_gradients = target.astype(G_H_DTYPE)
shape_hessians = 1 if constant_hessian else all_gradients.shape
all_hessians = np.ones(shape=shape_hessians, dtype=G_H_DTYPE)
return X_binned, all_gradients, all_hessians
def _check_children_consistency(parent, left, right):
# Make sure the samples are correctly dispatched from a parent to its
# children
assert parent.left_child is left
assert parent.right_child is right
# each sample from the parent is propagated to one of the two children
assert (len(left.sample_indices) + len(right.sample_indices)
== len(parent.sample_indices))
assert (set(left.sample_indices).union(set(right.sample_indices))
== set(parent.sample_indices))
# samples are sent either to the left or the right node, never to both
assert (set(left.sample_indices).intersection(set(right.sample_indices))
== set())
@pytest.mark.parametrize(
'n_bins, constant_hessian, stopping_param, shrinkage',
[
(11, True, "min_gain_to_split", 0.5),
(11, False, "min_gain_to_split", 1.),
(11, True, "max_leaf_nodes", 1.),
(11, False, "max_leaf_nodes", 0.1),
(42, True, "max_leaf_nodes", 0.01),
(42, False, "max_leaf_nodes", 1.),
(256, True, "min_gain_to_split", 1.),
(256, True, "max_leaf_nodes", 0.1),
]
)
def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage):
X_binned, all_gradients, all_hessians = _make_training_data(
n_bins=n_bins, constant_hessian=constant_hessian)
n_samples = X_binned.shape[0]
if stopping_param == "max_leaf_nodes":
stopping_param = {"max_leaf_nodes": 3}
else:
stopping_param = {"min_gain_to_split": 0.01}
grower = TreeGrower(X_binned, all_gradients, all_hessians,
n_bins=n_bins, shrinkage=shrinkage,
min_samples_leaf=1, **stopping_param)
    # The root node is not yet split, but the best possible split has
# already been evaluated:
assert grower.root.left_child is None
assert grower.root.right_child is None
root_split = grower.root.split_info
assert root_split.feature_idx == 0
assert root_split.bin_idx == n_bins // 2
assert len(grower.splittable_nodes) == 1
# Calling split next applies the next split and computes the best split
# for each of the two newly introduced children nodes.
left_node, right_node = grower.split_next()
    # All training samples have been split between the two nodes, approximately
# 50%/50%
_check_children_consistency(grower.root, left_node, right_node)
assert len(left_node.sample_indices) > 0.4 * n_samples
assert len(left_node.sample_indices) < 0.6 * n_samples
if grower.min_gain_to_split > 0:
# The left node is too pure: there is no gain to split it further.
assert left_node.split_info.gain < grower.min_gain_to_split
assert left_node in grower.finalized_leaves
    # The right node can still be split further, this time on feature #1
split_info = right_node.split_info
assert split_info.gain > 1.
assert split_info.feature_idx == 1
assert split_info.bin_idx == n_bins // 3
assert right_node.left_child is None
assert right_node.right_child is None
# The right split has not been applied yet. Let's do it now:
assert len(grower.splittable_nodes) == 1
right_left_node, right_right_node = grower.split_next()
_check_children_consistency(right_node, right_left_node, right_right_node)
assert len(right_left_node.sample_indices) > 0.1 * n_samples
assert len(right_left_node.sample_indices) < 0.2 * n_samples
assert len(right_right_node.sample_indices) > 0.2 * n_samples
assert len(right_right_node.sample_indices) < 0.4 * n_samples
    # All the leaves are pure; it is not possible to split any further:
assert not grower.splittable_nodes
grower._apply_shrinkage()
# Check the values of the leaves:
assert grower.root.left_child.value == approx(shrinkage)
assert grower.root.right_child.left_child.value == approx(shrinkage)
assert grower.root.right_child.right_child.value == approx(-shrinkage,
rel=1e-3)
def test_predictor_from_grower():
# Build a tree on the toy 3-leaf dataset to extract the predictor.
n_bins = 256
X_binned, all_gradients, all_hessians = _make_training_data(
n_bins=n_bins)
grower = TreeGrower(X_binned, all_gradients, all_hessians,
n_bins=n_bins, shrinkage=1.,
max_leaf_nodes=3, min_samples_leaf=5)
grower.grow()
assert grower.n_nodes == 5 # (2 decision nodes + 3 leaves)
# Check that the node structure can be converted into a predictor
# object to perform predictions at scale
predictor = grower.make_predictor()
assert predictor.nodes.shape[0] == 5
assert predictor.nodes['is_leaf'].sum() == 3
# Probe some predictions for each leaf of the tree
# each group of 3 samples corresponds to a condition in _make_training_data
input_data = np.array([
[0, 0],
[42, 99],
[128, 254],
[129, 0],
[129, 85],
[254, 85],
[129, 86],
[129, 254],
[242, 100],
], dtype=np.uint8)
missing_values_bin_idx = n_bins - 1
predictions = predictor.predict_binned(input_data, missing_values_bin_idx)
expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1]
assert np.allclose(predictions, expected_targets)
# Check that training set can be recovered exactly:
predictions = predictor.predict_binned(X_binned, missing_values_bin_idx)
assert np.allclose(predictions, -all_gradients)
@pytest.mark.parametrize(
'n_samples, min_samples_leaf, n_bins, constant_hessian, noise',
[
(11, 10, 7, True, 0),
(13, 10, 42, False, 0),
(56, 10, 255, True, 0.1),
(101, 3, 7, True, 0),
(200, 42, 42, False, 0),
(300, 55, 255, True, 0.1),
(300, 301, 255, True, 0.1),
]
)
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins,
constant_hessian, noise):
rng = np.random.RandomState(seed=0)
# data = linear target, 3 features, 1 irrelevant.
X = rng.normal(size=(n_samples, 3))
y = X[:, 0] - X[:, 1]
if noise:
y_scale = y.std()
y += rng.normal(scale=noise, size=n_samples) * y_scale
mapper = _BinMapper(n_bins=n_bins)
X = mapper.fit_transform(X)
all_gradients = y.astype(G_H_DTYPE)
shape_hessian = 1 if constant_hessian else all_gradients.shape
all_hessians = np.ones(shape=shape_hessian, dtype=G_H_DTYPE)
grower = TreeGrower(X, all_gradients, all_hessians,
n_bins=n_bins, shrinkage=1.,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=n_samples)
grower.grow()
predictor = grower.make_predictor(
bin_thresholds=mapper.bin_thresholds_)
if n_samples >= min_samples_leaf:
for node in predictor.nodes:
if node['is_leaf']:
assert node['count'] >= min_samples_leaf
else:
assert predictor.nodes.shape[0] == 1
assert predictor.nodes[0]['is_leaf']
assert predictor.nodes[0]['count'] == n_samples
@pytest.mark.parametrize('n_samples, min_samples_leaf', [
(99, 50),
(100, 50)])
def test_min_samples_leaf_root(n_samples, min_samples_leaf):
# Make sure root node isn't split if n_samples is not at least twice
# min_samples_leaf
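    # A split must leave at least min_samples_leaf samples in each of the two
    # children, so the root can only be split when
    # n_samples >= 2 * min_samples_leaf.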
rng = np.random.RandomState(seed=0)
n_bins = 256
# data = linear target, 3 features, 1 irrelevant.
X = rng.normal(size=(n_samples, 3))
y = X[:, 0] - X[:, 1]
mapper = _BinMapper(n_bins=n_bins)
X = mapper.fit_transform(X)
all_gradients = y.astype(G_H_DTYPE)
all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
grower = TreeGrower(X, all_gradients, all_hessians,
n_bins=n_bins, shrinkage=1.,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=n_samples)
grower.grow()
if n_samples >= min_samples_leaf * 2:
assert len(grower.finalized_leaves) >= 2
else:
assert len(grower.finalized_leaves) == 1
def assert_is_stump(grower):
# To assert that stumps are created when max_depth=1
for leaf in (grower.root.left_child, grower.root.right_child):
assert leaf.left_child is None
assert leaf.right_child is None
@pytest.mark.parametrize('max_depth', [1, 2, 3])
def test_max_depth(max_depth):
# Make sure max_depth parameter works as expected
rng = np.random.RandomState(seed=0)
n_bins = 256
n_samples = 1000
# data = linear target, 3 features, 1 irrelevant.
X = rng.normal(size=(n_samples, 3))
y = X[:, 0] - X[:, 1]
mapper = _BinMapper(n_bins=n_bins)
X = mapper.fit_transform(X)
all_gradients = y.astype(G_H_DTYPE)
all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
grower = TreeGrower(X, all_gradients, all_hessians, max_depth=max_depth)
grower.grow()
depth = max(leaf.depth for leaf in grower.finalized_leaves)
assert depth == max_depth
if max_depth == 1:
assert_is_stump(grower)
def test_input_validation():
X_binned, all_gradients, all_hessians = _make_training_data()
X_binned_float = X_binned.astype(np.float32)
with pytest.raises(NotImplementedError,
match="X_binned must be of type uint8"):
TreeGrower(X_binned_float, all_gradients, all_hessians)
X_binned_C_array = np.ascontiguousarray(X_binned)
with pytest.raises(
ValueError,
match="X_binned should be passed as Fortran contiguous array"):
TreeGrower(X_binned_C_array, all_gradients, all_hessians)
def test_init_parameters_validation():
X_binned, all_gradients, all_hessians = _make_training_data()
with pytest.raises(ValueError,
match="min_gain_to_split=-1 must be positive"):
TreeGrower(X_binned, all_gradients, all_hessians,
min_gain_to_split=-1)
with pytest.raises(ValueError,
match="min_hessian_to_split=-1 must be positive"):
TreeGrower(X_binned, all_gradients, all_hessians,
min_hessian_to_split=-1)
def test_missing_value_predict_only():
# Make sure that missing values are supported at predict time even if they
# were not encountered in the training data: the missing values are
# assigned to whichever child has the most samples.
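    # Since has_missing_values=False, the grower never sees NaNs during
    # training; at predict time an unseen NaN follows, at each split, the
    # branch that received more training samples, which is the path walked
    # below.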
rng = np.random.RandomState(0)
n_samples = 100
X_binned = rng.randint(0, 256, size=(n_samples, 1), dtype=np.uint8)
X_binned = np.asfortranarray(X_binned)
gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
hessians = np.ones(shape=1, dtype=G_H_DTYPE)
grower = TreeGrower(X_binned, gradients, hessians, min_samples_leaf=5,
has_missing_values=False)
grower.grow()
predictor = grower.make_predictor()
    # Go from the root to a leaf, always following the node with the most
    # samples. That's the path nans are supposed to take.
node = predictor.nodes[0]
while not node['is_leaf']:
left = predictor.nodes[node['left']]
right = predictor.nodes[node['right']]
node = left if left['count'] > right['count'] else right
prediction_main_path = node['value']
# now build X_test with only nans, and make sure all predictions are equal
# to prediction_main_path
all_nans = np.full(shape=(n_samples, 1), fill_value=np.nan)
assert np.all(predictor.predict(all_nans) == prediction_main_path)
def test_split_on_nan_with_infinite_values():
    # Make sure split-on-nan situations are respected even when there are
# samples with +inf values (we set the threshold to +inf when we have a
# split on nan so this test makes sure this does not introduce edge-case
# bugs). We need to use the private API so that we can also test
# predict_binned().
X = np.array([0, 1, np.inf, np.nan, np.nan]).reshape(-1, 1)
# the gradient values will force a split on nan situation
gradients = np.array([0, 0, 0, 100, 100], dtype=G_H_DTYPE)
hessians = np.ones(shape=1, dtype=G_H_DTYPE)
bin_mapper = _BinMapper()
X_binned = bin_mapper.fit_transform(X)
n_bins_non_missing = 3
has_missing_values = True
grower = TreeGrower(X_binned, gradients, hessians,
n_bins_non_missing=n_bins_non_missing,
has_missing_values=has_missing_values,
min_samples_leaf=1)
grower.grow()
predictor = grower.make_predictor(
bin_thresholds=bin_mapper.bin_thresholds_
)
# sanity check: this was a split on nan
assert predictor.nodes[0]['threshold'] == np.inf
assert predictor.nodes[0]['bin_threshold'] == n_bins_non_missing - 1
# Make sure in particular that the +inf sample is mapped to the left child
# Note that lightgbm "fails" here and will assign the inf sample to the
# right child, even though it's a "split on nan" situation.
predictions = predictor.predict(X)
predictions_binned = predictor.predict_binned(
X_binned, missing_values_bin_idx=bin_mapper.missing_values_bin_idx_)
np.testing.assert_allclose(predictions, -gradients)
np.testing.assert_allclose(predictions_binned, -gradients)

View file

@ -0,0 +1,202 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose
from numpy.testing import assert_array_equal
from sklearn.ensemble._hist_gradient_boosting.histogram import (
_build_histogram_naive,
_build_histogram,
_build_histogram_no_hessian,
_build_histogram_root_no_hessian,
_build_histogram_root,
_subtract_histograms
)
from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
@pytest.mark.parametrize(
'build_func', [_build_histogram_naive, _build_histogram])
def test_build_histogram(build_func):
binned_feature = np.array([0, 2, 0, 1, 2, 0, 2, 1], dtype=X_BINNED_DTYPE)
# Small sample_indices (below unrolling threshold)
ordered_gradients = np.array([0, 1, 3], dtype=G_H_DTYPE)
ordered_hessians = np.array([1, 1, 2], dtype=G_H_DTYPE)
sample_indices = np.array([0, 2, 3], dtype=np.uint32)
hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE)
build_func(0, sample_indices, binned_feature, ordered_gradients,
ordered_hessians, hist)
hist = hist[0]
assert_array_equal(hist['count'], [2, 1, 0])
assert_allclose(hist['sum_gradients'], [1, 3, 0])
assert_allclose(hist['sum_hessians'], [2, 2, 0])
# Larger sample_indices (above unrolling threshold)
sample_indices = np.array([0, 2, 3, 6, 7], dtype=np.uint32)
ordered_gradients = np.array([0, 1, 3, 0, 1], dtype=G_H_DTYPE)
ordered_hessians = np.array([1, 1, 2, 1, 0], dtype=G_H_DTYPE)
hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE)
build_func(0, sample_indices, binned_feature, ordered_gradients,
ordered_hessians, hist)
hist = hist[0]
assert_array_equal(hist['count'], [2, 2, 1])
assert_allclose(hist['sum_gradients'], [1, 4, 0])
assert_allclose(hist['sum_hessians'], [2, 2, 1])
def test_histogram_sample_order_independence():
# Make sure the order of the samples has no impact on the histogram
# computations
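    # A histogram is a per-bin sum over samples and addition is commutative,
    # so permuting the samples must leave every bin unchanged. Counts are
    # compared exactly; the float fields use assert_allclose because the
    # accumulation order may differ.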
rng = np.random.RandomState(42)
n_sub_samples = 100
n_samples = 1000
n_bins = 256
binned_feature = rng.randint(0, n_bins - 1, size=n_samples,
dtype=X_BINNED_DTYPE)
sample_indices = rng.choice(np.arange(n_samples, dtype=np.uint32),
n_sub_samples, replace=False)
ordered_gradients = rng.randn(n_sub_samples).astype(G_H_DTYPE)
hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_build_histogram_no_hessian(0, sample_indices, binned_feature,
ordered_gradients, hist_gc)
ordered_hessians = rng.exponential(size=n_sub_samples).astype(G_H_DTYPE)
hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_build_histogram(0, sample_indices, binned_feature,
ordered_gradients, ordered_hessians, hist_ghc)
permutation = rng.permutation(n_sub_samples)
hist_gc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_build_histogram_no_hessian(0, sample_indices[permutation],
binned_feature, ordered_gradients[permutation],
hist_gc_perm)
hist_ghc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_build_histogram(0, sample_indices[permutation], binned_feature,
ordered_gradients[permutation],
ordered_hessians[permutation], hist_ghc_perm)
hist_gc = hist_gc[0]
hist_ghc = hist_ghc[0]
hist_gc_perm = hist_gc_perm[0]
hist_ghc_perm = hist_ghc_perm[0]
assert_allclose(hist_gc['sum_gradients'], hist_gc_perm['sum_gradients'])
assert_array_equal(hist_gc['count'], hist_gc_perm['count'])
assert_allclose(hist_ghc['sum_gradients'], hist_ghc_perm['sum_gradients'])
assert_allclose(hist_ghc['sum_hessians'], hist_ghc_perm['sum_hessians'])
assert_array_equal(hist_ghc['count'], hist_ghc_perm['count'])
@pytest.mark.parametrize("constant_hessian", [True, False])
def test_unrolled_equivalent_to_naive(constant_hessian):
# Make sure the different unrolled histogram computations give the same
# results as the naive one.
rng = np.random.RandomState(42)
n_samples = 10
n_bins = 5
sample_indices = np.arange(n_samples).astype(np.uint32)
binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8)
ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
if constant_hessian:
ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
else:
ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
hist_gc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
hist_ghc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
hist_naive = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_build_histogram_root_no_hessian(0, binned_feature,
ordered_gradients, hist_gc_root)
_build_histogram_root(0, binned_feature, ordered_gradients,
ordered_hessians, hist_ghc_root)
_build_histogram_no_hessian(0, sample_indices, binned_feature,
ordered_gradients, hist_gc)
_build_histogram(0, sample_indices, binned_feature,
ordered_gradients, ordered_hessians, hist_ghc)
_build_histogram_naive(0, sample_indices, binned_feature,
ordered_gradients, ordered_hessians, hist_naive)
hist_naive = hist_naive[0]
hist_gc_root = hist_gc_root[0]
hist_ghc_root = hist_ghc_root[0]
hist_gc = hist_gc[0]
hist_ghc = hist_ghc[0]
for hist in (hist_gc_root, hist_ghc_root, hist_gc, hist_ghc):
assert_array_equal(hist['count'], hist_naive['count'])
assert_allclose(hist['sum_gradients'], hist_naive['sum_gradients'])
for hist in (hist_ghc_root, hist_ghc):
assert_allclose(hist['sum_hessians'], hist_naive['sum_hessians'])
for hist in (hist_gc_root, hist_gc):
assert_array_equal(hist['sum_hessians'], np.zeros(n_bins))
@pytest.mark.parametrize("constant_hessian", [True, False])
def test_hist_subtraction(constant_hessian):
# Make sure the histogram subtraction trick gives the same result as the
# classical method.
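    # Every sample of the parent node goes to exactly one child, so bin-wise
    # hist_parent == hist_left + hist_right (for count, sum_gradients and
    # sum_hessians). A child histogram can thus be obtained as
    # parent - sibling instead of being rebuilt from the samples.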
rng = np.random.RandomState(42)
n_samples = 10
n_bins = 5
sample_indices = np.arange(n_samples).astype(np.uint32)
binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8)
ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
if constant_hessian:
ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
else:
ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
hist_parent = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
if constant_hessian:
_build_histogram_no_hessian(0, sample_indices, binned_feature,
ordered_gradients, hist_parent)
else:
_build_histogram(0, sample_indices, binned_feature,
ordered_gradients, ordered_hessians, hist_parent)
    mask = rng.randint(0, 2, n_samples).astype(bool)
sample_indices_left = sample_indices[mask]
ordered_gradients_left = ordered_gradients[mask]
ordered_hessians_left = ordered_hessians[mask]
hist_left = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
if constant_hessian:
_build_histogram_no_hessian(0, sample_indices_left,
binned_feature, ordered_gradients_left,
hist_left)
else:
_build_histogram(0, sample_indices_left, binned_feature,
ordered_gradients_left, ordered_hessians_left,
hist_left)
sample_indices_right = sample_indices[~mask]
ordered_gradients_right = ordered_gradients[~mask]
ordered_hessians_right = ordered_hessians[~mask]
hist_right = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
if constant_hessian:
_build_histogram_no_hessian(0, sample_indices_right,
binned_feature, ordered_gradients_right,
hist_right)
else:
_build_histogram(0, sample_indices_right, binned_feature,
ordered_gradients_right, ordered_hessians_right,
hist_right)
hist_left_sub = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
hist_right_sub = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
_subtract_histograms(0, n_bins, hist_parent, hist_right, hist_left_sub)
_subtract_histograms(0, n_bins, hist_parent, hist_left, hist_right_sub)
for key in ('count', 'sum_hessians', 'sum_gradients'):
assert_allclose(hist_left[key], hist_left_sub[key], rtol=1e-6)
assert_allclose(hist_right[key], hist_right_sub[key], rtol=1e-6)

View file

@ -0,0 +1,318 @@
import numpy as np
from numpy.testing import assert_almost_equal
from numpy.testing import assert_allclose
from scipy.optimize import newton
from sklearn.utils import assert_all_finite
from sklearn.utils.fixes import sp_version, parse_version
import pytest
from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES
from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.utils._testing import skip_if_32bit
def get_derivatives_helper(loss):
"""Return get_gradients() and get_hessians() functions for a given loss.
"""
def get_gradients(y_true, raw_predictions):
# create gradients and hessians array, update inplace, and return
gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE)
hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE)
loss.update_gradients_and_hessians(gradients, hessians, y_true,
raw_predictions, None)
return gradients
def get_hessians(y_true, raw_predictions):
# create gradients and hessians array, update inplace, and return
gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE)
hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE)
loss.update_gradients_and_hessians(gradients, hessians, y_true,
raw_predictions, None)
if loss.__class__.__name__ == 'LeastSquares':
# hessians aren't updated because they're constant:
            # the value is 1 (and not 2) because the loss is actually a half
# least squares loss.
hessians = np.full_like(raw_predictions, fill_value=1)
elif loss.__class__.__name__ == 'LeastAbsoluteDeviation':
# hessians aren't updated because they're constant
hessians = np.full_like(raw_predictions, fill_value=0)
return hessians
return get_gradients, get_hessians
@pytest.mark.parametrize('loss, x0, y_true', [
('least_squares', -2., 42),
('least_squares', 117., 1.05),
('least_squares', 0., 0.),
# I don't understand why but y_true == 0 fails :/
# ('binary_crossentropy', 0.3, 0),
('binary_crossentropy', -12, 1),
('binary_crossentropy', 30, 1),
('poisson', 12., 1.),
('poisson', 0., 2.),
('poisson', -22., 10.),
])
@pytest.mark.skipif(sp_version == parse_version('1.2.0'),
reason='bug in scipy 1.2.0, see scipy issue #9608')
@skip_if_32bit
def test_derivatives(loss, x0, y_true):
# Check that gradients are zero when the loss is minimized on 1D array
# using Halley's method with the first and second order derivatives
# computed by the Loss instance.
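    # scipy.optimize.newton switches to Halley's method when fprime2 is
    # given. At the optimum found, the gradient should vanish and
    # inverse_link_function(optimum) should recover y_true, which is what the
    # assertions below check.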
loss = _LOSSES[loss](sample_weight=None)
y_true = np.array([y_true], dtype=Y_DTYPE)
x0 = np.array([x0], dtype=Y_DTYPE).reshape(1, 1)
get_gradients, get_hessians = get_derivatives_helper(loss)
def func(x):
return loss.pointwise_loss(y_true, x)
def fprime(x):
return get_gradients(y_true, x)
def fprime2(x):
return get_hessians(y_true, x)
optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2,
maxiter=70, tol=2e-8)
assert np.allclose(loss.inverse_link_function(optimum), y_true)
assert np.allclose(loss.pointwise_loss(y_true, optimum), 0)
assert np.allclose(get_gradients(y_true, optimum), 0, atol=1e-7)
@pytest.mark.parametrize('loss, n_classes, prediction_dim', [
('least_squares', 0, 1),
('least_absolute_deviation', 0, 1),
('binary_crossentropy', 2, 1),
('categorical_crossentropy', 3, 3),
('poisson', 0, 1),
])
@pytest.mark.skipif(Y_DTYPE != np.float64,
reason='Need 64 bits float precision for numerical checks')
def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0):
# Make sure gradients and hessians computed in the loss are correct, by
# comparing with their approximations computed with finite central
# differences.
# See https://en.wikipedia.org/wiki/Finite_difference.
rng = np.random.RandomState(seed)
n_samples = 100
if loss in ('least_squares', 'least_absolute_deviation'):
y_true = rng.normal(size=n_samples).astype(Y_DTYPE)
    elif loss == 'poisson':
y_true = rng.poisson(size=n_samples).astype(Y_DTYPE)
else:
y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE)
raw_predictions = rng.normal(
size=(prediction_dim, n_samples)
).astype(Y_DTYPE)
loss = _LOSSES[loss](sample_weight=None)
get_gradients, get_hessians = get_derivatives_helper(loss)
# only take gradients and hessians of first tree / class.
gradients = get_gradients(y_true, raw_predictions)[0, :].ravel()
hessians = get_hessians(y_true, raw_predictions)[0, :].ravel()
# Approximate gradients
# For multiclass loss, we should only change the predictions of one tree
    # (here the first), hence the use of offset[0, :] = eps below.
# As a softmax is computed, offsetting the whole array by a constant would
# have no effect on the probabilities, and thus on the loss
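    # The central-difference approximations used below are:
    #   gradient ~= (f(x + eps/2) - f(x - eps/2)) / eps
    #   hessian  ~= (f(x + eps) + f(x - eps) - 2 * f(x)) / eps**2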
eps = 1e-9
offset = np.zeros_like(raw_predictions)
offset[0, :] = eps
f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset / 2)
f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset / 2)
numerical_gradients = (f_plus_eps - f_minus_eps) / eps
# Approximate hessians
eps = 1e-4 # need big enough eps as we divide by its square
offset[0, :] = eps
f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset)
f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset)
f = loss.pointwise_loss(y_true, raw_predictions)
numerical_hessians = (f_plus_eps + f_minus_eps - 2 * f) / eps**2
assert_allclose(numerical_gradients, gradients, rtol=1e-4, atol=1e-7)
assert_allclose(numerical_hessians, hessians, rtol=1e-4, atol=1e-7)
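# Illustrative sketch (not part of the original test suite): the central
# finite-difference scheme used above, shown on a scalar function whose
# exact derivative is known. The helper name is hypothetical.
def _sketch_central_difference(x=2.0, eps=1e-6):
    def f(t):
        return t ** 3
    numerical = (f(x + eps / 2) - f(x - eps / 2)) / eps
    exact = 3 * x ** 2  # analytical derivative of t**3
    assert abs(numerical - exact) < 1e-5
    return numerical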
def test_baseline_least_squares():
rng = np.random.RandomState(0)
loss = _LOSSES['least_squares'](sample_weight=None)
y_train = rng.normal(size=100)
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert baseline_prediction.shape == tuple() # scalar
assert baseline_prediction.dtype == y_train.dtype
# Make sure baseline prediction is the mean of all targets
assert_almost_equal(baseline_prediction, y_train.mean())
assert np.allclose(loss.inverse_link_function(baseline_prediction),
baseline_prediction)
def test_baseline_least_absolute_deviation():
rng = np.random.RandomState(0)
loss = _LOSSES['least_absolute_deviation'](sample_weight=None)
y_train = rng.normal(size=100)
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert baseline_prediction.shape == tuple() # scalar
assert baseline_prediction.dtype == y_train.dtype
# Make sure baseline prediction is the median of all targets
assert np.allclose(loss.inverse_link_function(baseline_prediction),
baseline_prediction)
assert baseline_prediction == pytest.approx(np.median(y_train))
def test_baseline_poisson():
rng = np.random.RandomState(0)
loss = _LOSSES['poisson'](sample_weight=None)
y_train = rng.poisson(size=100).astype(np.float64)
# Sanity check, make sure at least one sample is non-zero so we don't take
# log(0)
assert y_train.sum() > 0
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert np.isscalar(baseline_prediction)
assert baseline_prediction.dtype == y_train.dtype
assert_all_finite(baseline_prediction)
# Make sure baseline prediction produces the log of the mean of all targets
assert_almost_equal(np.log(y_train.mean()), baseline_prediction)
# Test baseline for y_true = 0
y_train.fill(0.)
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert_all_finite(baseline_prediction)
def test_baseline_binary_crossentropy():
rng = np.random.RandomState(0)
loss = _LOSSES['binary_crossentropy'](sample_weight=None)
for y_train in (np.zeros(shape=100), np.ones(shape=100)):
y_train = y_train.astype(np.float64)
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert_all_finite(baseline_prediction)
assert np.allclose(loss.inverse_link_function(baseline_prediction),
y_train[0])
# Make sure baseline prediction is equal to link_function(p), where p
# is the proba of the positive class. We want predict_proba() to return p,
# and by definition
# p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction)
# So we want raw_prediction = link_function(p) = log(p / (1 - p))
y_train = rng.randint(0, 2, size=100).astype(np.float64)
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
assert baseline_prediction.shape == tuple() # scalar
assert baseline_prediction.dtype == y_train.dtype
p = y_train.mean()
assert np.allclose(baseline_prediction, np.log(p / (1 - p)))
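# Illustrative sketch (not part of the original test suite): the log-odds
# baseline checked above, written out directly. The helper name is
# hypothetical and it reuses the module-level numpy import.
def _sketch_log_odds_baseline(y):
    p = y.mean()                       # proportion of the positive class
    raw = np.log(p / (1 - p))          # link_function(p), i.e. logit(p)
    sigmoid = 1 / (1 + np.exp(-raw))   # inverse_link_function(raw)
    assert np.isclose(sigmoid, p)
    return raw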
def test_baseline_categorical_crossentropy():
rng = np.random.RandomState(0)
prediction_dim = 4
loss = _LOSSES['categorical_crossentropy'](sample_weight=None)
for y_train in (np.zeros(shape=100), np.ones(shape=100)):
y_train = y_train.astype(np.float64)
baseline_prediction = loss.get_baseline_prediction(y_train, None,
prediction_dim)
assert baseline_prediction.dtype == y_train.dtype
assert_all_finite(baseline_prediction)
# Same logic as for above test. Here inverse_link_function = softmax and
# link_function = log
y_train = rng.randint(0, prediction_dim + 1, size=100).astype(np.float32)
baseline_prediction = loss.get_baseline_prediction(y_train, None,
prediction_dim)
assert baseline_prediction.shape == (prediction_dim, 1)
for k in range(prediction_dim):
p = (y_train == k).mean()
assert np.allclose(baseline_prediction[k, :], np.log(p))
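# Illustrative sketch (not part of the original test suite): applying a
# softmax to the log of the class proportions recovers the proportions
# themselves, which is why the categorical baseline above is log(p_k).
# The helper name is hypothetical; it assumes every class appears in y
# (so no log(0)) and reuses the module-level numpy import.
def _sketch_softmax_baseline(y, n_classes):
    proportions = np.array([(y == k).mean() for k in range(n_classes)])
    raw = np.log(proportions)                  # per-class baseline
    softmax = np.exp(raw) / np.exp(raw).sum()  # inverse link
    assert np.allclose(softmax, proportions)
    return raw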
@pytest.mark.parametrize('loss, problem', [
('least_squares', 'regression'),
('least_absolute_deviation', 'regression'),
('binary_crossentropy', 'classification'),
('categorical_crossentropy', 'classification'),
('poisson', 'poisson_regression'),
])
@pytest.mark.parametrize('sample_weight', ['ones', 'random'])
def test_sample_weight_multiplies_gradients(loss, problem, sample_weight):
# Make sure that passing sample weights to the gradient and hessians
# computation methods is equivalent to multiplying by the weights.
rng = np.random.RandomState(42)
n_samples = 1000
if loss == 'categorical_crossentropy':
n_classes = prediction_dim = 3
else:
n_classes = prediction_dim = 1
if problem == 'regression':
y_true = rng.normal(size=n_samples).astype(Y_DTYPE)
elif problem == 'poisson_regression':
y_true = rng.poisson(size=n_samples).astype(Y_DTYPE)
else:
y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE)
if sample_weight == 'ones':
sample_weight = np.ones(shape=n_samples, dtype=Y_DTYPE)
else:
sample_weight = rng.normal(size=n_samples).astype(Y_DTYPE)
loss_ = _LOSSES[loss](sample_weight=sample_weight)
baseline_prediction = loss_.get_baseline_prediction(
y_true, None, prediction_dim
)
raw_predictions = np.zeros(shape=(prediction_dim, n_samples),
dtype=baseline_prediction.dtype)
raw_predictions += baseline_prediction
gradients = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE)
hessians = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE)
loss_.update_gradients_and_hessians(gradients, hessians, y_true,
raw_predictions, None)
gradients_sw = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE)
hessians_sw = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE)
loss_.update_gradients_and_hessians(gradients_sw, hessians_sw, y_true,
raw_predictions, sample_weight)
assert np.allclose(gradients * sample_weight, gradients_sw)
assert np.allclose(hessians * sample_weight, hessians_sw)
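# Illustrative sketch (not part of the original test suite): weighting the
# pointwise least-squares loss by w scales its gradient by w, which is the
# property the test above checks for every loss. Names are hypothetical.
def _sketch_weighted_gradient(y=1.0, raw=0.25, w=3.0, eps=1e-6):
    def weighted_loss(r):
        return 0.5 * w * (y - r) ** 2
    numerical = (weighted_loss(raw + eps / 2)
                 - weighted_loss(raw - eps / 2)) / eps
    analytical = w * (raw - y)  # weight times the unweighted gradient
    assert abs(numerical - analytical) < 1e-6
    return analytical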
def test_init_gradient_and_hessians_sample_weight():
# Make sure that passing sample_weight to a loss correctly influences the
# hessians_are_constant attribute, and consequently the shape of the
# hessians array.
prediction_dim = 2
n_samples = 5
sample_weight = None
loss = _LOSSES['least_squares'](sample_weight=sample_weight)
_, hessians = loss.init_gradients_and_hessians(
n_samples=n_samples, prediction_dim=prediction_dim,
sample_weight=None)
assert loss.hessians_are_constant
assert hessians.shape == (1, 1)
sample_weight = np.ones(n_samples)
loss = _LOSSES['least_squares'](sample_weight=sample_weight)
_, hessians = loss.init_gradients_and_hessians(
n_samples=n_samples, prediction_dim=prediction_dim,
sample_weight=sample_weight)
assert not loss.hessians_are_constant
assert hessians.shape == (prediction_dim, n_samples)

View file

@@ -0,0 +1,341 @@
import numpy as np
import pytest
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint
from sklearn.ensemble._hist_gradient_boosting.splitting import (
Splitter,
compute_node_value
)
from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
def is_increasing(a):
return (np.diff(a) >= 0.0).all()
def is_decreasing(a):
return (np.diff(a) <= 0.0).all()
def assert_leaves_values_monotonic(predictor, monotonic_cst):
# make sure leaves values (from left to right) are either all increasing
# or all decreasing (or neither) depending on the monotonic constraint.
nodes = predictor.nodes
def get_leaves_values():
"""get leaves values from left to right"""
values = []
def depth_first_collect_leaf_values(node_idx):
node = nodes[node_idx]
if node['is_leaf']:
values.append(node['value'])
return
depth_first_collect_leaf_values(node['left'])
depth_first_collect_leaf_values(node['right'])
depth_first_collect_leaf_values(0) # start at root (0)
return values
values = get_leaves_values()
if monotonic_cst == MonotonicConstraint.NO_CST:
# some increasing, some decreasing
assert not is_increasing(values) and not is_decreasing(values)
elif monotonic_cst == MonotonicConstraint.POS:
# all increasing
assert is_increasing(values)
else: # NEG
# all decreasing
assert is_decreasing(values)
def assert_children_values_monotonic(predictor, monotonic_cst):
# Make sure siblings values respect the monotonic constraints. Left should
# be lower (resp greater) than right child if constraint is POS (resp.
# NEG).
# Note that this property alone isn't enough to ensure full monotonicity,
    # since we also need to guarantee that all the descendants of the left
    # child won't be greater (resp. lower) than the right child, or its
    # descendants. That's why we need to bound the predicted values (this is
# tested in assert_children_values_bounded)
nodes = predictor.nodes
left_lower = []
left_greater = []
for node in nodes:
if node['is_leaf']:
continue
left_idx = node['left']
right_idx = node['right']
if nodes[left_idx]['value'] < nodes[right_idx]['value']:
left_lower.append(node)
elif nodes[left_idx]['value'] > nodes[right_idx]['value']:
left_greater.append(node)
if monotonic_cst == MonotonicConstraint.NO_CST:
assert left_lower and left_greater
elif monotonic_cst == MonotonicConstraint.POS:
assert left_lower and not left_greater
else: # NEG
assert not left_lower and left_greater
def assert_children_values_bounded(grower, monotonic_cst):
# Make sure that the values of the children of a node are bounded by the
# middle value between that node and its sibling (if there is a monotonic
# constraint).
# As a bonus, we also check that the siblings values are properly ordered
# which is slightly redundant with assert_children_values_monotonic (but
# this check is done on the grower nodes whereas
# assert_children_values_monotonic is done on the predictor nodes)
if monotonic_cst == MonotonicConstraint.NO_CST:
return
def recursively_check_children_node_values(node):
if node.is_leaf:
return
if node is not grower.root and node is node.parent.left_child:
sibling = node.sibling # on the right
middle = (node.value + sibling.value) / 2
if monotonic_cst == MonotonicConstraint.POS:
assert (node.left_child.value <=
node.right_child.value <=
middle)
if not sibling.is_leaf:
assert (middle <=
sibling.left_child.value <=
sibling.right_child.value)
else: # NEG
assert (node.left_child.value >=
node.right_child.value >=
middle)
if not sibling.is_leaf:
assert (middle >=
sibling.left_child.value >=
sibling.right_child.value)
recursively_check_children_node_values(node.left_child)
recursively_check_children_node_values(node.right_child)
recursively_check_children_node_values(grower.root)
@pytest.mark.parametrize('seed', range(3))
@pytest.mark.parametrize('monotonic_cst', (
MonotonicConstraint.NO_CST,
MonotonicConstraint.POS,
MonotonicConstraint.NEG,
))
def test_nodes_values(monotonic_cst, seed):
# Build a single tree with only one feature, and make sure the nodes
# values respect the monotonic constraints.
# Considering the following tree with a monotonic POS constraint, we
# should have:
#
# root
# / \
# 5 10 # middle = 7.5
# / \ / \
# a b c d
#
# a <= b and c <= d (assert_children_values_monotonic)
# a, b <= middle <= c, d (assert_children_values_bounded)
# a <= b <= c <= d (assert_leaves_values_monotonic)
#
# The last one is a consequence of the others, but can't hurt to check
rng = np.random.RandomState(seed)
n_samples = 1000
n_features = 1
X_binned = rng.randint(0, 255, size=(n_samples, n_features),
dtype=np.uint8)
X_binned = np.asfortranarray(X_binned)
gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
hessians = np.ones(shape=1, dtype=G_H_DTYPE)
grower = TreeGrower(X_binned, gradients, hessians,
monotonic_cst=[monotonic_cst],
shrinkage=.1)
grower.grow()
# grow() will shrink the leaves values at the very end. For our comparison
# tests, we need to revert the shrinkage of the leaves, else we would
# compare the value of a leaf (shrunk) with a node (not shrunk) and the
# test would not be correct.
    for leaf in grower.finalized_leaves:
        leaf.value /= grower.shrinkage
# The consistency of the bounds can only be checked on the tree grower
# as the node bounds are not copied into the predictor tree. The
# consistency checks on the values of node children and leaves can be
# done either on the grower tree or on the predictor tree. We only
# do those checks on the predictor tree as the latter is derived from
# the former.
predictor = grower.make_predictor()
assert_children_values_monotonic(predictor, monotonic_cst)
assert_children_values_bounded(grower, monotonic_cst)
assert_leaves_values_monotonic(predictor, monotonic_cst)
@pytest.mark.parametrize('seed', range(3))
def test_predictions(seed):
# Train a model with a POS constraint on the first feature and a NEG
# constraint on the second feature, and make sure the constraints are
# respected by checking the predictions.
# test adapted from lightgbm's test_monotone_constraint(), itself inspired
# by https://xgboost.readthedocs.io/en/latest/tutorials/monotonic.html
rng = np.random.RandomState(seed)
n_samples = 1000
f_0 = rng.rand(n_samples) # positive correlation with y
    f_1 = rng.rand(n_samples) # negative correlation with y
X = np.c_[f_0, f_1]
noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
y = (5 * f_0 + np.sin(10 * np.pi * f_0) -
5 * f_1 - np.cos(10 * np.pi * f_1) +
noise)
gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1])
gbdt.fit(X, y)
linspace = np.linspace(0, 1, 100)
sin = np.sin(linspace)
constant = np.full_like(linspace, fill_value=.5)
# We now assert the predictions properly respect the constraints, on each
# feature. When testing for a feature we need to set the other one to a
    # constant, because the monotonic constraints are only an "all else being
    # equal" type of constraint:
    # a constraint on the first feature only means that
    # x0 < x0' => f(x0, x1) < f(x0', x1)
    # while x1 stays constant.
    # The constraint does not guarantee that
# x0 < x0' => f(x0, x1) < f(x0', x1')
# First feature (POS)
# assert pred is all increasing when f_0 is all increasing
X = np.c_[linspace, constant]
pred = gbdt.predict(X)
assert is_increasing(pred)
# assert pred actually follows the variations of f_0
X = np.c_[sin, constant]
pred = gbdt.predict(X)
assert np.all((np.diff(pred) >= 0) == (np.diff(sin) >= 0))
# Second feature (NEG)
# assert pred is all decreasing when f_1 is all increasing
X = np.c_[constant, linspace]
pred = gbdt.predict(X)
assert is_decreasing(pred)
# assert pred actually follows the inverse variations of f_1
X = np.c_[constant, sin]
pred = gbdt.predict(X)
assert ((np.diff(pred) <= 0) == (np.diff(sin) >= 0)).all()
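# Illustrative sketch (not part of the original test suite): a generic way
# to probe the "all else being equal" property described above for any
# fitted regressor: vary one feature on a grid while holding the others at
# a constant value, then inspect the sign of the prediction differences.
# The helper name and its arguments are hypothetical.
def _sketch_probe_monotonicity(model, feature_idx, n_features, n_points=100):
    grid = np.linspace(0, 1, n_points)
    X_probe = np.full((n_points, n_features), 0.5)  # keep other features fixed
    X_probe[:, feature_idx] = grid                  # vary only one feature
    return np.diff(model.predict(X_probe))
# For a POS constraint on feature 0 one would expect, e.g.,
# (_sketch_probe_monotonicity(gbdt, 0, 2) >= 0).all() to hold.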
def test_input_error():
X = [[1, 2], [2, 3], [3, 4]]
y = [0, 1, 2]
gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, 0, -1])
with pytest.raises(ValueError,
match='monotonic_cst has shape 3 but the input data'):
gbdt.fit(X, y)
for monotonic_cst in ([1, 3], [1, -3]):
gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
with pytest.raises(ValueError,
match='must be None or an array-like of '
'-1, 0 or 1'):
gbdt.fit(X, y)
gbdt = HistGradientBoostingClassifier(monotonic_cst=[0, 1])
with pytest.raises(
ValueError,
match='monotonic constraints are not supported '
'for multiclass classification'
):
gbdt.fit(X, y)
def test_bounded_value_min_gain_to_split():
# The purpose of this test is to show that when computing the gain at a
# given split, the value of the current node should be properly bounded to
# respect the monotonic constraints, because it strongly interacts with
# min_gain_to_split. We build a simple example where gradients are [1, 1,
# 100, 1, 1] (hessians are all ones). The best split happens on the 3rd
# bin, and depending on whether the value of the node is bounded or not,
# the min_gain_to_split constraint is or isn't satisfied.
l2_regularization = 0
min_hessian_to_split = 0
min_samples_leaf = 1
n_bins = n_samples = 5
X_binned = np.arange(n_samples).reshape(-1, 1).astype(X_BINNED_DTYPE)
sample_indices = np.arange(n_samples, dtype=np.uint32)
all_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
all_gradients = np.array([1, 1, 100, 1, 1], dtype=G_H_DTYPE)
sum_gradients = all_gradients.sum()
sum_hessians = all_hessians.sum()
hessians_are_constant = False
builder = HistogramBuilder(X_binned, n_bins, all_gradients,
all_hessians, hessians_are_constant)
n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1],
dtype=np.uint32)
has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
missing_values_bin_idx = n_bins - 1
children_lower_bound, children_upper_bound = -np.inf, np.inf
min_gain_to_split = 2000
splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx,
has_missing_values, monotonic_cst, l2_regularization,
min_hessian_to_split, min_samples_leaf,
min_gain_to_split, hessians_are_constant)
histograms = builder.compute_histograms_brute(sample_indices)
# Since the gradient array is [1, 1, 100, 1, 1]
# the max possible gain happens on the 3rd bin (or equivalently in the 2nd)
    # and is equal to about 1307, which is less than min_gain_to_split = 2000,
    # so the node is considered unsplittable (gain = -1)
current_lower_bound, current_upper_bound = -np.inf, np.inf
value = compute_node_value(sum_gradients, sum_hessians,
current_lower_bound, current_upper_bound,
l2_regularization)
# the unbounded value is equal to -sum_gradients / sum_hessians
assert value == pytest.approx(-104 / 5)
split_info = splitter.find_node_split(n_samples, histograms,
sum_gradients, sum_hessians, value,
lower_bound=children_lower_bound,
upper_bound=children_upper_bound)
assert split_info.gain == -1 # min_gain_to_split not respected
# here again the max possible gain is on the 3rd bin but we now cap the
# value of the node into [-10, inf].
# This means the gain is now about 2430 which is more than the
# min_gain_to_split constraint.
current_lower_bound, current_upper_bound = -10, np.inf
value = compute_node_value(sum_gradients, sum_hessians,
current_lower_bound, current_upper_bound,
l2_regularization)
assert value == -10
split_info = splitter.find_node_split(n_samples, histograms,
sum_gradients, sum_hessians, value,
lower_bound=children_lower_bound,
upper_bound=children_upper_bound)
assert split_info.gain > min_gain_to_split
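# Illustrative sketch (not part of the original test suite): a back-of-the-
# envelope computation of the unbounded gain quoted above (~1307) for
# gradients [1, 1, 100, 1, 1] with unit hessians and no regularization.
# It assumes the gain has the usual form
# G_L**2 / H_L + G_R**2 / H_R - G**2 / H, which matches the number in the
# comment but is only a sketch of the bounded, regularized computation
# performed by the splitter.
def _sketch_unbounded_split_gain():
    g_left, h_left = 1 + 1 + 100, 3.0   # bins 0-2 go to the left child
    g_right, h_right = 1 + 1, 2.0       # bins 3-4 go to the right child
    g, h = g_left + g_right, h_left + h_right
    gain = g_left ** 2 / h_left + g_right ** 2 / h_right - g ** 2 / h
    assert abs(gain - 1306.8) < 1e-6
    return gain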

View file

@@ -0,0 +1,76 @@
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import pytest
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor
from sklearn.ensemble._hist_gradient_boosting.common import (
G_H_DTYPE, PREDICTOR_RECORD_DTYPE, ALMOST_INF)
@pytest.mark.parametrize('n_bins', [200, 256])
def test_regression_dataset(n_bins):
X, y = make_regression(n_samples=500, n_features=10, n_informative=5,
random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=42)
mapper = _BinMapper(n_bins=n_bins, random_state=42)
X_train_binned = mapper.fit_transform(X_train)
# Init gradients and hessians to that of least squares loss
gradients = -y_train.astype(G_H_DTYPE)
hessians = np.ones(1, dtype=G_H_DTYPE)
min_samples_leaf = 10
max_leaf_nodes = 30
grower = TreeGrower(X_train_binned, gradients, hessians,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes, n_bins=n_bins,
n_bins_non_missing=mapper.n_bins_non_missing_)
grower.grow()
predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)
assert r2_score(y_train, predictor.predict(X_train)) > 0.82
assert r2_score(y_test, predictor.predict(X_test)) > 0.67
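# Illustrative sketch (not part of the original test): with raw predictions
# initialized at zero, the least-squares gradient (raw - y) reduces to -y,
# which is what "gradients = -y_train" above encodes. The helper name is
# hypothetical; it reuses the module-level numpy import.
def _sketch_initial_least_squares_gradients(y):
    raw = np.zeros_like(y)  # boosting starts here from a zero raw prediction
    gradients = raw - y     # derivative of .5 * (y - raw)**2 w.r.t. raw
    assert np.allclose(gradients, -y)
    return gradients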
@pytest.mark.parametrize('threshold, expected_predictions', [
(-np.inf, [0, 1, 1, 1]),
(10, [0, 0, 1, 1]),
(20, [0, 0, 0, 1]),
(ALMOST_INF, [0, 0, 0, 1]),
(np.inf, [0, 0, 0, 0]),
])
def test_infinite_values_and_thresholds(threshold, expected_predictions):
# Make sure infinite values and infinite thresholds are handled properly.
# In particular, if a value is +inf and the threshold is ALMOST_INF the
# sample should go to the right child. If the threshold is inf (split on
# nan), the +inf sample will go to the left child.
X = np.array([-np.inf, 10, 20, np.inf]).reshape(-1, 1)
nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE)
# We just construct a simple tree with 1 root and 2 children
# parent node
nodes[0]['left'] = 1
nodes[0]['right'] = 2
nodes[0]['feature_idx'] = 0
nodes[0]['threshold'] = threshold
# left child
nodes[1]['is_leaf'] = True
nodes[1]['value'] = 0
# right child
nodes[2]['is_leaf'] = True
nodes[2]['value'] = 1
predictor = TreePredictor(nodes)
predictions = predictor.predict(X)
assert np.all(predictions == expected_predictions)
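# Illustrative sketch (not part of the original test): the decision rule
# exercised above. A sample goes to the left child when its value is
# <= the node threshold, so +inf only goes left when the threshold is
# itself +inf (the "split on nan" case). The helper name is hypothetical.
def _sketch_goes_left(value, threshold):
    return value <= threshold
# For instance _sketch_goes_left(np.inf, ALMOST_INF) is False (right child),
# while _sketch_goes_left(np.inf, np.inf) is True (left child).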

View file

@@ -0,0 +1,480 @@
import numpy as np
import pytest
from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint
from sklearn.ensemble._hist_gradient_boosting.splitting import (
Splitter,
compute_node_value
)
from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder
from sklearn.utils._testing import skip_if_32bit
@pytest.mark.parametrize('n_bins', [3, 32, 256])
def test_histogram_split(n_bins):
rng = np.random.RandomState(42)
feature_idx = 0
l2_regularization = 0
min_hessian_to_split = 1e-3
min_samples_leaf = 1
min_gain_to_split = 0.
X_binned = np.asfortranarray(
rng.randint(0, n_bins - 1, size=(int(1e4), 1)), dtype=X_BINNED_DTYPE)
binned_feature = X_binned.T[feature_idx]
sample_indices = np.arange(binned_feature.shape[0], dtype=np.uint32)
ordered_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE)
all_hessians = ordered_hessians
sum_hessians = all_hessians.sum()
hessians_are_constant = False
for true_bin in range(1, n_bins - 2):
for sign in [-1, 1]:
ordered_gradients = np.full_like(binned_feature, sign,
dtype=G_H_DTYPE)
ordered_gradients[binned_feature <= true_bin] *= -1
all_gradients = ordered_gradients
sum_gradients = all_gradients.sum()
builder = HistogramBuilder(X_binned,
n_bins,
all_gradients,
all_hessians,
hessians_are_constant)
n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1],
dtype=np.uint32)
has_missing_values = np.array([False] * X_binned.shape[1],
dtype=np.uint8)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
missing_values_bin_idx = n_bins - 1
splitter = Splitter(X_binned,
n_bins_non_missing,
missing_values_bin_idx,
has_missing_values,
monotonic_cst,
l2_regularization,
min_hessian_to_split,
min_samples_leaf, min_gain_to_split,
hessians_are_constant)
histograms = builder.compute_histograms_brute(sample_indices)
value = compute_node_value(sum_gradients, sum_hessians,
-np.inf, np.inf, l2_regularization)
split_info = splitter.find_node_split(
sample_indices.shape[0], histograms, sum_gradients,
sum_hessians, value)
assert split_info.bin_idx == true_bin
assert split_info.gain >= 0
assert split_info.feature_idx == feature_idx
assert (split_info.n_samples_left + split_info.n_samples_right
== sample_indices.shape[0])
# Constant hessian: 1. per sample.
assert split_info.n_samples_left == split_info.sum_hessian_left
@skip_if_32bit
@pytest.mark.parametrize('constant_hessian', [True, False])
def test_gradient_and_hessian_sanity(constant_hessian):
# This test checks that the values of gradients and hessians are
# consistent in different places:
# - in split_info: si.sum_gradient_left + si.sum_gradient_right must be
# equal to the gradient at the node. Same for hessians.
# - in the histograms: summing 'sum_gradients' over the bins must be
# constant across all features, and those sums must be equal to the
# node's gradient. Same for hessians.
rng = np.random.RandomState(42)
n_bins = 10
n_features = 20
n_samples = 500
l2_regularization = 0.
min_hessian_to_split = 1e-3
min_samples_leaf = 1
min_gain_to_split = 0.
X_binned = rng.randint(0, n_bins, size=(n_samples, n_features),
dtype=X_BINNED_DTYPE)
X_binned = np.asfortranarray(X_binned)
sample_indices = np.arange(n_samples, dtype=np.uint32)
all_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
sum_gradients = all_gradients.sum()
if constant_hessian:
all_hessians = np.ones(1, dtype=G_H_DTYPE)
sum_hessians = 1 * n_samples
else:
all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
sum_hessians = all_hessians.sum()
builder = HistogramBuilder(X_binned, n_bins, all_gradients,
all_hessians, constant_hessian)
n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1],
dtype=np.uint32)
has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
missing_values_bin_idx = n_bins - 1
splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx,
has_missing_values, monotonic_cst, l2_regularization,
min_hessian_to_split, min_samples_leaf,
min_gain_to_split, constant_hessian)
hists_parent = builder.compute_histograms_brute(sample_indices)
value_parent = compute_node_value(sum_gradients, sum_hessians,
-np.inf, np.inf, l2_regularization)
si_parent = splitter.find_node_split(n_samples, hists_parent,
sum_gradients, sum_hessians,
value_parent)
sample_indices_left, sample_indices_right, _ = splitter.split_indices(
si_parent, sample_indices)
hists_left = builder.compute_histograms_brute(sample_indices_left)
value_left = compute_node_value(si_parent.sum_gradient_left,
si_parent.sum_hessian_left,
-np.inf, np.inf, l2_regularization)
hists_right = builder.compute_histograms_brute(sample_indices_right)
value_right = compute_node_value(si_parent.sum_gradient_right,
si_parent.sum_hessian_right,
-np.inf, np.inf, l2_regularization)
si_left = splitter.find_node_split(n_samples, hists_left,
si_parent.sum_gradient_left,
si_parent.sum_hessian_left,
value_left)
si_right = splitter.find_node_split(n_samples, hists_right,
si_parent.sum_gradient_right,
si_parent.sum_hessian_right,
value_right)
# make sure that si.sum_gradient_left + si.sum_gradient_right have their
# expected value, same for hessians
for si, indices in (
(si_parent, sample_indices),
(si_left, sample_indices_left),
(si_right, sample_indices_right)):
gradient = si.sum_gradient_right + si.sum_gradient_left
expected_gradient = all_gradients[indices].sum()
hessian = si.sum_hessian_right + si.sum_hessian_left
if constant_hessian:
expected_hessian = indices.shape[0] * all_hessians[0]
else:
expected_hessian = all_hessians[indices].sum()
assert np.isclose(gradient, expected_gradient)
assert np.isclose(hessian, expected_hessian)
# make sure sum of gradients in histograms are the same for all features,
# and make sure they're equal to their expected value
hists_parent = np.asarray(hists_parent, dtype=HISTOGRAM_DTYPE)
hists_left = np.asarray(hists_left, dtype=HISTOGRAM_DTYPE)
hists_right = np.asarray(hists_right, dtype=HISTOGRAM_DTYPE)
for hists, indices in (
(hists_parent, sample_indices),
(hists_left, sample_indices_left),
(hists_right, sample_indices_right)):
# note: gradients and hessians have shape (n_features,),
# we're comparing them to *scalars*. This has the benefit of also
# making sure that all the entries are equal across features.
gradients = hists['sum_gradients'].sum(axis=1) # shape = (n_features,)
expected_gradient = all_gradients[indices].sum() # scalar
hessians = hists['sum_hessians'].sum(axis=1)
if constant_hessian:
# 0 is not the actual hessian, but it's not computed in this case
expected_hessian = 0.
else:
expected_hessian = all_hessians[indices].sum()
assert np.allclose(gradients, expected_gradient)
assert np.allclose(hessians, expected_hessian)
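# Illustrative sketch (not part of the original test suite): per-feature
# histograms accumulate the same per-sample gradients into different bins,
# so summing a histogram over its bins gives the node gradient regardless
# of the feature, which is the property checked above. All names are
# hypothetical; binned features are assumed to be small non-negative ints.
def _sketch_histogram_sums(gradients, binned_feature_a, binned_feature_b):
    hist_a = np.bincount(binned_feature_a, weights=gradients)
    hist_b = np.bincount(binned_feature_b, weights=gradients)
    assert np.isclose(hist_a.sum(), gradients.sum())
    assert np.isclose(hist_a.sum(), hist_b.sum())
    return hist_a, hist_b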
def test_split_indices():
# Check that split_indices returns the correct splits and that
# splitter.partition is consistent with what is returned.
rng = np.random.RandomState(421)
n_bins = 5
n_samples = 10
l2_regularization = 0.
min_hessian_to_split = 1e-3
min_samples_leaf = 1
min_gain_to_split = 0.
# split will happen on feature 1 and on bin 3
X_binned = [[0, 0],
[0, 3],
[0, 4],
[0, 0],
[0, 0],
[0, 0],
[0, 0],
[0, 4],
[0, 0],
[0, 4]]
X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE)
sample_indices = np.arange(n_samples, dtype=np.uint32)
all_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
all_hessians = np.ones(1, dtype=G_H_DTYPE)
sum_gradients = all_gradients.sum()
sum_hessians = 1 * n_samples
hessians_are_constant = True
builder = HistogramBuilder(X_binned, n_bins,
all_gradients, all_hessians,
hessians_are_constant)
n_bins_non_missing = np.array([n_bins] * X_binned.shape[1],
dtype=np.uint32)
has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
missing_values_bin_idx = n_bins - 1
splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx,
has_missing_values, monotonic_cst, l2_regularization,
min_hessian_to_split, min_samples_leaf,
min_gain_to_split, hessians_are_constant)
assert np.all(sample_indices == splitter.partition)
histograms = builder.compute_histograms_brute(sample_indices)
value = compute_node_value(sum_gradients, sum_hessians,
-np.inf, np.inf, l2_regularization)
si_root = splitter.find_node_split(n_samples, histograms,
sum_gradients, sum_hessians, value)
# sanity checks for best split
assert si_root.feature_idx == 1
assert si_root.bin_idx == 3
samples_left, samples_right, position_right = splitter.split_indices(
si_root, splitter.partition)
assert set(samples_left) == set([0, 1, 3, 4, 5, 6, 8])
assert set(samples_right) == set([2, 7, 9])
assert list(samples_left) == list(splitter.partition[:position_right])
assert list(samples_right) == list(splitter.partition[position_right:])
# Check that the resulting split indices sizes are consistent with the
# count statistics anticipated when looking for the best split.
assert samples_left.shape[0] == si_root.n_samples_left
assert samples_right.shape[0] == si_root.n_samples_right
def test_min_gain_to_split():
# Try to split a pure node (all gradients are equal, same for hessians)
# with min_gain_to_split = 0 and make sure that the node is not split (best
# possible gain = -1). Note: before the strict inequality comparison, this
# test would fail because the node would be split with a gain of 0.
rng = np.random.RandomState(42)
l2_regularization = 0
min_hessian_to_split = 0
min_samples_leaf = 1
min_gain_to_split = 0.
n_bins = 255
n_samples = 100
X_binned = np.asfortranarray(
rng.randint(0, n_bins, size=(n_samples, 1)), dtype=X_BINNED_DTYPE)
binned_feature = X_binned[:, 0]
sample_indices = np.arange(n_samples, dtype=np.uint32)
all_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE)
all_gradients = np.ones_like(binned_feature, dtype=G_H_DTYPE)
sum_gradients = all_gradients.sum()
sum_hessians = all_hessians.sum()
hessians_are_constant = False
builder = HistogramBuilder(X_binned, n_bins, all_gradients,
all_hessians, hessians_are_constant)
n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1],
dtype=np.uint32)
has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
missing_values_bin_idx = n_bins - 1
splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx,
has_missing_values, monotonic_cst, l2_regularization,
min_hessian_to_split, min_samples_leaf,
min_gain_to_split, hessians_are_constant)
histograms = builder.compute_histograms_brute(sample_indices)
value = compute_node_value(sum_gradients, sum_hessians,
-np.inf, np.inf, l2_regularization)
split_info = splitter.find_node_split(n_samples, histograms,
sum_gradients, sum_hessians, value)
assert split_info.gain == -1
@pytest.mark.parametrize(
'X_binned, all_gradients, has_missing_values, n_bins_non_missing, '
' expected_split_on_nan, expected_bin_idx, expected_go_to_left', [
# basic sanity check with no missing values: given the gradient
# values, the split must occur on bin_idx=3
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], # X_binned
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5], # gradients
False, # no missing values
10, # n_bins_non_missing
False, # don't split on nans
3, # expected_bin_idx
'not_applicable'),
# We replace 2 samples by NaNs (bin_idx=8)
# These 2 samples were mapped to the left node before, so they should
# be mapped to left node again
# Notice how the bin_idx threshold changes from 3 to 1.
([8, 0, 1, 8, 2, 3, 4, 5, 6, 7], # 8 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
8, # n_bins_non_missing
False, # don't split on nans
1, # cut on bin_idx=1
True), # missing values go to left
# same as above, but with non-consecutive missing_values_bin
([9, 0, 1, 9, 2, 3, 4, 5, 6, 7], # 9 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
8, # n_bins_non_missing
False, # don't split on nans
1, # cut on bin_idx=1
True), # missing values go to left
# this time replacing 2 samples that were on the right.
([0, 1, 2, 3, 8, 4, 8, 5, 6, 7], # 8 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
8, # n_bins_non_missing
False, # don't split on nans
3, # cut on bin_idx=3 (like in first case)
False), # missing values go to right
# same as above, but with non-consecutive missing_values_bin
([0, 1, 2, 3, 9, 4, 9, 5, 6, 7], # 9 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
8, # n_bins_non_missing
False, # don't split on nans
3, # cut on bin_idx=3 (like in first case)
False), # missing values go to right
# For the following cases, split_on_nans is True (we replace all of
# the samples with nans, instead of just 2).
([0, 1, 2, 3, 4, 4, 4, 4, 4, 4], # 4 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
4, # n_bins_non_missing
True, # split on nans
3, # cut on bin_idx=3
False), # missing values go to right
# same as above, but with non-consecutive missing_values_bin
([0, 1, 2, 3, 9, 9, 9, 9, 9, 9], # 9 <=> missing
[1, 1, 1, 1, 1, 1, 5, 5, 5, 5],
True, # missing values
4, # n_bins_non_missing
True, # split on nans
3, # cut on bin_idx=3
False), # missing values go to right
([6, 6, 6, 6, 0, 1, 2, 3, 4, 5], # 6 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
6, # n_bins_non_missing
True, # split on nans
5, # cut on bin_idx=5
False), # missing values go to right
# same as above, but with non-consecutive missing_values_bin
([9, 9, 9, 9, 0, 1, 2, 3, 4, 5], # 9 <=> missing
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
True, # missing values
6, # n_bins_non_missing
True, # split on nans
5, # cut on bin_idx=5
False), # missing values go to right
]
)
def test_splitting_missing_values(X_binned, all_gradients,
has_missing_values, n_bins_non_missing,
expected_split_on_nan, expected_bin_idx,
expected_go_to_left):
# Make sure missing values are properly supported.
# we build an artificial example with gradients such that the best split
# is on bin_idx=3, when there are no missing values.
# Then we introduce missing values and:
# - make sure the chosen bin is correct (find_best_bin()): it's
# still the same split, even though the index of the bin may change
# - make sure the missing values are mapped to the correct child
# (split_indices())
n_bins = max(X_binned) + 1
n_samples = len(X_binned)
l2_regularization = 0.
min_hessian_to_split = 1e-3
min_samples_leaf = 1
min_gain_to_split = 0.
sample_indices = np.arange(n_samples, dtype=np.uint32)
X_binned = np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1)
X_binned = np.asfortranarray(X_binned)
all_gradients = np.array(all_gradients, dtype=G_H_DTYPE)
has_missing_values = np.array([has_missing_values], dtype=np.uint8)
all_hessians = np.ones(1, dtype=G_H_DTYPE)
sum_gradients = all_gradients.sum()
sum_hessians = 1 * n_samples
hessians_are_constant = True
builder = HistogramBuilder(X_binned, n_bins,
all_gradients, all_hessians,
hessians_are_constant)
n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
dtype=np.int8)
missing_values_bin_idx = n_bins - 1
splitter = Splitter(X_binned, n_bins_non_missing,
missing_values_bin_idx, has_missing_values,
monotonic_cst,
l2_regularization, min_hessian_to_split,
min_samples_leaf, min_gain_to_split,
hessians_are_constant)
histograms = builder.compute_histograms_brute(sample_indices)
value = compute_node_value(sum_gradients, sum_hessians,
-np.inf, np.inf, l2_regularization)
split_info = splitter.find_node_split(n_samples, histograms,
sum_gradients, sum_hessians, value)
assert split_info.bin_idx == expected_bin_idx
if has_missing_values:
assert split_info.missing_go_to_left == expected_go_to_left
split_on_nan = split_info.bin_idx == n_bins_non_missing[0] - 1
assert split_on_nan == expected_split_on_nan
# Make sure the split is properly computed.
# This also make sure missing values are properly assigned to the correct
# child in split_indices()
samples_left, samples_right, _ = splitter.split_indices(
split_info, splitter.partition)
if not expected_split_on_nan:
# When we don't split on nans, the split should always be the same.
assert set(samples_left) == set([0, 1, 2, 3])
assert set(samples_right) == set([4, 5, 6, 7, 8, 9])
else:
# When we split on nans, samples with missing values are always mapped
# to the right child.
missing_samples_indices = np.flatnonzero(
np.array(X_binned) == missing_values_bin_idx)
non_missing_samples_indices = np.flatnonzero(
np.array(X_binned) != missing_values_bin_idx)
assert set(samples_right) == set(missing_samples_indices)
assert set(samples_left) == set(non_missing_samples_indices)

View file

@@ -0,0 +1,206 @@
import numpy as np
from numpy.testing import assert_array_equal
from numpy.testing import assert_allclose
import pytest
from sklearn.base import clone
from sklearn.datasets import make_classification, make_regression
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import check_scoring
X_classification, y_classification = make_classification(random_state=0)
X_regression, y_regression = make_regression(random_state=0)
def _assert_predictor_equal(gb_1, gb_2, X):
"""Assert that two HistGBM instances are identical."""
# Check identical nodes for each tree
for (pred_ith_1, pred_ith_2) in zip(gb_1._predictors, gb_2._predictors):
for (predictor_1, predictor_2) in zip(pred_ith_1, pred_ith_2):
assert_array_equal(predictor_1.nodes, predictor_2.nodes)
# Check identical predictions
assert_allclose(gb_1.predict(X), gb_2.predict(X))
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_max_iter_with_warm_start_validation(GradientBoosting, X, y):
# Check that a ValueError is raised when the maximum number of iterations
# is smaller than the number of iterations from the previous fit when warm
# start is True.
estimator = GradientBoosting(max_iter=10, early_stopping=False,
warm_start=True)
estimator.fit(X, y)
estimator.set_params(max_iter=5)
err_msg = ('max_iter=5 must be larger than or equal to n_iter_=10 '
'when warm_start==True')
with pytest.raises(ValueError, match=err_msg):
estimator.fit(X, y)
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_yields_identical_results(GradientBoosting, X, y):
# Make sure that fitting 50 iterations and then 25 with warm start is
# equivalent to fitting 75 iterations.
rng = 42
gb_warm_start = GradientBoosting(
n_iter_no_change=100, max_iter=50, random_state=rng, warm_start=True
)
gb_warm_start.fit(X, y).set_params(max_iter=75).fit(X, y)
gb_no_warm_start = GradientBoosting(
n_iter_no_change=100, max_iter=75, random_state=rng, warm_start=False
)
gb_no_warm_start.fit(X, y)
# Check that both predictors are equal
_assert_predictor_equal(gb_warm_start, gb_no_warm_start, X)
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_max_depth(GradientBoosting, X, y):
    # Test that trees of different depths can be fitted in the same ensemble.
gb = GradientBoosting(max_iter=20, min_samples_leaf=1,
warm_start=True, max_depth=2, early_stopping=False)
gb.fit(X, y)
gb.set_params(max_iter=30, max_depth=3, n_iter_no_change=110)
gb.fit(X, y)
# First 20 trees have max_depth == 2
for i in range(20):
assert gb._predictors[i][0].get_max_depth() == 2
# Last 10 trees have max_depth == 3
for i in range(1, 11):
assert gb._predictors[-i][0].get_max_depth() == 3
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
@pytest.mark.parametrize('scoring', (None, 'loss'))
def test_warm_start_early_stopping(GradientBoosting, X, y, scoring):
# Make sure that early stopping occurs after a small number of iterations
# when fitting a second time with warm starting.
n_iter_no_change = 5
gb = GradientBoosting(
n_iter_no_change=n_iter_no_change, max_iter=10000, early_stopping=True,
random_state=42, warm_start=True, tol=1e-3, scoring=scoring,
)
gb.fit(X, y)
n_iter_first_fit = gb.n_iter_
gb.fit(X, y)
n_iter_second_fit = gb.n_iter_
assert 0 < n_iter_second_fit - n_iter_first_fit < n_iter_no_change
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_equal_n_estimators(GradientBoosting, X, y):
# Test if warm start with equal n_estimators does nothing
gb_1 = GradientBoosting(max_depth=2, early_stopping=False)
gb_1.fit(X, y)
gb_2 = clone(gb_1)
gb_2.set_params(max_iter=gb_1.max_iter, warm_start=True,
n_iter_no_change=5)
gb_2.fit(X, y)
# Check that both predictors are equal
_assert_predictor_equal(gb_1, gb_2, X)
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_clear(GradientBoosting, X, y):
# Test if fit clears state.
gb_1 = GradientBoosting(n_iter_no_change=5, random_state=42)
gb_1.fit(X, y)
gb_2 = GradientBoosting(n_iter_no_change=5, random_state=42,
warm_start=True)
gb_2.fit(X, y) # inits state
gb_2.set_params(warm_start=False)
gb_2.fit(X, y) # clears old state and equals est
# Check that both predictors have the same train_score_ and
# validation_score_ attributes
assert_allclose(gb_1.train_score_, gb_2.train_score_)
assert_allclose(gb_1.validation_score_, gb_2.validation_score_)
# Check that both predictors are equal
_assert_predictor_equal(gb_1, gb_2, X)
@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
])
@pytest.mark.parametrize('rng_type', ('none', 'int', 'instance'))
def test_random_seeds_warm_start(GradientBoosting, X, y, rng_type):
# Make sure the seeds for train/val split and small trainset subsampling
# are correctly set in a warm start context.
def _get_rng(rng_type):
# Helper to avoid consuming rngs
if rng_type == 'none':
return None
elif rng_type == 'int':
return 42
else:
return np.random.RandomState(0)
random_state = _get_rng(rng_type)
gb_1 = GradientBoosting(early_stopping=True, max_iter=2,
random_state=random_state)
gb_1.set_params(scoring=check_scoring(gb_1))
gb_1.fit(X, y)
random_seed_1_1 = gb_1._random_seed
gb_1.fit(X, y)
random_seed_1_2 = gb_1._random_seed # clear the old state, different seed
random_state = _get_rng(rng_type)
gb_2 = GradientBoosting(early_stopping=True, max_iter=2,
random_state=random_state, warm_start=True)
gb_2.set_params(scoring=check_scoring(gb_2))
gb_2.fit(X, y) # inits state
random_seed_2_1 = gb_2._random_seed
gb_2.fit(X, y) # clears old state and equals est
random_seed_2_2 = gb_2._random_seed
# Without warm starting, the seeds should be
# * all different if random state is None
# * all equal if random state is an integer
# * different when refitting and equal with a new estimator (because
# the random state is mutated)
if rng_type == 'none':
assert random_seed_1_1 != random_seed_1_2 != random_seed_2_1
elif rng_type == 'int':
assert random_seed_1_1 == random_seed_1_2 == random_seed_2_1
else:
assert random_seed_1_1 == random_seed_2_1 != random_seed_1_2
# With warm starting, the seeds must be equal
assert random_seed_2_1 == random_seed_2_2