Uploaded Test files
This commit is contained in:
parent
f584ad9d97
commit
2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
|
@ -0,0 +1,5 @@
|
|||
"""This module implements histogram-based gradient boosting estimators.
|
||||
|
||||
The implementation is a port from pygbm which is itself strongly inspired
|
||||
from LightGBM.
|
||||
"""
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,204 @@
|
|||
"""
|
||||
This module contains the BinMapper class.
|
||||
|
||||
BinMapper is used for mapping a real-valued dataset into integer-valued bins.
|
||||
Bin thresholds are computed with the quantiles so that each bin contains
|
||||
approximately the same number of samples.
|
||||
"""
|
||||
# Author: Nicolas Hug
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ...utils import check_random_state, check_array
|
||||
from ...base import BaseEstimator, TransformerMixin
|
||||
from ...utils.validation import check_is_fitted
|
||||
from ._binning import _map_to_bins
|
||||
from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF
|
||||
|
||||
|
||||
def _find_binning_thresholds(data, max_bins, subsample, random_state):
|
||||
"""Extract feature-wise quantiles from numerical data.
|
||||
|
||||
Missing values are ignored for finding the thresholds.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array-like, shape (n_samples, n_features)
|
||||
The data to bin.
|
||||
max_bins: int
|
||||
The maximum number of bins to use for non-missing values. If for a
|
||||
given feature the number of unique values is less than ``max_bins``,
|
||||
then those unique values will be used to compute the bin thresholds,
|
||||
instead of the quantiles.
|
||||
subsample : int or None
|
||||
If ``n_samples > subsample``, then ``sub_samples`` samples will be
|
||||
randomly chosen to compute the quantiles. If ``None``, the whole data
|
||||
is used.
|
||||
random_state: int, RandomState instance or None
|
||||
Pseudo-random number generator to control the random sub-sampling.
|
||||
Pass an int for reproducible output across multiple
|
||||
function calls.
|
||||
See :term: `Glossary <random_state>`.
|
||||
|
||||
Return
|
||||
------
|
||||
binning_thresholds: list of arrays
|
||||
For each feature, stores the increasing numeric values that can
|
||||
be used to separate the bins. Thus ``len(binning_thresholds) ==
|
||||
n_features``.
|
||||
"""
|
||||
rng = check_random_state(random_state)
|
||||
if subsample is not None and data.shape[0] > subsample:
|
||||
subset = rng.choice(data.shape[0], subsample, replace=False)
|
||||
data = data.take(subset, axis=0)
|
||||
|
||||
binning_thresholds = []
|
||||
for f_idx in range(data.shape[1]):
|
||||
col_data = data[:, f_idx]
|
||||
# ignore missing values when computing bin thresholds
|
||||
missing_mask = np.isnan(col_data)
|
||||
if missing_mask.any():
|
||||
col_data = col_data[~missing_mask]
|
||||
col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)
|
||||
distinct_values = np.unique(col_data)
|
||||
if len(distinct_values) <= max_bins:
|
||||
midpoints = distinct_values[:-1] + distinct_values[1:]
|
||||
midpoints *= .5
|
||||
else:
|
||||
# We sort again the data in this case. We could compute
|
||||
# approximate midpoint percentiles using the output of
|
||||
# np.unique(col_data, return_counts) instead but this is more
|
||||
# work and the performance benefit will be limited because we
|
||||
# work on a fixed-size subsample of the full data.
|
||||
percentiles = np.linspace(0, 100, num=max_bins + 1)
|
||||
percentiles = percentiles[1:-1]
|
||||
midpoints = np.percentile(col_data, percentiles,
|
||||
interpolation='midpoint').astype(X_DTYPE)
|
||||
assert midpoints.shape[0] == max_bins - 1
|
||||
|
||||
# We avoid having +inf thresholds: +inf thresholds are only allowed in
|
||||
# a "split on nan" situation.
|
||||
np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints)
|
||||
|
||||
binning_thresholds.append(midpoints)
|
||||
|
||||
return binning_thresholds
|
||||
|
||||
|
||||
class _BinMapper(TransformerMixin, BaseEstimator):
|
||||
"""Transformer that maps a dataset into integer-valued bins.
|
||||
|
||||
The bins are created in a feature-wise fashion, using quantiles so that
|
||||
each bins contains approximately the same number of samples.
|
||||
|
||||
For large datasets, quantiles are computed on a subset of the data to
|
||||
speed-up the binning, but the quantiles should remain stable.
|
||||
|
||||
Features with a small number of values may be binned into less than
|
||||
``n_bins`` bins. The last bin (at index ``n_bins - 1``) is always reserved
|
||||
for missing values.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_bins : int, optional (default=256)
|
||||
The maximum number of bins to use (including the bin for missing
|
||||
values). Non-missing values are binned on ``max_bins = n_bins - 1``
|
||||
bins. The last bin is always reserved for missing values. If for a
|
||||
given feature the number of unique values is less than ``max_bins``,
|
||||
then those unique values will be used to compute the bin thresholds,
|
||||
instead of the quantiles.
|
||||
subsample : int or None, optional (default=2e5)
|
||||
If ``n_samples > subsample``, then ``sub_samples`` samples will be
|
||||
randomly chosen to compute the quantiles. If ``None``, the whole data
|
||||
is used.
|
||||
random_state: int, RandomState instance or None
|
||||
Pseudo-random number generator to control the random sub-sampling.
|
||||
Pass an int for reproducible output across multiple
|
||||
function calls.
|
||||
See :term: `Glossary <random_state>`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
bin_thresholds_ : list of arrays
|
||||
For each feature, gives the real-valued bin threhsolds. There are
|
||||
``max_bins - 1`` thresholds, where ``max_bins = n_bins - 1`` is the
|
||||
number of bins used for non-missing values.
|
||||
n_bins_non_missing_ : array of uint32
|
||||
For each feature, gives the number of bins actually used for
|
||||
non-missing values. For features with a lot of unique values, this is
|
||||
equal to ``n_bins - 1``.
|
||||
missing_values_bin_idx_ : uint8
|
||||
The index of the bin where missing values are mapped. This is a
|
||||
constant across all features. This corresponds to the last bin, and
|
||||
it is always equal to ``n_bins - 1``. Note that if ``n_bins_missing_``
|
||||
is less than ``n_bins - 1`` for a given feature, then there are
|
||||
empty (and unused) bins.
|
||||
"""
|
||||
def __init__(self, n_bins=256, subsample=int(2e5), random_state=None):
|
||||
self.n_bins = n_bins
|
||||
self.subsample = subsample
|
||||
self.random_state = random_state
|
||||
|
||||
def fit(self, X, y=None):
|
||||
"""Fit data X by computing the binning thresholds.
|
||||
|
||||
The last bin is reserved for missing values, whether missing values
|
||||
are present in the data or not.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape (n_samples, n_features)
|
||||
The data to bin.
|
||||
y: None
|
||||
Ignored.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
"""
|
||||
if not (3 <= self.n_bins <= 256):
|
||||
# min is 3: at least 2 distinct bins and a missing values bin
|
||||
raise ValueError('n_bins={} should be no smaller than 3 '
|
||||
'and no larger than 256.'.format(self.n_bins))
|
||||
|
||||
X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
|
||||
max_bins = self.n_bins - 1
|
||||
self.bin_thresholds_ = _find_binning_thresholds(
|
||||
X, max_bins, subsample=self.subsample,
|
||||
random_state=self.random_state)
|
||||
|
||||
self.n_bins_non_missing_ = np.array(
|
||||
[thresholds.shape[0] + 1 for thresholds in self.bin_thresholds_],
|
||||
dtype=np.uint32)
|
||||
|
||||
self.missing_values_bin_idx_ = self.n_bins - 1
|
||||
|
||||
return self
|
||||
|
||||
def transform(self, X):
|
||||
"""Bin data X.
|
||||
|
||||
Missing values will be mapped to the last bin.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape (n_samples, n_features)
|
||||
The data to bin.
|
||||
|
||||
Returns
|
||||
-------
|
||||
X_binned : array-like, shape (n_samples, n_features)
|
||||
The binned data (fortran-aligned).
|
||||
"""
|
||||
X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
|
||||
check_is_fitted(self)
|
||||
if X.shape[1] != self.n_bins_non_missing_.shape[0]:
|
||||
raise ValueError(
|
||||
'This estimator was fitted with {} features but {} got passed '
|
||||
'to transform()'.format(self.n_bins_non_missing_.shape[0],
|
||||
X.shape[1])
|
||||
)
|
||||
binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F')
|
||||
_map_to_bins(X, self.bin_thresholds_, self.missing_values_bin_idx_,
|
||||
binned)
|
||||
return binned
|
Binary file not shown.
|
@ -0,0 +1,40 @@
|
|||
# cython: language_level=3
|
||||
import numpy as np
|
||||
cimport numpy as np
|
||||
|
||||
np.import_array()
|
||||
|
||||
|
||||
ctypedef np.npy_float64 X_DTYPE_C
|
||||
ctypedef np.npy_uint8 X_BINNED_DTYPE_C
|
||||
ctypedef np.npy_float64 Y_DTYPE_C
|
||||
ctypedef np.npy_float32 G_H_DTYPE_C
|
||||
|
||||
cdef packed struct hist_struct:
|
||||
# Same as histogram dtype but we need a struct to declare views. It needs
|
||||
# to be packed since by default numpy dtypes aren't aligned
|
||||
Y_DTYPE_C sum_gradients
|
||||
Y_DTYPE_C sum_hessians
|
||||
unsigned int count
|
||||
|
||||
|
||||
cdef packed struct node_struct:
|
||||
# Equivalent struct to PREDICTOR_RECORD_DTYPE to use in memory views. It
|
||||
# needs to be packed since by default numpy dtypes aren't aligned
|
||||
Y_DTYPE_C value
|
||||
unsigned int count
|
||||
unsigned int feature_idx
|
||||
X_DTYPE_C threshold
|
||||
unsigned char missing_go_to_left
|
||||
unsigned int left
|
||||
unsigned int right
|
||||
Y_DTYPE_C gain
|
||||
unsigned int depth
|
||||
unsigned char is_leaf
|
||||
X_BINNED_DTYPE_C bin_threshold
|
||||
|
||||
|
||||
cpdef enum MonotonicConstraint:
|
||||
NO_CST = 0
|
||||
POS = 1
|
||||
NEG = -1
|
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,571 @@
|
|||
"""
|
||||
This module contains the TreeGrower class.
|
||||
|
||||
TreeGrowee builds a regression tree fitting a Newton-Raphson step, based on
|
||||
the gradients and hessians of the training data.
|
||||
"""
|
||||
# Author: Nicolas Hug
|
||||
|
||||
from heapq import heappush, heappop
|
||||
import numpy as np
|
||||
from timeit import default_timer as time
|
||||
import numbers
|
||||
|
||||
from .splitting import Splitter
|
||||
from .histogram import HistogramBuilder
|
||||
from .predictor import TreePredictor
|
||||
from .utils import sum_parallel
|
||||
from .common import PREDICTOR_RECORD_DTYPE
|
||||
from .common import Y_DTYPE
|
||||
from .common import MonotonicConstraint
|
||||
|
||||
|
||||
EPS = np.finfo(Y_DTYPE).eps # to avoid zero division errors
|
||||
|
||||
|
||||
class TreeNode:
|
||||
"""Tree Node class used in TreeGrower.
|
||||
|
||||
This isn't used for prediction purposes, only for training (see
|
||||
TreePredictor).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
depth : int
|
||||
The depth of the node, i.e. its distance from the root.
|
||||
sample_indices : ndarray of unsigned int, shape (n_samples_at_node,)
|
||||
The indices of the samples at the node.
|
||||
sum_gradients : float
|
||||
The sum of the gradients of the samples at the node.
|
||||
sum_hessians : float
|
||||
The sum of the hessians of the samples at the node.
|
||||
parent : TreeNode or None, optional (default=None)
|
||||
The parent of the node. None for root.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
depth : int
|
||||
The depth of the node, i.e. its distance from the root.
|
||||
sample_indices : ndarray of unsigned int, shape (n_samples_at_node,)
|
||||
The indices of the samples at the node.
|
||||
sum_gradients : float
|
||||
The sum of the gradients of the samples at the node.
|
||||
sum_hessians : float
|
||||
The sum of the hessians of the samples at the node.
|
||||
parent : TreeNode or None
|
||||
The parent of the node. None for root.
|
||||
split_info : SplitInfo or None
|
||||
The result of the split evaluation.
|
||||
left_child : TreeNode or None
|
||||
The left child of the node. None for leaves.
|
||||
right_child : TreeNode or None
|
||||
The right child of the node. None for leaves.
|
||||
value : float or None
|
||||
The value of the leaf, as computed in finalize_leaf(). None for
|
||||
non-leaf nodes.
|
||||
partition_start : int
|
||||
start position of the node's sample_indices in splitter.partition.
|
||||
partition_stop : int
|
||||
stop position of the node's sample_indices in splitter.partition.
|
||||
"""
|
||||
|
||||
split_info = None
|
||||
left_child = None
|
||||
right_child = None
|
||||
histograms = None
|
||||
sibling = None
|
||||
parent = None
|
||||
|
||||
# start and stop indices of the node in the splitter.partition
|
||||
# array. Concretely,
|
||||
# self.sample_indices = view(self.splitter.partition[start:stop])
|
||||
# Please see the comments about splitter.partition and
|
||||
# splitter.split_indices for more info about this design.
|
||||
# These 2 attributes are only used in _update_raw_prediction, because we
|
||||
# need to iterate over the leaves and I don't know how to efficiently
|
||||
# store the sample_indices views because they're all of different sizes.
|
||||
partition_start = 0
|
||||
partition_stop = 0
|
||||
|
||||
def __init__(self, depth, sample_indices, sum_gradients,
|
||||
sum_hessians, parent=None, value=None):
|
||||
self.depth = depth
|
||||
self.sample_indices = sample_indices
|
||||
self.n_samples = sample_indices.shape[0]
|
||||
self.sum_gradients = sum_gradients
|
||||
self.sum_hessians = sum_hessians
|
||||
self.parent = parent
|
||||
self.value = value
|
||||
self.is_leaf = False
|
||||
self.set_children_bounds(float('-inf'), float('+inf'))
|
||||
|
||||
def set_children_bounds(self, lower, upper):
|
||||
"""Set children values bounds to respect monotonic constraints."""
|
||||
|
||||
# These are bounds for the node's *children* values, not the node's
|
||||
# value. The bounds are used in the splitter when considering potential
|
||||
# left and right child.
|
||||
self.children_lower_bound = lower
|
||||
self.children_upper_bound = upper
|
||||
|
||||
def __lt__(self, other_node):
|
||||
"""Comparison for priority queue.
|
||||
|
||||
Nodes with high gain are higher priority than nodes with low gain.
|
||||
|
||||
heapq.heappush only need the '<' operator.
|
||||
heapq.heappop take the smallest item first (smaller is higher
|
||||
priority).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
other_node : TreeNode
|
||||
The node to compare with.
|
||||
"""
|
||||
return self.split_info.gain > other_node.split_info.gain
|
||||
|
||||
|
||||
class TreeGrower:
|
||||
"""Tree grower class used to build a tree.
|
||||
|
||||
The tree is fitted to predict the values of a Newton-Raphson step. The
|
||||
splits are considered in a best-first fashion, and the quality of a
|
||||
split is defined in splitting._split_gain.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X_binned : ndarray of int, shape (n_samples, n_features)
|
||||
The binned input samples. Must be Fortran-aligned.
|
||||
gradients : ndarray, shape (n_samples,)
|
||||
The gradients of each training sample. Those are the gradients of the
|
||||
loss w.r.t the predictions, evaluated at iteration ``i - 1``.
|
||||
hessians : ndarray, shape (n_samples,)
|
||||
The hessians of each training sample. Those are the hessians of the
|
||||
loss w.r.t the predictions, evaluated at iteration ``i - 1``.
|
||||
max_leaf_nodes : int or None, optional (default=None)
|
||||
The maximum number of leaves for each tree. If None, there is no
|
||||
maximum limit.
|
||||
max_depth : int or None, optional (default=None)
|
||||
The maximum depth of each tree. The depth of a tree is the number of
|
||||
edges to go from the root to the deepest leaf.
|
||||
Depth isn't constrained by default.
|
||||
min_samples_leaf : int, optional (default=20)
|
||||
The minimum number of samples per leaf.
|
||||
min_gain_to_split : float, optional (default=0.)
|
||||
The minimum gain needed to split a node. Splits with lower gain will
|
||||
be ignored.
|
||||
n_bins : int, optional (default=256)
|
||||
The total number of bins, including the bin for missing values. Used
|
||||
to define the shape of the histograms.
|
||||
n_bins_non_missing_ : array of uint32
|
||||
For each feature, gives the number of bins actually used for
|
||||
non-missing values. For features with a lot of unique values, this
|
||||
is equal to ``n_bins - 1``. If it's an int, all features are
|
||||
considered to have the same number of bins. If None, all features
|
||||
are considered to have ``n_bins - 1`` bins.
|
||||
has_missing_values : ndarray of bool or bool, optional (default=False)
|
||||
Whether each feature contains missing values (in the training data).
|
||||
If it's a bool, the same value is used for all features.
|
||||
l2_regularization : float, optional (default=0)
|
||||
The L2 regularization parameter.
|
||||
min_hessian_to_split : float, optional (default=1e-3)
|
||||
The minimum sum of hessians needed in each node. Splits that result in
|
||||
at least one child having a sum of hessians less than
|
||||
``min_hessian_to_split`` are discarded.
|
||||
shrinkage : float, optional (default=1)
|
||||
The shrinkage parameter to apply to the leaves values, also known as
|
||||
learning rate.
|
||||
"""
|
||||
def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None,
|
||||
max_depth=None, min_samples_leaf=20, min_gain_to_split=0.,
|
||||
n_bins=256, n_bins_non_missing=None, has_missing_values=False,
|
||||
monotonic_cst=None, l2_regularization=0.,
|
||||
min_hessian_to_split=1e-3, shrinkage=1.):
|
||||
|
||||
self._validate_parameters(X_binned, max_leaf_nodes, max_depth,
|
||||
min_samples_leaf, min_gain_to_split,
|
||||
l2_regularization, min_hessian_to_split)
|
||||
|
||||
if n_bins_non_missing is None:
|
||||
n_bins_non_missing = n_bins - 1
|
||||
|
||||
if isinstance(n_bins_non_missing, numbers.Integral):
|
||||
n_bins_non_missing = np.array(
|
||||
[n_bins_non_missing] * X_binned.shape[1],
|
||||
dtype=np.uint32)
|
||||
else:
|
||||
n_bins_non_missing = np.asarray(n_bins_non_missing,
|
||||
dtype=np.uint32)
|
||||
|
||||
if isinstance(has_missing_values, bool):
|
||||
has_missing_values = [has_missing_values] * X_binned.shape[1]
|
||||
has_missing_values = np.asarray(has_missing_values, dtype=np.uint8)
|
||||
|
||||
if monotonic_cst is None:
|
||||
self.with_monotonic_cst = False
|
||||
monotonic_cst = np.full(shape=X_binned.shape[1],
|
||||
fill_value=MonotonicConstraint.NO_CST,
|
||||
dtype=np.int8)
|
||||
else:
|
||||
self.with_monotonic_cst = True
|
||||
monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8)
|
||||
|
||||
if monotonic_cst.shape[0] != X_binned.shape[1]:
|
||||
raise ValueError(
|
||||
"monotonic_cst has shape {} but the input data "
|
||||
"X has {} features.".format(
|
||||
monotonic_cst.shape[0], X_binned.shape[1]
|
||||
)
|
||||
)
|
||||
if np.any(monotonic_cst < -1) or np.any(monotonic_cst > 1):
|
||||
raise ValueError(
|
||||
"monotonic_cst must be None or an array-like of "
|
||||
"-1, 0 or 1."
|
||||
)
|
||||
|
||||
hessians_are_constant = hessians.shape[0] == 1
|
||||
self.histogram_builder = HistogramBuilder(
|
||||
X_binned, n_bins, gradients, hessians, hessians_are_constant)
|
||||
missing_values_bin_idx = n_bins - 1
|
||||
self.splitter = Splitter(
|
||||
X_binned, n_bins_non_missing, missing_values_bin_idx,
|
||||
has_missing_values, monotonic_cst,
|
||||
l2_regularization, min_hessian_to_split,
|
||||
min_samples_leaf, min_gain_to_split, hessians_are_constant)
|
||||
self.n_bins_non_missing = n_bins_non_missing
|
||||
self.max_leaf_nodes = max_leaf_nodes
|
||||
self.has_missing_values = has_missing_values
|
||||
self.monotonic_cst = monotonic_cst
|
||||
self.l2_regularization = l2_regularization
|
||||
self.n_features = X_binned.shape[1]
|
||||
self.max_depth = max_depth
|
||||
self.min_samples_leaf = min_samples_leaf
|
||||
self.X_binned = X_binned
|
||||
self.min_gain_to_split = min_gain_to_split
|
||||
self.shrinkage = shrinkage
|
||||
self.splittable_nodes = []
|
||||
self.finalized_leaves = []
|
||||
self.total_find_split_time = 0. # time spent finding the best splits
|
||||
self.total_compute_hist_time = 0. # time spent computing histograms
|
||||
self.total_apply_split_time = 0. # time spent splitting nodes
|
||||
self._intilialize_root(gradients, hessians, hessians_are_constant)
|
||||
self.n_nodes = 1
|
||||
|
||||
def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth,
|
||||
min_samples_leaf, min_gain_to_split,
|
||||
l2_regularization, min_hessian_to_split):
|
||||
"""Validate parameters passed to __init__.
|
||||
|
||||
Also validate parameters passed to splitter.
|
||||
"""
|
||||
if X_binned.dtype != np.uint8:
|
||||
raise NotImplementedError(
|
||||
"X_binned must be of type uint8.")
|
||||
if not X_binned.flags.f_contiguous:
|
||||
raise ValueError(
|
||||
"X_binned should be passed as Fortran contiguous "
|
||||
"array for maximum efficiency.")
|
||||
if max_leaf_nodes is not None and max_leaf_nodes <= 1:
|
||||
raise ValueError('max_leaf_nodes={} should not be'
|
||||
' smaller than 2'.format(max_leaf_nodes))
|
||||
if max_depth is not None and max_depth < 1:
|
||||
raise ValueError('max_depth={} should not be'
|
||||
' smaller than 1'.format(max_depth))
|
||||
if min_samples_leaf < 1:
|
||||
raise ValueError('min_samples_leaf={} should '
|
||||
'not be smaller than 1'.format(min_samples_leaf))
|
||||
if min_gain_to_split < 0:
|
||||
raise ValueError('min_gain_to_split={} '
|
||||
'must be positive.'.format(min_gain_to_split))
|
||||
if l2_regularization < 0:
|
||||
raise ValueError('l2_regularization={} must be '
|
||||
'positive.'.format(l2_regularization))
|
||||
if min_hessian_to_split < 0:
|
||||
raise ValueError('min_hessian_to_split={} '
|
||||
'must be positive.'.format(min_hessian_to_split))
|
||||
|
||||
def grow(self):
|
||||
"""Grow the tree, from root to leaves."""
|
||||
while self.splittable_nodes:
|
||||
self.split_next()
|
||||
|
||||
self._apply_shrinkage()
|
||||
|
||||
def _apply_shrinkage(self):
|
||||
"""Multiply leaves values by shrinkage parameter.
|
||||
|
||||
This must be done at the very end of the growing process. If this were
|
||||
done during the growing process e.g. in finalize_leaf(), then a leaf
|
||||
would be shrunk but its sibling would potentially not be (if it's a
|
||||
non-leaf), which would lead to a wrong computation of the 'middle'
|
||||
value needed to enforce the monotonic constraints.
|
||||
"""
|
||||
for leaf in self.finalized_leaves:
|
||||
leaf.value *= self.shrinkage
|
||||
|
||||
def _intilialize_root(self, gradients, hessians, hessians_are_constant):
|
||||
"""Initialize root node and finalize it if needed."""
|
||||
n_samples = self.X_binned.shape[0]
|
||||
depth = 0
|
||||
sum_gradients = sum_parallel(gradients)
|
||||
if self.histogram_builder.hessians_are_constant:
|
||||
sum_hessians = hessians[0] * n_samples
|
||||
else:
|
||||
sum_hessians = sum_parallel(hessians)
|
||||
self.root = TreeNode(
|
||||
depth=depth,
|
||||
sample_indices=self.splitter.partition,
|
||||
sum_gradients=sum_gradients,
|
||||
sum_hessians=sum_hessians,
|
||||
value=0
|
||||
)
|
||||
|
||||
self.root.partition_start = 0
|
||||
self.root.partition_stop = n_samples
|
||||
|
||||
if self.root.n_samples < 2 * self.min_samples_leaf:
|
||||
# Do not even bother computing any splitting statistics.
|
||||
self._finalize_leaf(self.root)
|
||||
return
|
||||
if sum_hessians < self.splitter.min_hessian_to_split:
|
||||
self._finalize_leaf(self.root)
|
||||
return
|
||||
|
||||
self.root.histograms = self.histogram_builder.compute_histograms_brute(
|
||||
self.root.sample_indices)
|
||||
self._compute_best_split_and_push(self.root)
|
||||
|
||||
def _compute_best_split_and_push(self, node):
|
||||
"""Compute the best possible split (SplitInfo) of a given node.
|
||||
|
||||
Also push it in the heap of splittable nodes if gain isn't zero.
|
||||
The gain of a node is 0 if either all the leaves are pure
|
||||
(best gain = 0), or if no split would satisfy the constraints,
|
||||
(min_hessians_to_split, min_gain_to_split, min_samples_leaf)
|
||||
"""
|
||||
|
||||
node.split_info = self.splitter.find_node_split(
|
||||
node.n_samples, node.histograms, node.sum_gradients,
|
||||
node.sum_hessians, node.value, node.children_lower_bound,
|
||||
node.children_upper_bound)
|
||||
|
||||
if node.split_info.gain <= 0: # no valid split
|
||||
self._finalize_leaf(node)
|
||||
else:
|
||||
heappush(self.splittable_nodes, node)
|
||||
|
||||
def split_next(self):
|
||||
"""Split the node with highest potential gain.
|
||||
|
||||
Returns
|
||||
-------
|
||||
left : TreeNode
|
||||
The resulting left child.
|
||||
right : TreeNode
|
||||
The resulting right child.
|
||||
"""
|
||||
# Consider the node with the highest loss reduction (a.k.a. gain)
|
||||
node = heappop(self.splittable_nodes)
|
||||
|
||||
tic = time()
|
||||
(sample_indices_left,
|
||||
sample_indices_right,
|
||||
right_child_pos) = self.splitter.split_indices(node.split_info,
|
||||
node.sample_indices)
|
||||
self.total_apply_split_time += time() - tic
|
||||
|
||||
depth = node.depth + 1
|
||||
n_leaf_nodes = len(self.finalized_leaves) + len(self.splittable_nodes)
|
||||
n_leaf_nodes += 2
|
||||
|
||||
left_child_node = TreeNode(depth,
|
||||
sample_indices_left,
|
||||
node.split_info.sum_gradient_left,
|
||||
node.split_info.sum_hessian_left,
|
||||
parent=node,
|
||||
value=node.split_info.value_left,
|
||||
)
|
||||
right_child_node = TreeNode(depth,
|
||||
sample_indices_right,
|
||||
node.split_info.sum_gradient_right,
|
||||
node.split_info.sum_hessian_right,
|
||||
parent=node,
|
||||
value=node.split_info.value_right,
|
||||
)
|
||||
|
||||
left_child_node.sibling = right_child_node
|
||||
right_child_node.sibling = left_child_node
|
||||
node.right_child = right_child_node
|
||||
node.left_child = left_child_node
|
||||
|
||||
# set start and stop indices
|
||||
left_child_node.partition_start = node.partition_start
|
||||
left_child_node.partition_stop = node.partition_start + right_child_pos
|
||||
right_child_node.partition_start = left_child_node.partition_stop
|
||||
right_child_node.partition_stop = node.partition_stop
|
||||
|
||||
if not self.has_missing_values[node.split_info.feature_idx]:
|
||||
# If no missing values are encountered at fit time, then samples
|
||||
# with missing values during predict() will go to whichever child
|
||||
# has the most samples.
|
||||
node.split_info.missing_go_to_left = (
|
||||
left_child_node.n_samples > right_child_node.n_samples)
|
||||
|
||||
self.n_nodes += 2
|
||||
|
||||
if (self.max_leaf_nodes is not None
|
||||
and n_leaf_nodes == self.max_leaf_nodes):
|
||||
self._finalize_leaf(left_child_node)
|
||||
self._finalize_leaf(right_child_node)
|
||||
self._finalize_splittable_nodes()
|
||||
return left_child_node, right_child_node
|
||||
|
||||
if self.max_depth is not None and depth == self.max_depth:
|
||||
self._finalize_leaf(left_child_node)
|
||||
self._finalize_leaf(right_child_node)
|
||||
return left_child_node, right_child_node
|
||||
|
||||
if left_child_node.n_samples < self.min_samples_leaf * 2:
|
||||
self._finalize_leaf(left_child_node)
|
||||
if right_child_node.n_samples < self.min_samples_leaf * 2:
|
||||
self._finalize_leaf(right_child_node)
|
||||
|
||||
if self.with_monotonic_cst:
|
||||
# Set value bounds for respecting monotonic constraints
|
||||
# See test_nodes_values() for details
|
||||
if (self.monotonic_cst[node.split_info.feature_idx] ==
|
||||
MonotonicConstraint.NO_CST):
|
||||
lower_left = lower_right = node.children_lower_bound
|
||||
upper_left = upper_right = node.children_upper_bound
|
||||
else:
|
||||
mid = (left_child_node.value + right_child_node.value) / 2
|
||||
if (self.monotonic_cst[node.split_info.feature_idx] ==
|
||||
MonotonicConstraint.POS):
|
||||
lower_left, upper_left = node.children_lower_bound, mid
|
||||
lower_right, upper_right = mid, node.children_upper_bound
|
||||
else: # NEG
|
||||
lower_left, upper_left = mid, node.children_upper_bound
|
||||
lower_right, upper_right = node.children_lower_bound, mid
|
||||
left_child_node.set_children_bounds(lower_left, upper_left)
|
||||
right_child_node.set_children_bounds(lower_right, upper_right)
|
||||
|
||||
# Compute histograms of children, and compute their best possible split
|
||||
# (if needed)
|
||||
should_split_left = not left_child_node.is_leaf
|
||||
should_split_right = not right_child_node.is_leaf
|
||||
if should_split_left or should_split_right:
|
||||
|
||||
# We will compute the histograms of both nodes even if one of them
|
||||
# is a leaf, since computing the second histogram is very cheap
|
||||
# (using histogram subtraction).
|
||||
n_samples_left = left_child_node.sample_indices.shape[0]
|
||||
n_samples_right = right_child_node.sample_indices.shape[0]
|
||||
if n_samples_left < n_samples_right:
|
||||
smallest_child = left_child_node
|
||||
largest_child = right_child_node
|
||||
else:
|
||||
smallest_child = right_child_node
|
||||
largest_child = left_child_node
|
||||
|
||||
# We use the brute O(n_samples) method on the child that has the
|
||||
# smallest number of samples, and the subtraction trick O(n_bins)
|
||||
# on the other one.
|
||||
tic = time()
|
||||
smallest_child.histograms = \
|
||||
self.histogram_builder.compute_histograms_brute(
|
||||
smallest_child.sample_indices)
|
||||
largest_child.histograms = \
|
||||
self.histogram_builder.compute_histograms_subtraction(
|
||||
node.histograms, smallest_child.histograms)
|
||||
self.total_compute_hist_time += time() - tic
|
||||
|
||||
tic = time()
|
||||
if should_split_left:
|
||||
self._compute_best_split_and_push(left_child_node)
|
||||
if should_split_right:
|
||||
self._compute_best_split_and_push(right_child_node)
|
||||
self.total_find_split_time += time() - tic
|
||||
|
||||
return left_child_node, right_child_node
|
||||
|
||||
def _finalize_leaf(self, node):
|
||||
"""Make node a leaf of the tree being grown."""
|
||||
|
||||
node.is_leaf = True
|
||||
self.finalized_leaves.append(node)
|
||||
|
||||
def _finalize_splittable_nodes(self):
|
||||
"""Transform all splittable nodes into leaves.
|
||||
|
||||
Used when some constraint is met e.g. maximum number of leaves or
|
||||
maximum depth."""
|
||||
while len(self.splittable_nodes) > 0:
|
||||
node = self.splittable_nodes.pop()
|
||||
self._finalize_leaf(node)
|
||||
|
||||
def make_predictor(self, bin_thresholds=None):
|
||||
"""Make a TreePredictor object out of the current tree.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
bin_thresholds : array-like of floats, optional (default=None)
|
||||
The actual thresholds values of each bin.
|
||||
|
||||
Returns
|
||||
-------
|
||||
A TreePredictor object.
|
||||
"""
|
||||
predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE)
|
||||
_fill_predictor_node_array(predictor_nodes, self.root,
|
||||
bin_thresholds, self.n_bins_non_missing)
|
||||
return TreePredictor(predictor_nodes)
|
||||
|
||||
|
||||
def _fill_predictor_node_array(predictor_nodes, grower_node,
|
||||
bin_thresholds, n_bins_non_missing,
|
||||
next_free_idx=0):
|
||||
"""Helper used in make_predictor to set the TreePredictor fields."""
|
||||
node = predictor_nodes[next_free_idx]
|
||||
node['count'] = grower_node.n_samples
|
||||
node['depth'] = grower_node.depth
|
||||
if grower_node.split_info is not None:
|
||||
node['gain'] = grower_node.split_info.gain
|
||||
else:
|
||||
node['gain'] = -1
|
||||
|
||||
node['value'] = grower_node.value
|
||||
|
||||
if grower_node.is_leaf:
|
||||
# Leaf node
|
||||
node['is_leaf'] = True
|
||||
return next_free_idx + 1
|
||||
else:
|
||||
# Decision node
|
||||
split_info = grower_node.split_info
|
||||
feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx
|
||||
node['feature_idx'] = feature_idx
|
||||
node['bin_threshold'] = bin_idx
|
||||
node['missing_go_to_left'] = split_info.missing_go_to_left
|
||||
|
||||
if split_info.bin_idx == n_bins_non_missing[feature_idx] - 1:
|
||||
# Split is on the last non-missing bin: it's a "split on nans". All
|
||||
# nans go to the right, the rest go to the left.
|
||||
node['threshold'] = np.inf
|
||||
elif bin_thresholds is not None:
|
||||
node['threshold'] = bin_thresholds[feature_idx][bin_idx]
|
||||
|
||||
next_free_idx += 1
|
||||
|
||||
node['left'] = next_free_idx
|
||||
next_free_idx = _fill_predictor_node_array(
|
||||
predictor_nodes, grower_node.left_child,
|
||||
bin_thresholds=bin_thresholds,
|
||||
n_bins_non_missing=n_bins_non_missing,
|
||||
next_free_idx=next_free_idx)
|
||||
|
||||
node['right'] = next_free_idx
|
||||
return _fill_predictor_node_array(
|
||||
predictor_nodes, grower_node.right_child,
|
||||
bin_thresholds=bin_thresholds,
|
||||
n_bins_non_missing=n_bins_non_missing,
|
||||
next_free_idx=next_free_idx)
|
Binary file not shown.
|
@ -0,0 +1,426 @@
|
|||
"""
|
||||
This module contains the loss classes.
|
||||
|
||||
Specific losses are used for regression, binary classification or multiclass
|
||||
classification.
|
||||
"""
|
||||
# Author: Nicolas Hug
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
import numpy as np
|
||||
from scipy.special import expit, logsumexp, xlogy
|
||||
|
||||
from .common import Y_DTYPE
|
||||
from .common import G_H_DTYPE
|
||||
from ._loss import _update_gradients_least_squares
|
||||
from ._loss import _update_gradients_hessians_least_squares
|
||||
from ._loss import _update_gradients_least_absolute_deviation
|
||||
from ._loss import _update_gradients_hessians_least_absolute_deviation
|
||||
from ._loss import _update_gradients_hessians_binary_crossentropy
|
||||
from ._loss import _update_gradients_hessians_categorical_crossentropy
|
||||
from ._loss import _update_gradients_hessians_poisson
|
||||
from ...utils.stats import _weighted_percentile
|
||||
|
||||
|
||||
class BaseLoss(ABC):
|
||||
"""Base class for a loss."""
|
||||
|
||||
def __init__(self, hessians_are_constant):
|
||||
self.hessians_are_constant = hessians_are_constant
|
||||
|
||||
def __call__(self, y_true, raw_predictions, sample_weight):
|
||||
"""Return the weighted average loss"""
|
||||
return np.average(self.pointwise_loss(y_true, raw_predictions),
|
||||
weights=sample_weight)
|
||||
|
||||
@abstractmethod
|
||||
def pointwise_loss(self, y_true, raw_predictions):
|
||||
"""Return loss value for each input"""
|
||||
|
||||
# This variable indicates whether the loss requires the leaves values to
|
||||
# be updated once the tree has been trained. The trees are trained to
|
||||
# predict a Newton-Raphson step (see grower._finalize_leaf()). But for
|
||||
# some losses (e.g. least absolute deviation) we need to adjust the tree
|
||||
# values to account for the "line search" of the gradient descent
|
||||
# procedure. See the original paper Greedy Function Approximation: A
|
||||
# Gradient Boosting Machine by Friedman
|
||||
# (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory.
|
||||
need_update_leaves_values = False
|
||||
|
||||
def init_gradients_and_hessians(self, n_samples, prediction_dim,
|
||||
sample_weight):
|
||||
"""Return initial gradients and hessians.
|
||||
|
||||
Unless hessians are constant, arrays are initialized with undefined
|
||||
values.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_samples : int
|
||||
The number of samples passed to `fit()`.
|
||||
|
||||
prediction_dim : int
|
||||
The dimension of a raw prediction, i.e. the number of trees
|
||||
built at each iteration. Equals 1 for regression and binary
|
||||
classification, or K where K is the number of classes for
|
||||
multiclass classification.
|
||||
|
||||
sample_weight : array-like of shape(n_samples,) default=None
|
||||
Weights of training data.
|
||||
|
||||
Returns
|
||||
-------
|
||||
gradients : ndarray, shape (prediction_dim, n_samples)
|
||||
The initial gradients. The array is not initialized.
|
||||
hessians : ndarray, shape (prediction_dim, n_samples)
|
||||
If hessians are constant (e.g. for `LeastSquares` loss, the
|
||||
array is initialized to ``1``. Otherwise, the array is allocated
|
||||
without being initialized.
|
||||
"""
|
||||
shape = (prediction_dim, n_samples)
|
||||
gradients = np.empty(shape=shape, dtype=G_H_DTYPE)
|
||||
|
||||
if self.hessians_are_constant:
|
||||
# If the hessians are constant, we consider they are equal to 1.
|
||||
# - This is correct for the half LS loss
|
||||
# - For LAD loss, hessians are actually 0, but they are always
|
||||
# ignored anyway.
|
||||
hessians = np.ones(shape=(1, 1), dtype=G_H_DTYPE)
|
||||
else:
|
||||
hessians = np.empty(shape=shape, dtype=G_H_DTYPE)
|
||||
|
||||
return gradients, hessians
|
||||
|
||||
@abstractmethod
|
||||
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
|
||||
"""Return initial predictions (before the first iteration).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
y_train : ndarray, shape (n_samples,)
|
||||
The target training values.
|
||||
|
||||
sample_weight : array-like of shape(n_samples,) default=None
|
||||
Weights of training data.
|
||||
|
||||
prediction_dim : int
|
||||
The dimension of one prediction: 1 for binary classification and
|
||||
regression, n_classes for multiclass classification.
|
||||
|
||||
Returns
|
||||
-------
|
||||
baseline_prediction : float or ndarray, shape (1, prediction_dim)
|
||||
The baseline prediction.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def update_gradients_and_hessians(self, gradients, hessians, y_true,
|
||||
raw_predictions, sample_weight):
|
||||
"""Update gradients and hessians arrays, inplace.
|
||||
|
||||
The gradients (resp. hessians) are the first (resp. second) order
|
||||
derivatives of the loss for each sample with respect to the
|
||||
predictions of model, evaluated at iteration ``i - 1``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
gradients : ndarray, shape (prediction_dim, n_samples)
|
||||
The gradients (treated as OUT array).
|
||||
|
||||
hessians : ndarray, shape (prediction_dim, n_samples) or \
|
||||
(1,)
|
||||
The hessians (treated as OUT array).
|
||||
|
||||
y_true : ndarray, shape (n_samples,)
|
||||
The true target values or each training sample.
|
||||
|
||||
raw_predictions : ndarray, shape (prediction_dim, n_samples)
|
||||
The raw_predictions (i.e. values from the trees) of the tree
|
||||
ensemble at iteration ``i - 1``.
|
||||
|
||||
sample_weight : array-like of shape(n_samples,) default=None
|
||||
Weights of training data.
|
||||
"""
|
||||
|
||||
|
||||
class LeastSquares(BaseLoss):
|
||||
"""Least squares loss, for regression.
|
||||
|
||||
For a given sample x_i, least squares loss is defined as::
|
||||
|
||||
loss(x_i) = 0.5 * (y_true_i - raw_pred_i)**2
|
||||
|
||||
This actually computes the half least squares loss to simplify
|
||||
the computation of the gradients and get a unit hessian (and be consistent
|
||||
with what is done in LightGBM).
|
||||
"""
|
||||
|
||||
def __init__(self, sample_weight):
|
||||
# If sample weights are provided, the hessians and gradients
|
||||
# are multiplied by sample_weight, which means the hessians are
|
||||
# equal to sample weights.
|
||||
super().__init__(hessians_are_constant=sample_weight is None)
|
||||
|
||||
def pointwise_loss(self, y_true, raw_predictions):
|
||||
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
|
||||
# return a view.
|
||||
raw_predictions = raw_predictions.reshape(-1)
|
||||
loss = 0.5 * np.power(y_true - raw_predictions, 2)
|
||||
return loss
|
||||
|
||||
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
|
||||
return np.average(y_train, weights=sample_weight)
|
||||
|
||||
@staticmethod
|
||||
def inverse_link_function(raw_predictions):
|
||||
return raw_predictions
|
||||
|
||||
def update_gradients_and_hessians(self, gradients, hessians, y_true,
|
||||
raw_predictions, sample_weight):
|
||||
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
|
||||
# return a view.
|
||||
raw_predictions = raw_predictions.reshape(-1)
|
||||
gradients = gradients.reshape(-1)
|
||||
if sample_weight is None:
|
||||
_update_gradients_least_squares(gradients, y_true, raw_predictions)
|
||||
else:
|
||||
hessians = hessians.reshape(-1)
|
||||
_update_gradients_hessians_least_squares(gradients, hessians,
|
||||
y_true, raw_predictions,
|
||||
sample_weight)
|
||||
|
||||
|
||||
class LeastAbsoluteDeviation(BaseLoss):
|
||||
"""Least absolute deviation, for regression.
|
||||
|
||||
For a given sample x_i, the loss is defined as::
|
||||
|
||||
loss(x_i) = |y_true_i - raw_pred_i|
|
||||
"""
|
||||
|
||||
def __init__(self, sample_weight):
|
||||
# If sample weights are provided, the hessians and gradients
|
||||
# are multiplied by sample_weight, which means the hessians are
|
||||
# equal to sample weights.
|
||||
super().__init__(hessians_are_constant=sample_weight is None)
|
||||
|
||||
# This variable indicates whether the loss requires the leaves values to
|
||||
# be updated once the tree has been trained. The trees are trained to
|
||||
# predict a Newton-Raphson step (see grower._finalize_leaf()). But for
|
||||
# some losses (e.g. least absolute deviation) we need to adjust the tree
|
||||
# values to account for the "line search" of the gradient descent
|
||||
# procedure. See the original paper Greedy Function Approximation: A
|
||||
# Gradient Boosting Machine by Friedman
|
||||
# (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory.
|
||||
need_update_leaves_values = True
|
||||
|
||||
def pointwise_loss(self, y_true, raw_predictions):
|
||||
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
|
||||
# return a view.
|
||||
raw_predictions = raw_predictions.reshape(-1)
|
||||
loss = np.abs(y_true - raw_predictions)
|
||||
return loss
|
||||
|
||||
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
|
||||
if sample_weight is None:
|
||||
return np.median(y_train)
|
||||
else:
|
||||
return _weighted_percentile(y_train, sample_weight, 50)
|
||||
|
||||
@staticmethod
|
||||
def inverse_link_function(raw_predictions):
|
||||
return raw_predictions
|
||||
|
||||
def update_gradients_and_hessians(self, gradients, hessians, y_true,
|
||||
raw_predictions, sample_weight):
|
||||
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
|
||||
# return a view.
|
||||
raw_predictions = raw_predictions.reshape(-1)
|
||||
gradients = gradients.reshape(-1)
|
||||
if sample_weight is None:
|
||||
_update_gradients_least_absolute_deviation(gradients, y_true,
|
||||
raw_predictions)
|
||||
else:
|
||||
hessians = hessians.reshape(-1)
|
||||
_update_gradients_hessians_least_absolute_deviation(
|
||||
gradients, hessians, y_true, raw_predictions, sample_weight)
|
||||
|
||||
def update_leaves_values(self, grower, y_true, raw_predictions,
|
||||
sample_weight):
|
||||
# Update the values predicted by the tree with
|
||||
# median(y_true - raw_predictions).
|
||||
# See note about need_update_leaves_values in BaseLoss.
|
||||
|
||||
# TODO: ideally this should be computed in parallel over the leaves
|
||||
# using something similar to _update_raw_predictions(), but this
|
||||
# requires a cython version of median()
|
||||
for leaf in grower.finalized_leaves:
|
||||
indices = leaf.sample_indices
|
||||
if sample_weight is None:
|
||||
median_res = np.median(y_true[indices]
|
||||
- raw_predictions[indices])
|
||||
else:
|
||||
median_res = _weighted_percentile(y_true[indices]
|
||||
- raw_predictions[indices],
|
||||
sample_weight=sample_weight,
|
||||
percentile=50)
|
||||
leaf.value = grower.shrinkage * median_res
|
||||
# Note that the regularization is ignored here
|
||||
|
||||
|
||||
class Poisson(BaseLoss):
|
||||
"""Poisson deviance loss with log-link, for regression.
|
||||
|
||||
For a given sample x_i, Poisson deviance loss is defined as::
|
||||
|
||||
loss(x_i) = y_true_i * log(y_true_i/exp(raw_pred_i))
|
||||
- y_true_i + exp(raw_pred_i))
|
||||
|
||||
This actually computes half the Poisson deviance to simplify
|
||||
the computation of the gradients.
|
||||
"""
|
||||
|
||||
def __init__(self, sample_weight):
|
||||
super().__init__(hessians_are_constant=False)
|
||||
|
||||
inverse_link_function = staticmethod(np.exp)
|
||||
|
||||
def pointwise_loss(self, y_true, raw_predictions):
|
||||
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
|
||||
# return a view.
|
||||
raw_predictions = raw_predictions.reshape(-1)
|
||||
# TODO: For speed, we could remove the constant xlogy(y_true, y_true)
|
||||
# Advantage of this form: minimum of zero at raw_predictions = y_true.
|
||||
loss = (xlogy(y_true, y_true) - y_true * (raw_predictions + 1)
|
||||
+ np.exp(raw_predictions))
|
||||
return loss
|
||||
|
||||
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
|
||||
y_pred = np.average(y_train, weights=sample_weight)
|
||||
eps = np.finfo(y_train.dtype).eps
|
||||
y_pred = np.clip(y_pred, eps, None)
|
||||
return np.log(y_pred)
|
||||
|
||||
def update_gradients_and_hessians(self, gradients, hessians, y_true,
|
||||
raw_predictions, sample_weight):
|
||||
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
|
||||
# return a view.
|
||||
raw_predictions = raw_predictions.reshape(-1)
|
||||
gradients = gradients.reshape(-1)
|
||||
hessians = hessians.reshape(-1)
|
||||
_update_gradients_hessians_poisson(gradients, hessians,
|
||||
y_true, raw_predictions,
|
||||
sample_weight)
|
||||
|
||||
|
||||
class BinaryCrossEntropy(BaseLoss):
|
||||
"""Binary cross-entropy loss, for binary classification.
|
||||
|
||||
For a given sample x_i, the binary cross-entropy loss is defined as the
|
||||
negative log-likelihood of the model which can be expressed as::
|
||||
|
||||
loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i
|
||||
|
||||
See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,
|
||||
section 4.4.1 (about logistic regression).
|
||||
"""
|
||||
|
||||
def __init__(self, sample_weight):
|
||||
super().__init__(hessians_are_constant=False)
|
||||
|
||||
inverse_link_function = staticmethod(expit)
|
||||
|
||||
def pointwise_loss(self, y_true, raw_predictions):
|
||||
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
|
||||
# return a view.
|
||||
raw_predictions = raw_predictions.reshape(-1)
|
||||
# logaddexp(0, x) = log(1 + exp(x))
|
||||
loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions
|
||||
return loss
|
||||
|
||||
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
|
||||
if prediction_dim > 2:
|
||||
raise ValueError(
|
||||
"loss='binary_crossentropy' is not defined for multiclass"
|
||||
" classification with n_classes=%d, use"
|
||||
" loss='categorical_crossentropy' instead" % prediction_dim)
|
||||
proba_positive_class = np.average(y_train, weights=sample_weight)
|
||||
eps = np.finfo(y_train.dtype).eps
|
||||
proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps)
|
||||
# log(x / 1 - x) is the anti function of sigmoid, or the link function
|
||||
# of the Binomial model.
|
||||
return np.log(proba_positive_class / (1 - proba_positive_class))
|
||||
|
||||
def update_gradients_and_hessians(self, gradients, hessians, y_true,
|
||||
raw_predictions, sample_weight):
|
||||
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
|
||||
# return a view.
|
||||
raw_predictions = raw_predictions.reshape(-1)
|
||||
gradients = gradients.reshape(-1)
|
||||
hessians = hessians.reshape(-1)
|
||||
_update_gradients_hessians_binary_crossentropy(
|
||||
gradients, hessians, y_true, raw_predictions, sample_weight)
|
||||
|
||||
def predict_proba(self, raw_predictions):
|
||||
# shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
|
||||
# return a view.
|
||||
raw_predictions = raw_predictions.reshape(-1)
|
||||
proba = np.empty((raw_predictions.shape[0], 2), dtype=Y_DTYPE)
|
||||
proba[:, 1] = expit(raw_predictions)
|
||||
proba[:, 0] = 1 - proba[:, 1]
|
||||
return proba
|
||||
|
||||
|
||||
class CategoricalCrossEntropy(BaseLoss):
|
||||
"""Categorical cross-entropy loss, for multiclass classification.
|
||||
|
||||
For a given sample x_i, the categorical cross-entropy loss is defined as
|
||||
the negative log-likelihood of the model and generalizes the binary
|
||||
cross-entropy to more than 2 classes.
|
||||
"""
|
||||
|
||||
def __init__(self, sample_weight):
|
||||
super().__init__(hessians_are_constant=False)
|
||||
|
||||
def pointwise_loss(self, y_true, raw_predictions):
|
||||
one_hot_true = np.zeros_like(raw_predictions)
|
||||
prediction_dim = raw_predictions.shape[0]
|
||||
for k in range(prediction_dim):
|
||||
one_hot_true[k, :] = (y_true == k)
|
||||
|
||||
loss = (logsumexp(raw_predictions, axis=0) -
|
||||
(one_hot_true * raw_predictions).sum(axis=0))
|
||||
return loss
|
||||
|
||||
def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
|
||||
init_value = np.zeros(shape=(prediction_dim, 1), dtype=Y_DTYPE)
|
||||
eps = np.finfo(y_train.dtype).eps
|
||||
for k in range(prediction_dim):
|
||||
proba_kth_class = np.average(y_train == k,
|
||||
weights=sample_weight)
|
||||
proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps)
|
||||
init_value[k, :] += np.log(proba_kth_class)
|
||||
|
||||
return init_value
|
||||
|
||||
def update_gradients_and_hessians(self, gradients, hessians, y_true,
|
||||
raw_predictions, sample_weight):
|
||||
_update_gradients_hessians_categorical_crossentropy(
|
||||
gradients, hessians, y_true, raw_predictions, sample_weight)
|
||||
|
||||
def predict_proba(self, raw_predictions):
|
||||
# TODO: This could be done in parallel
|
||||
# compute softmax (using exp(log(softmax)))
|
||||
proba = np.exp(raw_predictions -
|
||||
logsumexp(raw_predictions, axis=0)[np.newaxis, :])
|
||||
return proba.T
|
||||
|
||||
|
||||
_LOSSES = {
|
||||
'least_squares': LeastSquares,
|
||||
'least_absolute_deviation': LeastAbsoluteDeviation,
|
||||
'binary_crossentropy': BinaryCrossEntropy,
|
||||
'categorical_crossentropy': CategoricalCrossEntropy,
|
||||
'poisson': Poisson,
|
||||
}
|
|
@ -0,0 +1,86 @@
|
|||
"""
|
||||
This module contains the TreePredictor class which is used for prediction.
|
||||
"""
|
||||
# Author: Nicolas Hug
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .common import Y_DTYPE
|
||||
from ._predictor import _predict_from_numeric_data
|
||||
from ._predictor import _predict_from_binned_data
|
||||
from ._predictor import _compute_partial_dependence
|
||||
|
||||
|
||||
class TreePredictor:
|
||||
"""Tree class used for predictions.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
nodes : ndarray of PREDICTOR_RECORD_DTYPE
|
||||
The nodes of the tree.
|
||||
"""
|
||||
def __init__(self, nodes):
|
||||
self.nodes = nodes
|
||||
|
||||
def get_n_leaf_nodes(self):
|
||||
"""Return number of leaves."""
|
||||
return int(self.nodes['is_leaf'].sum())
|
||||
|
||||
def get_max_depth(self):
|
||||
"""Return maximum depth among all leaves."""
|
||||
return int(self.nodes['depth'].max())
|
||||
|
||||
def predict(self, X):
|
||||
"""Predict raw values for non-binned data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : ndarray, shape (n_samples, n_features)
|
||||
The input samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
y : ndarray, shape (n_samples,)
|
||||
The raw predicted values.
|
||||
"""
|
||||
out = np.empty(X.shape[0], dtype=Y_DTYPE)
|
||||
_predict_from_numeric_data(self.nodes, X, out)
|
||||
return out
|
||||
|
||||
def predict_binned(self, X, missing_values_bin_idx):
|
||||
"""Predict raw values for binned data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : ndarray, shape (n_samples, n_features)
|
||||
The input samples.
|
||||
missing_values_bin_idx : uint8
|
||||
Index of the bin that is used for missing values. This is the
|
||||
index of the last bin and is always equal to max_bins (as passed
|
||||
to the GBDT classes), or equivalently to n_bins - 1.
|
||||
|
||||
Returns
|
||||
-------
|
||||
y : ndarray, shape (n_samples,)
|
||||
The raw predicted values.
|
||||
"""
|
||||
out = np.empty(X.shape[0], dtype=Y_DTYPE)
|
||||
_predict_from_binned_data(self.nodes, X, missing_values_bin_idx, out)
|
||||
return out
|
||||
|
||||
def compute_partial_dependence(self, grid, target_features, out):
|
||||
"""Fast partial dependence computation.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
grid : ndarray, shape (n_samples, n_target_features)
|
||||
The grid points on which the partial dependence should be
|
||||
evaluated.
|
||||
target_features : ndarray, shape (n_target_features)
|
||||
The set of target features for which the partial dependence
|
||||
should be evaluated.
|
||||
out : ndarray, shape (n_samples)
|
||||
The value of the partial dependence function on each grid
|
||||
point.
|
||||
"""
|
||||
_compute_partial_dependence(self.nodes, grid, target_features, out)
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,314 @@
|
|||
import numpy as np
|
||||
from numpy.testing import assert_array_equal, assert_allclose
|
||||
import pytest
|
||||
|
||||
from sklearn.ensemble._hist_gradient_boosting.binning import (
|
||||
_BinMapper,
|
||||
_find_binning_thresholds as _find_binning_thresholds_orig,
|
||||
_map_to_bins
|
||||
)
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import ALMOST_INF
|
||||
|
||||
|
||||
DATA = np.random.RandomState(42).normal(
|
||||
loc=[0, 10], scale=[1, 0.01], size=(int(1e6), 2)
|
||||
).astype(X_DTYPE)
|
||||
|
||||
|
||||
def _find_binning_thresholds(data, max_bins=255, subsample=int(2e5),
|
||||
random_state=None):
|
||||
# Just a redef to avoid having to pass arguments all the time (as the
|
||||
# function is private we don't use default values for parameters)
|
||||
return _find_binning_thresholds_orig(data, max_bins, subsample,
|
||||
random_state)
|
||||
|
||||
|
||||
def test_find_binning_thresholds_regular_data():
|
||||
data = np.linspace(0, 10, 1001).reshape(-1, 1)
|
||||
bin_thresholds = _find_binning_thresholds(data, max_bins=10)
|
||||
assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9])
|
||||
assert len(bin_thresholds) == 1
|
||||
|
||||
bin_thresholds = _find_binning_thresholds(data, max_bins=5)
|
||||
assert_allclose(bin_thresholds[0], [2, 4, 6, 8])
|
||||
assert len(bin_thresholds) == 1
|
||||
|
||||
|
||||
def test_find_binning_thresholds_small_regular_data():
|
||||
data = np.linspace(0, 10, 11).reshape(-1, 1)
|
||||
|
||||
bin_thresholds = _find_binning_thresholds(data, max_bins=5)
|
||||
assert_allclose(bin_thresholds[0], [2, 4, 6, 8])
|
||||
|
||||
bin_thresholds = _find_binning_thresholds(data, max_bins=10)
|
||||
assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9])
|
||||
|
||||
bin_thresholds = _find_binning_thresholds(data, max_bins=11)
|
||||
assert_allclose(bin_thresholds[0], np.arange(10) + .5)
|
||||
|
||||
bin_thresholds = _find_binning_thresholds(data, max_bins=255)
|
||||
assert_allclose(bin_thresholds[0], np.arange(10) + .5)
|
||||
|
||||
|
||||
def test_find_binning_thresholds_random_data():
|
||||
bin_thresholds = _find_binning_thresholds(DATA, max_bins=255,
|
||||
random_state=0)
|
||||
assert len(bin_thresholds) == 2
|
||||
for i in range(len(bin_thresholds)):
|
||||
assert bin_thresholds[i].shape == (254,) # 255 - 1
|
||||
assert bin_thresholds[i].dtype == DATA.dtype
|
||||
|
||||
assert_allclose(bin_thresholds[0][[64, 128, 192]],
|
||||
np.array([-0.7, 0.0, 0.7]), atol=1e-1)
|
||||
|
||||
assert_allclose(bin_thresholds[1][[64, 128, 192]],
|
||||
np.array([9.99, 10.00, 10.01]), atol=1e-2)
|
||||
|
||||
|
||||
def test_find_binning_thresholds_low_n_bins():
|
||||
bin_thresholds = _find_binning_thresholds(DATA, max_bins=128,
|
||||
random_state=0)
|
||||
assert len(bin_thresholds) == 2
|
||||
for i in range(len(bin_thresholds)):
|
||||
assert bin_thresholds[i].shape == (127,) # 128 - 1
|
||||
assert bin_thresholds[i].dtype == DATA.dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize('n_bins', (2, 257))
|
||||
def test_invalid_n_bins(n_bins):
|
||||
err_msg = (
|
||||
'n_bins={} should be no smaller than 3 and no larger than 256'
|
||||
.format(n_bins))
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_BinMapper(n_bins=n_bins).fit(DATA)
|
||||
|
||||
|
||||
def test_bin_mapper_n_features_transform():
|
||||
mapper = _BinMapper(n_bins=42, random_state=42).fit(DATA)
|
||||
err_msg = 'This estimator was fitted with 2 features but 4 got passed'
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
mapper.transform(np.repeat(DATA, 2, axis=1))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('max_bins', [16, 128, 255])
|
||||
def test_map_to_bins(max_bins):
|
||||
bin_thresholds = _find_binning_thresholds(DATA, max_bins=max_bins,
|
||||
random_state=0)
|
||||
binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order='F')
|
||||
last_bin_idx = max_bins
|
||||
_map_to_bins(DATA, bin_thresholds, last_bin_idx, binned)
|
||||
assert binned.shape == DATA.shape
|
||||
assert binned.dtype == np.uint8
|
||||
assert binned.flags.f_contiguous
|
||||
|
||||
min_indices = DATA.argmin(axis=0)
|
||||
max_indices = DATA.argmax(axis=0)
|
||||
|
||||
for feature_idx, min_idx in enumerate(min_indices):
|
||||
assert binned[min_idx, feature_idx] == 0
|
||||
for feature_idx, max_idx in enumerate(max_indices):
|
||||
assert binned[max_idx, feature_idx] == max_bins - 1
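
# Illustrative sketch, not part of the original test suite: ignoring missing
# values, the mapping exercised above is essentially a binary search of each
# value into its feature's thresholds; a value goes to the first bin whose
# threshold is >= the value. The helper below is only an approximation of the
# private Cython routine and reuses the `np` import from the top of this file.
def naive_map_to_bins_column(col, thresholds):
    # `thresholds` is the increasing 1d threshold array for a single feature
    return np.searchsorted(thresholds, col, side='left').astype(np.uint8)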
|
||||
|
||||
|
||||
@pytest.mark.parametrize("max_bins", [5, 10, 42])
|
||||
def test_bin_mapper_random_data(max_bins):
|
||||
n_samples, n_features = DATA.shape
|
||||
|
||||
expected_count_per_bin = n_samples // max_bins
|
||||
tol = int(0.05 * expected_count_per_bin)
|
||||
|
||||
# max_bins is the number of bins for non-missing values
|
||||
n_bins = max_bins + 1
|
||||
mapper = _BinMapper(n_bins=n_bins, random_state=42).fit(DATA)
|
||||
binned = mapper.transform(DATA)
|
||||
|
||||
assert binned.shape == (n_samples, n_features)
|
||||
assert binned.dtype == np.uint8
|
||||
assert_array_equal(binned.min(axis=0), np.array([0, 0]))
|
||||
assert_array_equal(binned.max(axis=0),
|
||||
np.array([max_bins - 1, max_bins - 1]))
|
||||
assert len(mapper.bin_thresholds_) == n_features
|
||||
for bin_thresholds_feature in mapper.bin_thresholds_:
|
||||
assert bin_thresholds_feature.shape == (max_bins - 1,)
|
||||
assert bin_thresholds_feature.dtype == DATA.dtype
|
||||
assert np.all(mapper.n_bins_non_missing_ == max_bins)
|
||||
|
||||
# Check that the binned data is approximately balanced across bins.
|
||||
for feature_idx in range(n_features):
|
||||
for bin_idx in range(max_bins):
|
||||
count = (binned[:, feature_idx] == bin_idx).sum()
|
||||
assert abs(count - expected_count_per_bin) < tol
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_samples, max_bins", [
|
||||
(5, 5),
|
||||
(5, 10),
|
||||
(5, 11),
|
||||
(42, 255)
|
||||
])
|
||||
def test_bin_mapper_small_random_data(n_samples, max_bins):
|
||||
data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1)
|
||||
assert len(np.unique(data)) == n_samples
|
||||
|
||||
# max_bins is the number of bins for non-missing values
|
||||
n_bins = max_bins + 1
|
||||
mapper = _BinMapper(n_bins=n_bins, random_state=42)
|
||||
binned = mapper.fit_transform(data)
|
||||
|
||||
assert binned.shape == data.shape
|
||||
assert binned.dtype == np.uint8
|
||||
assert_array_equal(binned.ravel()[np.argsort(data.ravel())],
|
||||
np.arange(n_samples))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("max_bins, n_distinct, multiplier", [
|
||||
(5, 5, 1),
|
||||
(5, 5, 3),
|
||||
(255, 12, 42),
|
||||
])
|
||||
def test_bin_mapper_identity_repeated_values(max_bins, n_distinct, multiplier):
|
||||
data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1)
|
||||
# max_bins is the number of bins for non-missing values
|
||||
n_bins = max_bins + 1
|
||||
binned = _BinMapper(n_bins=n_bins).fit_transform(data)
|
||||
assert_array_equal(data, binned)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('n_distinct', [2, 7, 42])
|
||||
def test_bin_mapper_repeated_values_invariance(n_distinct):
|
||||
rng = np.random.RandomState(42)
|
||||
distinct_values = rng.normal(size=n_distinct)
|
||||
assert len(np.unique(distinct_values)) == n_distinct
|
||||
|
||||
repeated_indices = rng.randint(low=0, high=n_distinct, size=1000)
|
||||
data = distinct_values[repeated_indices]
|
||||
rng.shuffle(data)
|
||||
assert_array_equal(np.unique(data), np.sort(distinct_values))
|
||||
|
||||
data = data.reshape(-1, 1)
|
||||
|
||||
mapper_1 = _BinMapper(n_bins=n_distinct + 1)
|
||||
binned_1 = mapper_1.fit_transform(data)
|
||||
assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct))
|
||||
|
||||
# Adding more bins to the mapper yields the same results (same thresholds)
|
||||
mapper_2 = _BinMapper(n_bins=min(256, n_distinct * 3) + 1)
|
||||
binned_2 = mapper_2.fit_transform(data)
|
||||
|
||||
assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0])
|
||||
assert_array_equal(binned_1, binned_2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("max_bins, scale, offset", [
|
||||
(3, 2, -1),
|
||||
(42, 1, 0),
|
||||
(255, 0.3, 42),
|
||||
])
|
||||
def test_bin_mapper_identity_small(max_bins, scale, offset):
|
||||
data = np.arange(max_bins).reshape(-1, 1) * scale + offset
|
||||
# max_bins is the number of bins for non-missing values
|
||||
n_bins = max_bins + 1
|
||||
binned = _BinMapper(n_bins=n_bins).fit_transform(data)
|
||||
assert_array_equal(binned, np.arange(max_bins).reshape(-1, 1))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('max_bins_small, max_bins_large', [
|
||||
(2, 2),
|
||||
(3, 3),
|
||||
(4, 4),
|
||||
(42, 42),
|
||||
(255, 255),
|
||||
(5, 17),
|
||||
(42, 255),
|
||||
])
|
||||
def test_bin_mapper_idempotence(max_bins_small, max_bins_large):
|
||||
assert max_bins_large >= max_bins_small
|
||||
data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1)
|
||||
mapper_small = _BinMapper(n_bins=max_bins_small + 1)
|
||||
mapper_large = _BinMapper(n_bins=max_bins_small + 1)
|
||||
binned_small = mapper_small.fit_transform(data)
|
||||
binned_large = mapper_large.fit_transform(binned_small)
|
||||
assert_array_equal(binned_small, binned_large)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('n_bins', [10, 100, 256])
|
||||
@pytest.mark.parametrize('diff', [-5, 0, 5])
|
||||
def test_n_bins_non_missing(n_bins, diff):
|
||||
# Check that n_bins_non_missing is n_unique_values when
|
||||
# there are not a lot of unique values, else n_bins - 1.
|
||||
|
||||
n_unique_values = n_bins + diff
|
||||
X = list(range(n_unique_values)) * 2
|
||||
X = np.array(X).reshape(-1, 1)
|
||||
mapper = _BinMapper(n_bins=n_bins).fit(X)
|
||||
assert np.all(mapper.n_bins_non_missing_ == min(
|
||||
n_bins - 1, n_unique_values))
|
||||
|
||||
|
||||
def test_subsample():
|
||||
# Make sure bin thresholds are different when applying subsampling
|
||||
mapper_no_subsample = _BinMapper(subsample=None, random_state=0).fit(DATA)
|
||||
mapper_subsample = _BinMapper(subsample=256, random_state=0).fit(DATA)
|
||||
|
||||
for feature in range(DATA.shape[1]):
|
||||
assert not np.allclose(mapper_no_subsample.bin_thresholds_[feature],
|
||||
mapper_subsample.bin_thresholds_[feature],
|
||||
rtol=1e-4)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'n_bins, n_bins_non_missing, X_trans_expected', [
|
||||
(256, [4, 2, 2], [[0, 0, 0], # 255 <=> missing value
|
||||
[255, 255, 0],
|
||||
[1, 0, 0],
|
||||
[255, 1, 1],
|
||||
[2, 1, 1],
|
||||
[3, 0, 0]]),
|
||||
(3, [2, 2, 2], [[0, 0, 0], # 2 <=> missing value
|
||||
[2, 2, 0],
|
||||
[0, 0, 0],
|
||||
[2, 1, 1],
|
||||
[1, 1, 1],
|
||||
[1, 0, 0]])])
|
||||
def test_missing_values_support(n_bins, n_bins_non_missing, X_trans_expected):
|
||||
# check for missing values: make sure nans are mapped to the last bin
|
||||
# and that the _BinMapper attributes are correct
|
||||
|
||||
X = [[1, 1, 0],
|
||||
[np.NaN, np.NaN, 0],
|
||||
[2, 1, 0],
|
||||
[np.NaN, 2, 1],
|
||||
[3, 2, 1],
|
||||
[4, 1, 0]]
|
||||
|
||||
X = np.array(X)
|
||||
|
||||
mapper = _BinMapper(n_bins=n_bins)
|
||||
mapper.fit(X)
|
||||
|
||||
assert_array_equal(mapper.n_bins_non_missing_, n_bins_non_missing)
|
||||
|
||||
for feature_idx in range(X.shape[1]):
|
||||
assert len(mapper.bin_thresholds_[feature_idx]) == \
|
||||
n_bins_non_missing[feature_idx] - 1
|
||||
|
||||
assert mapper.missing_values_bin_idx_ == n_bins - 1
|
||||
|
||||
X_trans = mapper.transform(X)
|
||||
assert_array_equal(X_trans, X_trans_expected)
|
||||
|
||||
|
||||
def test_infinite_values():
|
||||
# Make sure infinite values are properly handled.
|
||||
bin_mapper = _BinMapper()
|
||||
|
||||
X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1)
|
||||
|
||||
bin_mapper.fit(X)
|
||||
assert_allclose(bin_mapper.bin_thresholds_[0], [-np.inf, .5, ALMOST_INF])
|
||||
assert bin_mapper.n_bins_non_missing_ == [4]
|
||||
|
||||
expected_binned_X = np.array([0, 1, 2, 3]).reshape(-1, 1)
|
||||
assert_array_equal(bin_mapper.transform(X), expected_binned_X)
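
# Illustrative sketch, not part of the original test suite: the thresholds
# checked above can be approximated with plain numpy. For a feature with few
# distinct values the thresholds are midpoints between consecutive unique
# values; otherwise they come from midpoint-interpolated percentiles. This
# helper is only an approximation of the private sklearn function and reuses
# the `np` import from the top of this file.
def naive_binning_thresholds(col, max_bins=255):
    col = col[~np.isnan(col)]  # missing values are ignored
    distinct = np.unique(col)
    if len(distinct) <= max_bins:
        return (distinct[:-1] + distinct[1:]) * 0.5
    percentiles = np.linspace(0, 100, num=max_bins + 1)[1:-1]
    return np.percentile(col, percentiles, interpolation='midpoint')

# Example: naive_binning_thresholds(np.linspace(0, 10, 1001), max_bins=10)
# gives approximately [1, 2, ..., 9], matching the first test above.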
|
|
@ -0,0 +1,223 @@
|
|||
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification, make_regression
import numpy as np
import pytest

# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.utils import (
    get_equivalent_estimator)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('seed', range(5))
|
||||
@pytest.mark.parametrize('min_samples_leaf', (1, 20))
|
||||
@pytest.mark.parametrize('n_samples, max_leaf_nodes', [
|
||||
(255, 4096),
|
||||
(1000, 8),
|
||||
])
|
||||
def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
|
||||
max_leaf_nodes):
|
||||
# Make sure sklearn has the same predictions as lightgbm for easy targets.
|
||||
#
|
||||
# In particular when the size of the trees are bound and the number of
|
||||
# samples is large enough, the structure of the prediction trees found by
|
||||
# LightGBM and sklearn should be exactly identical.
|
||||
#
|
||||
# Notes:
|
||||
# - Several candidate splits may have equal gains when the number of
|
||||
# samples in a node is low (and because of float errors). Therefore the
|
||||
# predictions on the test set might differ if the structure of the tree
|
||||
# is not exactly the same. To avoid this issue we only compare the
|
||||
# predictions on the test set when the number of samples is large enough
|
||||
# and max_leaf_nodes is low enough.
|
||||
# - To ignore discrepancies caused by small differences in the binning
#   strategy, data is pre-binned if n_samples > 255.
|
||||
# - We don't check the least_absolute_deviation loss here. This is because
|
||||
# LightGBM's computation of the median (used for the initial value of
|
||||
# raw_prediction) is a bit off (e.g. they'll return midpoints when there
# is no need to). Since these tests only run 1 iteration, the
|
||||
# discrepancy between the initial values leads to biggish differences in
|
||||
# the predictions. These differences are much smaller with more
|
||||
# iterations.
|
||||
pytest.importorskip("lightgbm")
|
||||
|
||||
rng = np.random.RandomState(seed=seed)
|
||||
n_samples = n_samples
|
||||
max_iter = 1
|
||||
max_bins = 255
|
||||
|
||||
X, y = make_regression(n_samples=n_samples, n_features=5,
|
||||
n_informative=5, random_state=0)
|
||||
|
||||
if n_samples > 255:
|
||||
# bin data and convert it to float32 so that the estimator doesn't
|
||||
# treat it as pre-binned
|
||||
X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
|
||||
|
||||
est_sklearn = HistGradientBoostingRegressor(
|
||||
max_iter=max_iter,
|
||||
max_bins=max_bins,
|
||||
learning_rate=1,
|
||||
early_stopping=False,
|
||||
min_samples_leaf=min_samples_leaf,
|
||||
max_leaf_nodes=max_leaf_nodes)
|
||||
est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')
|
||||
|
||||
est_lightgbm.fit(X_train, y_train)
|
||||
est_sklearn.fit(X_train, y_train)
|
||||
|
||||
# We need X to be treated as numerical data, not pre-binned data.
|
||||
X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)
|
||||
|
||||
pred_lightgbm = est_lightgbm.predict(X_train)
|
||||
pred_sklearn = est_sklearn.predict(X_train)
|
||||
# less than 1% of the predictions are different up to the 3rd decimal
|
||||
assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-3) < .011
|
||||
|
||||
if max_leaf_nodes < 10 and n_samples >= 1000:
|
||||
pred_lightgbm = est_lightgbm.predict(X_test)
|
||||
pred_sklearn = est_sklearn.predict(X_test)
|
||||
# less than 1% of the predictions are different up to the 4th decimal
|
||||
assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-4) < .01
|
||||
|
||||
|
||||
@pytest.mark.parametrize('seed', range(5))
|
||||
@pytest.mark.parametrize('min_samples_leaf', (1, 20))
|
||||
@pytest.mark.parametrize('n_samples, max_leaf_nodes', [
|
||||
(255, 4096),
|
||||
(1000, 8),
|
||||
])
|
||||
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
|
||||
max_leaf_nodes):
|
||||
# Same as test_same_predictions_regression but for classification
|
||||
pytest.importorskip("lightgbm")
|
||||
|
||||
rng = np.random.RandomState(seed=seed)
|
||||
n_samples = n_samples
|
||||
max_iter = 1
|
||||
max_bins = 255
|
||||
|
||||
X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
|
||||
n_informative=5, n_redundant=0, random_state=0)
|
||||
|
||||
if n_samples > 255:
|
||||
# bin data and convert it to float32 so that the estimator doesn't
|
||||
# treat it as pre-binned
|
||||
X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
|
||||
|
||||
est_sklearn = HistGradientBoostingClassifier(
|
||||
loss='binary_crossentropy',
|
||||
max_iter=max_iter,
|
||||
max_bins=max_bins,
|
||||
learning_rate=1,
|
||||
early_stopping=False,
|
||||
min_samples_leaf=min_samples_leaf,
|
||||
max_leaf_nodes=max_leaf_nodes)
|
||||
est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')
|
||||
|
||||
est_lightgbm.fit(X_train, y_train)
|
||||
est_sklearn.fit(X_train, y_train)
|
||||
|
||||
# We need X to be treated as numerical data, not pre-binned data.
|
||||
X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)
|
||||
|
||||
pred_lightgbm = est_lightgbm.predict(X_train)
|
||||
pred_sklearn = est_sklearn.predict(X_train)
|
||||
assert np.mean(pred_sklearn == pred_lightgbm) > .89
|
||||
|
||||
acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
|
||||
acc_sklearn = accuracy_score(y_train, pred_sklearn)
|
||||
np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)
|
||||
|
||||
if max_leaf_nodes < 10 and n_samples >= 1000:
|
||||
|
||||
pred_lightgbm = est_lightgbm.predict(X_test)
|
||||
pred_sklearn = est_sklearn.predict(X_test)
|
||||
assert np.mean(pred_sklearn == pred_lightgbm) > .89
|
||||
|
||||
acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
|
||||
acc_sklearn = accuracy_score(y_test, pred_sklearn)
|
||||
np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('seed', range(5))
|
||||
@pytest.mark.parametrize('min_samples_leaf', (1, 20))
|
||||
@pytest.mark.parametrize('n_samples, max_leaf_nodes', [
|
||||
(255, 4096),
|
||||
(10000, 8),
|
||||
])
|
||||
def test_same_predictions_multiclass_classification(
|
||||
seed, min_samples_leaf, n_samples, max_leaf_nodes):
|
||||
# Same as test_same_predictions_regression but for classification
|
||||
pytest.importorskip("lightgbm")
|
||||
|
||||
rng = np.random.RandomState(seed=seed)
|
||||
n_samples = n_samples
|
||||
max_iter = 1
|
||||
max_bins = 255
|
||||
lr = 1
|
||||
|
||||
X, y = make_classification(n_samples=n_samples, n_classes=3, n_features=5,
|
||||
n_informative=5, n_redundant=0,
|
||||
n_clusters_per_class=1, random_state=0)
|
||||
|
||||
if n_samples > 255:
|
||||
# bin data and convert it to float32 so that the estimator doesn't
|
||||
# treat it as pre-binned
|
||||
X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
|
||||
|
||||
est_sklearn = HistGradientBoostingClassifier(
|
||||
loss='categorical_crossentropy',
|
||||
max_iter=max_iter,
|
||||
max_bins=max_bins,
|
||||
learning_rate=lr,
|
||||
early_stopping=False,
|
||||
min_samples_leaf=min_samples_leaf,
|
||||
max_leaf_nodes=max_leaf_nodes)
|
||||
est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')
|
||||
|
||||
est_lightgbm.fit(X_train, y_train)
|
||||
est_sklearn.fit(X_train, y_train)
|
||||
|
||||
# We need X to be treated as numerical data, not pre-binned data.
|
||||
X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)
|
||||
|
||||
pred_lightgbm = est_lightgbm.predict(X_train)
|
||||
pred_sklearn = est_sklearn.predict(X_train)
|
||||
assert np.mean(pred_sklearn == pred_lightgbm) > .89
|
||||
|
||||
proba_lightgbm = est_lightgbm.predict_proba(X_train)
|
||||
proba_sklearn = est_sklearn.predict_proba(X_train)
|
||||
# assert more than 75% of the predicted probabilities are the same up to
|
||||
# the second decimal
|
||||
assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75
|
||||
|
||||
acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
|
||||
acc_sklearn = accuracy_score(y_train, pred_sklearn)
|
||||
np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
|
||||
|
||||
if max_leaf_nodes < 10 and n_samples >= 1000:
|
||||
|
||||
pred_lightgbm = est_lightgbm.predict(X_test)
|
||||
pred_sklearn = est_sklearn.predict(X_test)
|
||||
assert np.mean(pred_sklearn == pred_lightgbm) > .89
|
||||
|
||||
proba_lightgbm = est_lightgbm.predict_proba(X_test)
proba_sklearn = est_sklearn.predict_proba(X_test)
|
||||
# assert more than 75% of the predicted probabilities are the same up
|
||||
# to the second decimal
|
||||
assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75
|
||||
|
||||
acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
|
||||
acc_sklearn = accuracy_score(y_test, pred_sklearn)
|
||||
np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
|
|
@ -0,0 +1,746 @@
|
|||
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from sklearn.datasets import make_classification, make_regression
from sklearn.datasets import make_low_rank_matrix
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.base import clone, BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_poisson_deviance
from sklearn.dummy import DummyRegressor

# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES
from sklearn.ensemble._hist_gradient_boosting.loss import LeastSquares
from sklearn.ensemble._hist_gradient_boosting.loss import BinaryCrossEntropy
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.utils import shuffle


X_classification, y_classification = make_classification(random_state=0)
X_regression, y_regression = make_regression(random_state=0)


def _make_dumb_dataset(n_samples):
    """Make a dumb dataset to test early stopping."""
    rng = np.random.RandomState(42)
    X_dumb = rng.randn(n_samples, 1)
    y_dumb = (X_dumb[:, 0] > 0).astype('int64')
    return X_dumb, y_dumb
|
||||
|
||||
|
||||
@pytest.mark.parametrize('GradientBoosting, X, y', [
|
||||
(HistGradientBoostingClassifier, X_classification, y_classification),
|
||||
(HistGradientBoostingRegressor, X_regression, y_regression)
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
'params, err_msg',
|
||||
[({'loss': 'blah'}, 'Loss blah is not supported for'),
|
||||
({'learning_rate': 0}, 'learning_rate=0 must be strictly positive'),
|
||||
({'learning_rate': -1}, 'learning_rate=-1 must be strictly positive'),
|
||||
({'max_iter': 0}, 'max_iter=0 must not be smaller than 1'),
|
||||
({'max_leaf_nodes': 0}, 'max_leaf_nodes=0 should not be smaller than 2'),
|
||||
({'max_leaf_nodes': 1}, 'max_leaf_nodes=1 should not be smaller than 2'),
|
||||
({'max_depth': 0}, 'max_depth=0 should not be smaller than 1'),
|
||||
({'min_samples_leaf': 0}, 'min_samples_leaf=0 should not be smaller'),
|
||||
({'l2_regularization': -1}, 'l2_regularization=-1 must be positive'),
|
||||
({'max_bins': 1}, 'max_bins=1 should be no smaller than 2 and no larger'),
|
||||
({'max_bins': 256}, 'max_bins=256 should be no smaller than 2 and no'),
|
||||
({'n_iter_no_change': -1}, 'n_iter_no_change=-1 must be positive'),
|
||||
({'validation_fraction': -1}, 'validation_fraction=-1 must be strictly'),
|
||||
({'validation_fraction': 0}, 'validation_fraction=0 must be strictly'),
|
||||
({'tol': -1}, 'tol=-1 must not be smaller than 0')]
|
||||
)
|
||||
def test_init_parameters_validation(GradientBoosting, X, y, params, err_msg):
|
||||
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
GradientBoosting(**params).fit(X, y)
|
||||
|
||||
|
||||
def test_invalid_classification_loss():
|
||||
binary_clf = HistGradientBoostingClassifier(loss="binary_crossentropy")
|
||||
err_msg = ("loss='binary_crossentropy' is not defined for multiclass "
|
||||
"classification with n_classes=3, use "
|
||||
"loss='categorical_crossentropy' instead")
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
binary_clf.fit(np.zeros(shape=(3, 2)), np.arange(3))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'scoring, validation_fraction, early_stopping, n_iter_no_change, tol', [
|
||||
('neg_mean_squared_error', .1, True, 5, 1e-7), # use scorer
|
||||
('neg_mean_squared_error', None, True, 5, 1e-1), # use scorer on train
|
||||
(None, .1, True, 5, 1e-7), # same with default scorer
|
||||
(None, None, True, 5, 1e-1),
|
||||
('loss', .1, True, 5, 1e-7), # use loss
|
||||
('loss', None, True, 5, 1e-1), # use loss on training data
|
||||
(None, None, False, 5, None), # no early stopping
|
||||
])
|
||||
def test_early_stopping_regression(scoring, validation_fraction,
|
||||
early_stopping, n_iter_no_change, tol):
|
||||
|
||||
max_iter = 200
|
||||
|
||||
X, y = make_regression(n_samples=50, random_state=0)
|
||||
|
||||
gb = HistGradientBoostingRegressor(
|
||||
verbose=1, # just for coverage
|
||||
min_samples_leaf=5, # easier to overfit fast
|
||||
scoring=scoring,
|
||||
tol=tol,
|
||||
early_stopping=early_stopping,
|
||||
validation_fraction=validation_fraction,
|
||||
max_iter=max_iter,
|
||||
n_iter_no_change=n_iter_no_change,
|
||||
random_state=0
|
||||
)
|
||||
gb.fit(X, y)
|
||||
|
||||
if early_stopping:
|
||||
assert n_iter_no_change <= gb.n_iter_ < max_iter
|
||||
else:
|
||||
assert gb.n_iter_ == max_iter
|
||||
|
||||
|
||||
@pytest.mark.parametrize('data', (
|
||||
make_classification(n_samples=30, random_state=0),
|
||||
make_classification(n_samples=30, n_classes=3, n_clusters_per_class=1,
|
||||
random_state=0)
|
||||
))
|
||||
@pytest.mark.parametrize(
|
||||
'scoring, validation_fraction, early_stopping, n_iter_no_change, tol', [
|
||||
('accuracy', .1, True, 5, 1e-7), # use scorer
|
||||
('accuracy', None, True, 5, 1e-1), # use scorer on training data
|
||||
(None, .1, True, 5, 1e-7), # same with default scorer
|
||||
(None, None, True, 5, 1e-1),
|
||||
('loss', .1, True, 5, 1e-7), # use loss
|
||||
('loss', None, True, 5, 1e-1), # use loss on training data
|
||||
(None, None, False, 5, None), # no early stopping
|
||||
])
|
||||
def test_early_stopping_classification(data, scoring, validation_fraction,
|
||||
early_stopping, n_iter_no_change, tol):
|
||||
|
||||
max_iter = 50
|
||||
|
||||
X, y = data
|
||||
|
||||
gb = HistGradientBoostingClassifier(
|
||||
verbose=1, # just for coverage
|
||||
min_samples_leaf=5, # easier to overfit fast
|
||||
scoring=scoring,
|
||||
tol=tol,
|
||||
early_stopping=early_stopping,
|
||||
validation_fraction=validation_fraction,
|
||||
max_iter=max_iter,
|
||||
n_iter_no_change=n_iter_no_change,
|
||||
random_state=0
|
||||
)
|
||||
gb.fit(X, y)
|
||||
|
||||
if early_stopping is True:
|
||||
assert n_iter_no_change <= gb.n_iter_ < max_iter
|
||||
else:
|
||||
assert gb.n_iter_ == max_iter
|
||||
|
||||
|
||||
@pytest.mark.parametrize('GradientBoosting, X, y', [
|
||||
(HistGradientBoostingClassifier, *_make_dumb_dataset(10000)),
|
||||
(HistGradientBoostingClassifier, *_make_dumb_dataset(10001)),
|
||||
(HistGradientBoostingRegressor, *_make_dumb_dataset(10000)),
|
||||
(HistGradientBoostingRegressor, *_make_dumb_dataset(10001))
|
||||
])
|
||||
def test_early_stopping_default(GradientBoosting, X, y):
|
||||
# Test that early stopping is enabled by default if and only if there
|
||||
# are more than 10000 samples
|
||||
gb = GradientBoosting(max_iter=10, n_iter_no_change=2, tol=1e-1)
|
||||
gb.fit(X, y)
|
||||
if X.shape[0] > 10000:
|
||||
assert gb.n_iter_ < gb.max_iter
|
||||
else:
|
||||
assert gb.n_iter_ == gb.max_iter
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'scores, n_iter_no_change, tol, stopping',
|
||||
[
|
||||
([], 1, 0.001, False), # not enough iterations
|
||||
([1, 1, 1], 5, 0.001, False), # not enough iterations
|
||||
([1, 1, 1, 1, 1], 5, 0.001, False), # not enough iterations
|
||||
([1, 2, 3, 4, 5, 6], 5, 0.001, False), # significant improvement
|
||||
([1, 2, 3, 4, 5, 6], 5, 0., False), # significant improvement
|
||||
([1, 2, 3, 4, 5, 6], 5, 0.999, False), # significant improvement
|
||||
([1, 2, 3, 4, 5, 6], 5, 5 - 1e-5, False), # significant improvement
|
||||
([1] * 6, 5, 0., True), # no significant improvement
|
||||
([1] * 6, 5, 0.001, True), # no significant improvement
|
||||
([1] * 6, 5, 5, True), # no significant improvement
|
||||
]
|
||||
)
|
||||
def test_should_stop(scores, n_iter_no_change, tol, stopping):
|
||||
|
||||
gbdt = HistGradientBoostingClassifier(
|
||||
n_iter_no_change=n_iter_no_change, tol=tol
|
||||
)
|
||||
assert gbdt._should_stop(scores) == stopping
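
# Rough sketch, not the estimator's actual private implementation: the cases
# above encode the rule "stop when none of the last n_iter_no_change scores
# improves on the reference score (the one n_iter_no_change iterations ago)
# by more than tol", assuming higher scores are better.
def naive_should_stop(scores, n_iter_no_change, tol):
    if len(scores) < n_iter_no_change + 1:
        return False  # not enough iterations to decide
    reference = scores[-n_iter_no_change - 1]
    recent = scores[-n_iter_no_change:]
    return all(score <= reference + tol for score in recent)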
|
||||
|
||||
|
||||
def test_least_absolute_deviation():
|
||||
# For coverage only.
|
||||
X, y = make_regression(n_samples=500, random_state=0)
|
||||
gbdt = HistGradientBoostingRegressor(loss='least_absolute_deviation',
|
||||
random_state=0)
|
||||
gbdt.fit(X, y)
|
||||
assert gbdt.score(X, y) > .9
|
||||
|
||||
|
||||
@pytest.mark.parametrize('y', [([1., -2., 0.]), ([0., 0., 0.])])
|
||||
def test_poisson_y_positive(y):
|
||||
# Test that ValueError is raised if either one y_i < 0 or sum(y_i) <= 0.
|
||||
err_msg = r"loss='poisson' requires non-negative y and sum\(y\) > 0."
|
||||
gbdt = HistGradientBoostingRegressor(loss='poisson', random_state=0)
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
gbdt.fit(np.zeros(shape=(len(y), 1)), y)
|
||||
|
||||
|
||||
def test_poisson():
|
||||
# For Poisson distributed target, Poisson loss should give better results
|
||||
# than least squares measured in Poisson deviance as metric.
|
||||
rng = np.random.RandomState(42)
|
||||
n_train, n_test, n_features = 500, 100, 100
|
||||
X = make_low_rank_matrix(n_samples=n_train+n_test, n_features=n_features,
|
||||
random_state=rng)
|
||||
# We create a log-linear Poisson model and downscale coef as it will get
|
||||
# exponentiated.
|
||||
coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0)
|
||||
y = rng.poisson(lam=np.exp(X @ coef))
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test,
|
||||
random_state=rng)
|
||||
gbdt_pois = HistGradientBoostingRegressor(loss='poisson', random_state=rng)
|
||||
gbdt_ls = HistGradientBoostingRegressor(loss='least_squares',
|
||||
random_state=rng)
|
||||
gbdt_pois.fit(X_train, y_train)
|
||||
gbdt_ls.fit(X_train, y_train)
|
||||
dummy = DummyRegressor(strategy="mean").fit(X_train, y_train)
|
||||
|
||||
for X, y in [(X_train, y_train), (X_test, y_test)]:
|
||||
metric_pois = mean_poisson_deviance(y, gbdt_pois.predict(X))
|
||||
# least_squares might produce non-positive predictions => clip
|
||||
metric_ls = mean_poisson_deviance(y, np.clip(gbdt_ls.predict(X), 1e-15,
|
||||
None))
|
||||
metric_dummy = mean_poisson_deviance(y, dummy.predict(X))
|
||||
assert metric_pois < metric_ls
|
||||
assert metric_pois < metric_dummy
|
||||
|
||||
|
||||
def test_binning_train_validation_are_separated():
|
||||
# Make sure training and validation data are binned separately.
|
||||
# See issue 13926
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
validation_fraction = .2
|
||||
gb = HistGradientBoostingClassifier(
|
||||
early_stopping=True,
|
||||
validation_fraction=validation_fraction,
|
||||
random_state=rng
|
||||
)
|
||||
gb.fit(X_classification, y_classification)
|
||||
mapper_training_data = gb.bin_mapper_
|
||||
|
||||
# Note that since the data is small there is no subsampling and the
|
||||
# random_state doesn't matter
|
||||
mapper_whole_data = _BinMapper(random_state=0)
|
||||
mapper_whole_data.fit(X_classification)
|
||||
|
||||
n_samples = X_classification.shape[0]
|
||||
assert np.all(mapper_training_data.n_bins_non_missing_ ==
|
||||
int((1 - validation_fraction) * n_samples))
|
||||
assert np.all(mapper_training_data.n_bins_non_missing_ !=
|
||||
mapper_whole_data.n_bins_non_missing_)
|
||||
|
||||
|
||||
def test_missing_values_trivial():
|
||||
# sanity check for missing values support. With only one feature and
|
||||
# y == isnan(X), the gbdt is supposed to reach perfect accuracy on the
|
||||
# training set.
|
||||
|
||||
n_samples = 100
|
||||
n_features = 1
|
||||
rng = np.random.RandomState(0)
|
||||
|
||||
X = rng.normal(size=(n_samples, n_features))
|
||||
mask = rng.binomial(1, .5, size=X.shape).astype(np.bool)
|
||||
X[mask] = np.nan
|
||||
y = mask.ravel()
|
||||
gb = HistGradientBoostingClassifier()
|
||||
gb.fit(X, y)
|
||||
|
||||
assert gb.score(X, y) == pytest.approx(1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('problem', ('classification', 'regression'))
|
||||
@pytest.mark.parametrize(
|
||||
'missing_proportion, expected_min_score_classification, '
|
||||
'expected_min_score_regression', [
|
||||
(.1, .97, .89),
|
||||
(.2, .93, .81),
|
||||
(.5, .79, .52)])
|
||||
def test_missing_values_resilience(problem, missing_proportion,
|
||||
expected_min_score_classification,
|
||||
expected_min_score_regression):
|
||||
# Make sure the estimators can deal with missing values and still yield
|
||||
# decent predictions
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
n_samples = 1000
|
||||
n_features = 2
|
||||
if problem == 'regression':
|
||||
X, y = make_regression(n_samples=n_samples, n_features=n_features,
|
||||
n_informative=n_features, random_state=rng)
|
||||
gb = HistGradientBoostingRegressor()
|
||||
expected_min_score = expected_min_score_regression
|
||||
else:
|
||||
X, y = make_classification(n_samples=n_samples, n_features=n_features,
|
||||
n_informative=n_features, n_redundant=0,
|
||||
n_repeated=0, random_state=rng)
|
||||
gb = HistGradientBoostingClassifier()
|
||||
expected_min_score = expected_min_score_classification
|
||||
|
||||
mask = rng.binomial(1, missing_proportion, size=X.shape).astype(np.bool)
|
||||
X[mask] = np.nan
|
||||
|
||||
gb.fit(X, y)
|
||||
|
||||
assert gb.score(X, y) > expected_min_score
|
||||
|
||||
|
||||
@pytest.mark.parametrize('data', [
|
||||
make_classification(random_state=0, n_classes=2),
|
||||
make_classification(random_state=0, n_classes=3, n_informative=3)
|
||||
], ids=['binary_crossentropy', 'categorical_crossentropy'])
|
||||
def test_zero_division_hessians(data):
|
||||
# non regression test for issue #14018
|
||||
# make sure we avoid zero division errors when computing the leaves values.
|
||||
|
||||
# If the learning rate is too high, the raw predictions are bad and will
|
||||
# saturate the softmax (or sigmoid in binary classif). This leads to
|
||||
# probabilities being exactly 0 or 1, gradients being constant, and
|
||||
# hessians being zero.
|
||||
X, y = data
|
||||
gb = HistGradientBoostingClassifier(learning_rate=100, max_iter=10)
|
||||
gb.fit(X, y)
|
||||
|
||||
|
||||
def test_small_trainset():
|
||||
# Make sure that the small trainset is stratified and has the expected
|
||||
# length (10k samples)
|
||||
n_samples = 20000
|
||||
original_distrib = {0: 0.1, 1: 0.2, 2: 0.3, 3: 0.4}
|
||||
rng = np.random.RandomState(42)
|
||||
X = rng.randn(n_samples).reshape(n_samples, 1)
|
||||
y = [[class_] * int(prop * n_samples) for (class_, prop)
|
||||
in original_distrib.items()]
|
||||
y = shuffle(np.concatenate(y))
|
||||
gb = HistGradientBoostingClassifier()
|
||||
|
||||
# Compute the small training set
|
||||
X_small, y_small, _ = gb._get_small_trainset(X, y, seed=42,
|
||||
sample_weight_train=None)
|
||||
|
||||
# Compute the class distribution in the small training set
|
||||
unique, counts = np.unique(y_small, return_counts=True)
|
||||
small_distrib = {class_: count / 10000 for (class_, count)
|
||||
in zip(unique, counts)}
|
||||
|
||||
# Test that the small training set has the expected length
|
||||
assert X_small.shape[0] == 10000
|
||||
assert y_small.shape[0] == 10000
|
||||
|
||||
# Test that the class distributions in the whole dataset and in the small
|
||||
# training set are identical
|
||||
assert small_distrib == pytest.approx(original_distrib)
|
||||
|
||||
|
||||
def test_missing_values_minmax_imputation():
|
||||
# Compare the built-in missing value handling of Histogram GBC with an
# a priori missing value imputation strategy that should yield the same
# results in terms of decision function.
|
||||
#
|
||||
# Each feature (containing NaNs) is replaced by 2 features:
|
||||
# - one where the nans are replaced by min(feature) - 1
|
||||
# - one where the nans are replaced by max(feature) + 1
|
||||
# A split where nans go to the left has an equivalent split in the
|
||||
# first (min) feature, and a split where nans go to the right has an
|
||||
# equivalent split in the second (max) feature.
|
||||
#
|
||||
# Assuming the data is such that there is never a tie to select the best
|
||||
# feature to split on during training, the learned decision trees should be
|
||||
# strictly equivalent (learn a sequence of splits that encode the same
|
||||
# decision function).
|
||||
#
|
||||
# The MinMaxImputer transformer is meant to be a toy implementation of the
|
||||
# "Missing In Attributes" (MIA) missing value handling for decision trees
|
||||
# https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305
|
||||
# The implementation of MIA as an imputation transformer was suggested by
|
||||
# "Remark 3" in https://arxiv.org/abs/1902.06931
|
||||
|
||||
class MinMaxImputer(BaseEstimator, TransformerMixin):
|
||||
|
||||
def fit(self, X, y=None):
|
||||
mm = MinMaxScaler().fit(X)
|
||||
self.data_min_ = mm.data_min_
|
||||
self.data_max_ = mm.data_max_
|
||||
return self
|
||||
|
||||
def transform(self, X):
|
||||
X_min, X_max = X.copy(), X.copy()
|
||||
|
||||
for feature_idx in range(X.shape[1]):
|
||||
nan_mask = np.isnan(X[:, feature_idx])
|
||||
X_min[nan_mask, feature_idx] = self.data_min_[feature_idx] - 1
|
||||
X_max[nan_mask, feature_idx] = self.data_max_[feature_idx] + 1
|
||||
|
||||
return np.concatenate([X_min, X_max], axis=1)
|
||||
|
||||
def make_missing_value_data(n_samples=int(1e4), seed=0):
|
||||
rng = np.random.RandomState(seed)
|
||||
X, y = make_regression(n_samples=n_samples, n_features=4,
|
||||
random_state=rng)
|
||||
|
||||
# Pre-bin the data to ensure a deterministic handling by the 2
|
||||
# strategies and also make it easier to insert np.nan in a structured
|
||||
# way:
|
||||
X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X)
|
||||
|
||||
# First feature has missing values completely at random:
|
||||
rnd_mask = rng.rand(X.shape[0]) > 0.9
|
||||
X[rnd_mask, 0] = np.nan
|
||||
|
||||
# Second and third features have missing values for extreme values
|
||||
# (censoring missingness):
|
||||
low_mask = X[:, 1] == 0
|
||||
X[low_mask, 1] = np.nan
|
||||
|
||||
high_mask = X[:, 2] == X[:, 2].max()
|
||||
X[high_mask, 2] = np.nan
|
||||
|
||||
# Make the last feature nan pattern very informative:
|
||||
y_max = np.percentile(y, 70)
|
||||
y_max_mask = y >= y_max
|
||||
y[y_max_mask] = y_max
|
||||
X[y_max_mask, 3] = np.nan
|
||||
|
||||
# Check that there is at least one missing value in each feature:
|
||||
for feature_idx in range(X.shape[1]):
|
||||
assert any(np.isnan(X[:, feature_idx]))
|
||||
|
||||
# Let's use a test set to check that the learned decision function is
|
||||
# the same as evaluated on unseen data. Otherwise it could just be the
|
||||
# case that we find two independent ways to overfit the training set.
|
||||
return train_test_split(X, y, random_state=rng)
|
||||
|
||||
# n_samples needs to be large enough to minimize the likelihood of having
# several candidate splits with the same gain value in a given tree.
|
||||
X_train, X_test, y_train, y_test = make_missing_value_data(
|
||||
n_samples=int(1e4), seed=0)
|
||||
|
||||
# Use a small number of leaf nodes and iterations so that the models keep
# under-fitting, which minimizes the likelihood of ties when training the
# model.
|
||||
gbm1 = HistGradientBoostingRegressor(max_iter=100,
|
||||
max_leaf_nodes=5,
|
||||
random_state=0)
|
||||
gbm1.fit(X_train, y_train)
|
||||
|
||||
gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1))
|
||||
gbm2.fit(X_train, y_train)
|
||||
|
||||
# Check that the model reach the same score:
|
||||
assert gbm1.score(X_train, y_train) == \
|
||||
pytest.approx(gbm2.score(X_train, y_train))
|
||||
|
||||
assert gbm1.score(X_test, y_test) == \
|
||||
pytest.approx(gbm2.score(X_test, y_test))
|
||||
|
||||
# Check the individual prediction match as a finer grained
|
||||
# decision function check.
|
||||
assert_allclose(gbm1.predict(X_train), gbm2.predict(X_train))
|
||||
assert_allclose(gbm1.predict(X_test), gbm2.predict(X_test))
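
# Illustrative sketch, not part of the original test: on a single column, the
# MinMaxImputer used above amounts to producing two copies of the feature,
# one where NaNs become min - 1 and one where they become max + 1. The helper
# below reuses the `np` import from the top of this file.
def minmax_impute_column(col):
    finite = col[~np.isnan(col)]
    low = np.where(np.isnan(col), finite.min() - 1, col)
    high = np.where(np.isnan(col), finite.max() + 1, col)
    return np.column_stack([low, high])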
|
||||
|
||||
|
||||
def test_infinite_values():
|
||||
# Basic test for infinite values
|
||||
|
||||
X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1)
|
||||
y = np.array([0, 0, 1, 1])
|
||||
|
||||
gbdt = HistGradientBoostingRegressor(min_samples_leaf=1)
|
||||
gbdt.fit(X, y)
|
||||
np.testing.assert_allclose(gbdt.predict(X), y, atol=1e-4)
|
||||
|
||||
|
||||
def test_consistent_lengths():
|
||||
X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1)
|
||||
y = np.array([0, 0, 1, 1])
|
||||
sample_weight = np.array([.1, .3, .1])
|
||||
gbdt = HistGradientBoostingRegressor()
|
||||
with pytest.raises(ValueError,
|
||||
match=r"sample_weight.shape == \(3,\), expected"):
|
||||
gbdt.fit(X, y, sample_weight)
|
||||
|
||||
with pytest.raises(ValueError,
|
||||
match="Found input variables with inconsistent number"):
|
||||
gbdt.fit(X, y[1:])
|
||||
|
||||
|
||||
def test_infinite_values_missing_values():
|
||||
# High level test making sure that inf and nan values are properly handled
|
||||
# when both are present. This is similar to
|
||||
# test_split_on_nan_with_infinite_values() in test_grower.py, though we
|
||||
# cannot check the predictions for binned values here.
|
||||
|
||||
X = np.asarray([-np.inf, 0, 1, np.inf, np.nan]).reshape(-1, 1)
|
||||
y_isnan = np.isnan(X.ravel())
|
||||
y_isinf = X.ravel() == np.inf
|
||||
|
||||
stump_clf = HistGradientBoostingClassifier(min_samples_leaf=1, max_iter=1,
|
||||
learning_rate=1, max_depth=2)
|
||||
|
||||
assert stump_clf.fit(X, y_isinf).score(X, y_isinf) == 1
|
||||
assert stump_clf.fit(X, y_isnan).score(X, y_isnan) == 1
|
||||
|
||||
|
||||
def test_crossentropy_binary_problem():
|
||||
# categorical_crossentropy should only be used if there are more than two
|
||||
# classes present. PR #14869
|
||||
X = [[1], [0]]
|
||||
y = [0, 1]
|
||||
gbrt = HistGradientBoostingClassifier(loss='categorical_crossentropy')
|
||||
with pytest.raises(ValueError,
|
||||
match="'categorical_crossentropy' is not suitable for"):
|
||||
gbrt.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("scoring", [None, 'loss'])
|
||||
def test_string_target_early_stopping(scoring):
|
||||
# Regression tests for #14709 where the targets need to be encoded before
|
||||
# to compute the score
|
||||
rng = np.random.RandomState(42)
|
||||
X = rng.randn(100, 10)
|
||||
y = np.array(['x'] * 50 + ['y'] * 50, dtype=object)
|
||||
gbrt = HistGradientBoostingClassifier(n_iter_no_change=10, scoring=scoring)
|
||||
gbrt.fit(X, y)
|
||||
|
||||
|
||||
def test_zero_sample_weights_regression():
|
||||
# Make sure setting a SW to zero amounts to ignoring the corresponding
|
||||
# sample
|
||||
|
||||
X = [[1, 0],
|
||||
[1, 0],
|
||||
[1, 0],
|
||||
[0, 1]]
|
||||
y = [0, 0, 1, 0]
|
||||
# ignore the first 2 training samples by setting their weight to 0
|
||||
sample_weight = [0, 0, 1, 1]
|
||||
gb = HistGradientBoostingRegressor(min_samples_leaf=1)
|
||||
gb.fit(X, y, sample_weight=sample_weight)
|
||||
assert gb.predict([[1, 0]])[0] > 0.5
|
||||
|
||||
|
||||
def test_zero_sample_weights_classification():
|
||||
# Make sure setting a SW to zero amounts to ignoring the corresponding
|
||||
# sample
|
||||
|
||||
X = [[1, 0],
|
||||
[1, 0],
|
||||
[1, 0],
|
||||
[0, 1]]
|
||||
y = [0, 0, 1, 0]
|
||||
# ignore the first 2 training samples by setting their weight to 0
|
||||
sample_weight = [0, 0, 1, 1]
|
||||
gb = HistGradientBoostingClassifier(loss='binary_crossentropy',
|
||||
min_samples_leaf=1)
|
||||
gb.fit(X, y, sample_weight=sample_weight)
|
||||
assert_array_equal(gb.predict([[1, 0]]), [1])
|
||||
|
||||
X = [[1, 0],
|
||||
[1, 0],
|
||||
[1, 0],
|
||||
[0, 1],
|
||||
[1, 1]]
|
||||
y = [0, 0, 1, 0, 2]
|
||||
# ignore the first 2 training samples by setting their weight to 0
|
||||
sample_weight = [0, 0, 1, 1, 1]
|
||||
gb = HistGradientBoostingClassifier(loss='categorical_crossentropy',
|
||||
min_samples_leaf=1)
|
||||
gb.fit(X, y, sample_weight=sample_weight)
|
||||
assert_array_equal(gb.predict([[1, 0]]), [1])
|
||||
|
||||
|
||||
@pytest.mark.parametrize('problem', (
|
||||
'regression',
|
||||
'binary_classification',
|
||||
'multiclass_classification'
|
||||
))
|
||||
@pytest.mark.parametrize('duplication', ('half', 'all'))
|
||||
def test_sample_weight_effect(problem, duplication):
|
||||
# High level test to make sure that duplicating a sample is equivalent to
|
||||
# giving it weight of 2.
|
||||
|
||||
# This would fail for n_samples > 255 because binning does not take sample
# weights into account. Keeping n_samples <= 255 makes sure only unique
# values are used, so sample weights have no effect on binning.
|
||||
n_samples = 255
|
||||
n_features = 2
|
||||
if problem == 'regression':
|
||||
X, y = make_regression(n_samples=n_samples, n_features=n_features,
|
||||
n_informative=n_features, random_state=0)
|
||||
Klass = HistGradientBoostingRegressor
|
||||
else:
|
||||
n_classes = 2 if problem == 'binary_classification' else 3
|
||||
X, y = make_classification(n_samples=n_samples, n_features=n_features,
|
||||
n_informative=n_features, n_redundant=0,
|
||||
n_clusters_per_class=1,
|
||||
n_classes=n_classes, random_state=0)
|
||||
Klass = HistGradientBoostingClassifier
|
||||
|
||||
# This test can't pass if min_samples_leaf > 1 because that would force 2
|
||||
# samples to be in the same node in est_sw, while these samples would be
|
||||
# free to be separate in est_dup: est_dup would just group together the
|
||||
# duplicated samples.
|
||||
est = Klass(min_samples_leaf=1)
|
||||
|
||||
# Create dataset with duplicate and corresponding sample weights
|
||||
if duplication == 'half':
|
||||
lim = n_samples // 2
|
||||
else:
|
||||
lim = n_samples
|
||||
X_dup = np.r_[X, X[:lim]]
|
||||
y_dup = np.r_[y, y[:lim]]
|
||||
sample_weight = np.ones(shape=(n_samples))
|
||||
sample_weight[:lim] = 2
|
||||
|
||||
est_sw = clone(est).fit(X, y, sample_weight=sample_weight)
|
||||
est_dup = clone(est).fit(X_dup, y_dup)
|
||||
|
||||
# checking raw_predict is stricter than just predict for classification
|
||||
assert np.allclose(est_sw._raw_predict(X_dup),
|
||||
est_dup._raw_predict(X_dup))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('loss_name', ('least_squares',
|
||||
'least_absolute_deviation'))
|
||||
def test_sum_hessians_are_sample_weight(loss_name):
|
||||
# For losses with constant hessians, the sum_hessians field of the
|
||||
# histograms must be equal to the sum of the sample weight of samples at
|
||||
# the corresponding bin.
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
n_samples = 1000
|
||||
n_features = 2
|
||||
X, y = make_regression(n_samples=n_samples, n_features=n_features,
|
||||
random_state=rng)
|
||||
bin_mapper = _BinMapper()
|
||||
X_binned = bin_mapper.fit_transform(X)
|
||||
|
||||
sample_weight = rng.normal(size=n_samples)
|
||||
|
||||
loss = _LOSSES[loss_name](sample_weight=sample_weight)
|
||||
gradients, hessians = loss.init_gradients_and_hessians(
|
||||
n_samples=n_samples, prediction_dim=1, sample_weight=sample_weight)
|
||||
raw_predictions = rng.normal(size=(1, n_samples))
|
||||
loss.update_gradients_and_hessians(gradients, hessians, y,
|
||||
raw_predictions, sample_weight)
|
||||
|
||||
# build sum_sample_weight which contains the sum of the sample weights at
|
||||
# each bin (for each feature). This must be equal to the sum_hessians
|
||||
# field of the corresponding histogram
|
||||
sum_sw = np.zeros(shape=(n_features, bin_mapper.n_bins))
|
||||
for feature_idx in range(n_features):
|
||||
for sample_idx in range(n_samples):
|
||||
sum_sw[feature_idx, X_binned[sample_idx, feature_idx]] += (
|
||||
sample_weight[sample_idx])
|
||||
|
||||
# Build histogram
|
||||
grower = TreeGrower(X_binned, gradients[0], hessians[0],
|
||||
n_bins=bin_mapper.n_bins)
|
||||
histograms = grower.histogram_builder.compute_histograms_brute(
|
||||
grower.root.sample_indices)
|
||||
|
||||
for feature_idx in range(n_features):
|
||||
for bin_idx in range(bin_mapper.n_bins):
|
||||
assert histograms[feature_idx, bin_idx]['sum_hessians'] == (
|
||||
pytest.approx(sum_sw[feature_idx, bin_idx], rel=1e-5))
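
# Side note (illustrative, not part of the original test): the nested loops
# building `sum_sw` above are equivalent to one np.bincount call per feature,
# which may read more clearly. The helper reuses the `np` import from the top
# of this file.
def sum_sample_weight_per_bin(X_binned, sample_weight, n_bins):
    return np.stack([
        np.bincount(X_binned[:, f], weights=sample_weight, minlength=n_bins)
        for f in range(X_binned.shape[1])
    ])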
|
||||
|
||||
|
||||
def test_max_depth_max_leaf_nodes():
|
||||
# Non regression test for
|
||||
# https://github.com/scikit-learn/scikit-learn/issues/16179
|
||||
# there was a bug when the max_depth and the max_leaf_nodes criteria were
|
||||
# met at the same time, which would lead to max_leaf_nodes not being
|
||||
# respected.
|
||||
X, y = make_classification(random_state=0)
|
||||
est = HistGradientBoostingClassifier(max_depth=2, max_leaf_nodes=3,
|
||||
max_iter=1).fit(X, y)
|
||||
tree = est._predictors[0][0]
|
||||
assert tree.get_max_depth() == 2
|
||||
assert tree.get_n_leaf_nodes() == 3 # would be 4 prior to bug fix
|
||||
|
||||
|
||||
def test_early_stopping_on_test_set_with_warm_start():
|
||||
# Non regression test for #16661 where second fit fails with
|
||||
# warm_start=True, early_stopping is on, and no validation set
|
||||
X, y = make_classification(random_state=0)
|
||||
gb = HistGradientBoostingClassifier(
|
||||
max_iter=1, scoring='loss', warm_start=True, early_stopping=True,
|
||||
n_iter_no_change=1, validation_fraction=None)
|
||||
|
||||
gb.fit(X, y)
|
||||
# does not raise on second call
|
||||
gb.set_params(max_iter=2)
|
||||
gb.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('Est', (HistGradientBoostingClassifier,
|
||||
HistGradientBoostingRegressor))
|
||||
def test_single_node_trees(Est):
|
||||
# Make sure it's still possible to build single-node trees. In that case
|
||||
# the value of the root is set to 0. That's a correct value: if the tree is
|
||||
# single-node that's because min_gain_to_split is not respected right from
|
||||
# the root, so we don't want the tree to have any impact on the
|
||||
# predictions.
|
||||
|
||||
X, y = make_classification(random_state=0)
|
||||
y[:] = 1 # constant target will lead to a single root node
|
||||
|
||||
est = Est(max_iter=20)
|
||||
est.fit(X, y)
|
||||
|
||||
assert all(len(predictor[0].nodes) == 1 for predictor in est._predictors)
|
||||
assert all(predictor[0].nodes[0]['value'] == 0
|
||||
for predictor in est._predictors)
|
||||
# Still gives correct predictions thanks to the baseline prediction
|
||||
assert_allclose(est.predict(X), y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('Est, loss, X, y', [
|
||||
(
|
||||
HistGradientBoostingClassifier,
|
||||
BinaryCrossEntropy(sample_weight=None),
|
||||
X_classification,
|
||||
y_classification
|
||||
),
|
||||
(
|
||||
HistGradientBoostingRegressor,
|
||||
LeastSquares(sample_weight=None),
|
||||
X_regression,
|
||||
y_regression
|
||||
)
|
||||
])
|
||||
def test_custom_loss(Est, loss, X, y):
|
||||
est = Est(loss=loss, max_iter=20)
|
||||
est.fit(X, y)
|
|
@ -0,0 +1,399 @@
|
|||
import numpy as np
import pytest
from pytest import approx

from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
|
||||
|
||||
|
||||
def _make_training_data(n_bins=256, constant_hessian=True):
|
||||
rng = np.random.RandomState(42)
|
||||
n_samples = 10000
|
||||
|
||||
# Generate some test data directly binned so as to test the grower code
|
||||
# independently of the binning logic.
|
||||
X_binned = rng.randint(0, n_bins - 1, size=(n_samples, 2),
|
||||
dtype=X_BINNED_DTYPE)
|
||||
X_binned = np.asfortranarray(X_binned)
|
||||
|
||||
def true_decision_function(input_features):
|
||||
"""Ground truth decision function
|
||||
|
||||
This is a very simple yet asymmetric decision tree. Therefore the
|
||||
grower code should have no trouble recovering the decision function
|
||||
from 10000 training samples.
|
||||
"""
|
||||
if input_features[0] <= n_bins // 2:
|
||||
return -1
|
||||
else:
|
||||
return -1 if input_features[1] <= n_bins // 3 else 1
|
||||
|
||||
target = np.array([true_decision_function(x) for x in X_binned],
|
||||
dtype=Y_DTYPE)
|
||||
|
||||
# Assume a square loss applied to an initial model that always predicts 0
|
||||
# (hardcoded for this test):
|
||||
all_gradients = target.astype(G_H_DTYPE)
|
||||
shape_hessians = 1 if constant_hessian else all_gradients.shape
|
||||
all_hessians = np.ones(shape=shape_hessians, dtype=G_H_DTYPE)
|
||||
|
||||
return X_binned, all_gradients, all_hessians
|
||||
|
||||
|
||||
def _check_children_consistency(parent, left, right):
|
||||
# Make sure the samples are correctly dispatched from a parent to its
|
||||
# children
|
||||
assert parent.left_child is left
|
||||
assert parent.right_child is right
|
||||
|
||||
# each sample from the parent is propagated to one of the two children
|
||||
assert (len(left.sample_indices) + len(right.sample_indices)
|
||||
== len(parent.sample_indices))
|
||||
|
||||
assert (set(left.sample_indices).union(set(right.sample_indices))
|
||||
== set(parent.sample_indices))
|
||||
|
||||
# samples are sent either to the left or the right node, never to both
|
||||
assert (set(left.sample_indices).intersection(set(right.sample_indices))
|
||||
== set())
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'n_bins, constant_hessian, stopping_param, shrinkage',
|
||||
[
|
||||
(11, True, "min_gain_to_split", 0.5),
|
||||
(11, False, "min_gain_to_split", 1.),
|
||||
(11, True, "max_leaf_nodes", 1.),
|
||||
(11, False, "max_leaf_nodes", 0.1),
|
||||
(42, True, "max_leaf_nodes", 0.01),
|
||||
(42, False, "max_leaf_nodes", 1.),
|
||||
(256, True, "min_gain_to_split", 1.),
|
||||
(256, True, "max_leaf_nodes", 0.1),
|
||||
]
|
||||
)
|
||||
def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage):
|
||||
X_binned, all_gradients, all_hessians = _make_training_data(
|
||||
n_bins=n_bins, constant_hessian=constant_hessian)
|
||||
n_samples = X_binned.shape[0]
|
||||
|
||||
if stopping_param == "max_leaf_nodes":
|
||||
stopping_param = {"max_leaf_nodes": 3}
|
||||
else:
|
||||
stopping_param = {"min_gain_to_split": 0.01}
|
||||
|
||||
grower = TreeGrower(X_binned, all_gradients, all_hessians,
|
||||
n_bins=n_bins, shrinkage=shrinkage,
|
||||
min_samples_leaf=1, **stopping_param)
|
||||
|
||||
# The root node is not yet split, but the best possible split has
# already been evaluated:
|
||||
assert grower.root.left_child is None
|
||||
assert grower.root.right_child is None
|
||||
|
||||
root_split = grower.root.split_info
|
||||
assert root_split.feature_idx == 0
|
||||
assert root_split.bin_idx == n_bins // 2
|
||||
assert len(grower.splittable_nodes) == 1
|
||||
|
||||
# Calling split next applies the next split and computes the best split
|
||||
# for each of the two newly introduced children nodes.
|
||||
left_node, right_node = grower.split_next()
|
||||
|
||||
# All training samples have been split between the two nodes,
# approximately 50%/50%
|
||||
_check_children_consistency(grower.root, left_node, right_node)
|
||||
assert len(left_node.sample_indices) > 0.4 * n_samples
|
||||
assert len(left_node.sample_indices) < 0.6 * n_samples
|
||||
|
||||
if grower.min_gain_to_split > 0:
|
||||
# The left node is too pure: there is no gain to split it further.
|
||||
assert left_node.split_info.gain < grower.min_gain_to_split
|
||||
assert left_node in grower.finalized_leaves
|
||||
|
||||
# The right node can still be split further, this time on feature #1
|
||||
split_info = right_node.split_info
|
||||
assert split_info.gain > 1.
|
||||
assert split_info.feature_idx == 1
|
||||
assert split_info.bin_idx == n_bins // 3
|
||||
assert right_node.left_child is None
|
||||
assert right_node.right_child is None
|
||||
|
||||
# The right split has not been applied yet. Let's do it now:
|
||||
assert len(grower.splittable_nodes) == 1
|
||||
right_left_node, right_right_node = grower.split_next()
|
||||
_check_children_consistency(right_node, right_left_node, right_right_node)
|
||||
assert len(right_left_node.sample_indices) > 0.1 * n_samples
|
||||
assert len(right_left_node.sample_indices) < 0.2 * n_samples
|
||||
|
||||
assert len(right_right_node.sample_indices) > 0.2 * n_samples
|
||||
assert len(right_right_node.sample_indices) < 0.4 * n_samples
|
||||
|
||||
# All the leaves are pure; it is not possible to split any further:
|
||||
assert not grower.splittable_nodes
|
||||
|
||||
grower._apply_shrinkage()
|
||||
|
||||
# Check the values of the leaves:
|
||||
assert grower.root.left_child.value == approx(shrinkage)
|
||||
assert grower.root.right_child.left_child.value == approx(shrinkage)
|
||||
assert grower.root.right_child.right_child.value == approx(-shrinkage,
|
||||
rel=1e-3)
|
||||
|
||||
|
||||
def test_predictor_from_grower():
|
||||
# Build a tree on the toy 3-leaf dataset to extract the predictor.
|
||||
n_bins = 256
|
||||
X_binned, all_gradients, all_hessians = _make_training_data(
|
||||
n_bins=n_bins)
|
||||
grower = TreeGrower(X_binned, all_gradients, all_hessians,
|
||||
n_bins=n_bins, shrinkage=1.,
|
||||
max_leaf_nodes=3, min_samples_leaf=5)
|
||||
grower.grow()
|
||||
assert grower.n_nodes == 5 # (2 decision nodes + 3 leaves)
|
||||
|
||||
# Check that the node structure can be converted into a predictor
|
||||
# object to perform predictions at scale
|
||||
predictor = grower.make_predictor()
|
||||
assert predictor.nodes.shape[0] == 5
|
||||
assert predictor.nodes['is_leaf'].sum() == 3
|
||||
|
||||
# Probe some predictions for each leaf of the tree
|
||||
# each group of 3 samples corresponds to a condition in _make_training_data
|
||||
input_data = np.array([
|
||||
[0, 0],
|
||||
[42, 99],
|
||||
[128, 254],
|
||||
|
||||
[129, 0],
|
||||
[129, 85],
|
||||
[254, 85],
|
||||
|
||||
[129, 86],
|
||||
[129, 254],
|
||||
[242, 100],
|
||||
], dtype=np.uint8)
|
||||
missing_values_bin_idx = n_bins - 1
|
||||
predictions = predictor.predict_binned(input_data, missing_values_bin_idx)
|
||||
expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1]
|
||||
assert np.allclose(predictions, expected_targets)
|
||||
|
||||
# Check that training set can be recovered exactly:
|
||||
predictions = predictor.predict_binned(X_binned, missing_values_bin_idx)
|
||||
assert np.allclose(predictions, -all_gradients)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'n_samples, min_samples_leaf, n_bins, constant_hessian, noise',
|
||||
[
|
||||
(11, 10, 7, True, 0),
|
||||
(13, 10, 42, False, 0),
|
||||
(56, 10, 255, True, 0.1),
|
||||
(101, 3, 7, True, 0),
|
||||
(200, 42, 42, False, 0),
|
||||
(300, 55, 255, True, 0.1),
|
||||
(300, 301, 255, True, 0.1),
|
||||
]
|
||||
)
|
||||
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins,
|
||||
constant_hessian, noise):
|
||||
rng = np.random.RandomState(seed=0)
|
||||
# data = linear target, 3 features, 1 irrelevant.
|
||||
X = rng.normal(size=(n_samples, 3))
|
||||
y = X[:, 0] - X[:, 1]
|
||||
if noise:
|
||||
y_scale = y.std()
|
||||
y += rng.normal(scale=noise, size=n_samples) * y_scale
|
||||
mapper = _BinMapper(n_bins=n_bins)
|
||||
X = mapper.fit_transform(X)
|
||||
|
||||
all_gradients = y.astype(G_H_DTYPE)
|
||||
shape_hessian = 1 if constant_hessian else all_gradients.shape
|
||||
all_hessians = np.ones(shape=shape_hessian, dtype=G_H_DTYPE)
|
||||
grower = TreeGrower(X, all_gradients, all_hessians,
|
||||
n_bins=n_bins, shrinkage=1.,
|
||||
min_samples_leaf=min_samples_leaf,
|
||||
max_leaf_nodes=n_samples)
|
||||
grower.grow()
|
||||
predictor = grower.make_predictor(
|
||||
bin_thresholds=mapper.bin_thresholds_)
|
||||
|
||||
if n_samples >= min_samples_leaf:
|
||||
for node in predictor.nodes:
|
||||
if node['is_leaf']:
|
||||
assert node['count'] >= min_samples_leaf
|
||||
else:
|
||||
assert predictor.nodes.shape[0] == 1
|
||||
assert predictor.nodes[0]['is_leaf']
|
||||
assert predictor.nodes[0]['count'] == n_samples
|
||||
|
||||
|
||||
@pytest.mark.parametrize('n_samples, min_samples_leaf', [
|
||||
(99, 50),
|
||||
(100, 50)])
|
||||
def test_min_samples_leaf_root(n_samples, min_samples_leaf):
|
||||
# Make sure root node isn't split if n_samples is not at least twice
|
||||
# min_samples_leaf
|
||||
rng = np.random.RandomState(seed=0)
|
||||
|
||||
n_bins = 256
|
||||
|
||||
# data = linear target, 3 features, 1 irrelevant.
|
||||
X = rng.normal(size=(n_samples, 3))
|
||||
y = X[:, 0] - X[:, 1]
|
||||
mapper = _BinMapper(n_bins=n_bins)
|
||||
X = mapper.fit_transform(X)
|
||||
|
||||
all_gradients = y.astype(G_H_DTYPE)
|
||||
all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
|
||||
grower = TreeGrower(X, all_gradients, all_hessians,
|
||||
n_bins=n_bins, shrinkage=1.,
|
||||
min_samples_leaf=min_samples_leaf,
|
||||
max_leaf_nodes=n_samples)
|
||||
grower.grow()
|
||||
if n_samples >= min_samples_leaf * 2:
|
||||
assert len(grower.finalized_leaves) >= 2
|
||||
else:
|
||||
assert len(grower.finalized_leaves) == 1
|
||||
|
||||
|
||||
def assert_is_stump(grower):
|
||||
# To assert that stumps are created when max_depth=1
|
||||
for leaf in (grower.root.left_child, grower.root.right_child):
|
||||
assert leaf.left_child is None
|
||||
assert leaf.right_child is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize('max_depth', [1, 2, 3])
|
||||
def test_max_depth(max_depth):
|
||||
# Make sure max_depth parameter works as expected
|
||||
rng = np.random.RandomState(seed=0)
|
||||
|
||||
n_bins = 256
|
||||
n_samples = 1000
|
||||
|
||||
# data = linear target, 3 features, 1 irrelevant.
|
||||
X = rng.normal(size=(n_samples, 3))
|
||||
y = X[:, 0] - X[:, 1]
|
||||
mapper = _BinMapper(n_bins=n_bins)
|
||||
X = mapper.fit_transform(X)
|
||||
|
||||
all_gradients = y.astype(G_H_DTYPE)
|
||||
all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
|
||||
grower = TreeGrower(X, all_gradients, all_hessians, max_depth=max_depth)
|
||||
grower.grow()
|
||||
|
||||
depth = max(leaf.depth for leaf in grower.finalized_leaves)
|
||||
assert depth == max_depth
|
||||
|
||||
if max_depth == 1:
|
||||
assert_is_stump(grower)
|
||||
|
||||
|
||||
def test_input_validation():
|
||||
|
||||
X_binned, all_gradients, all_hessians = _make_training_data()
|
||||
|
||||
X_binned_float = X_binned.astype(np.float32)
|
||||
with pytest.raises(NotImplementedError,
|
||||
match="X_binned must be of type uint8"):
|
||||
TreeGrower(X_binned_float, all_gradients, all_hessians)
|
||||
|
||||
X_binned_C_array = np.ascontiguousarray(X_binned)
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match="X_binned should be passed as Fortran contiguous array"):
|
||||
TreeGrower(X_binned_C_array, all_gradients, all_hessians)
|
||||
|
||||
|
||||
def test_init_parameters_validation():
|
||||
X_binned, all_gradients, all_hessians = _make_training_data()
|
||||
with pytest.raises(ValueError,
|
||||
match="min_gain_to_split=-1 must be positive"):
|
||||
|
||||
TreeGrower(X_binned, all_gradients, all_hessians,
|
||||
min_gain_to_split=-1)
|
||||
|
||||
with pytest.raises(ValueError,
|
||||
match="min_hessian_to_split=-1 must be positive"):
|
||||
TreeGrower(X_binned, all_gradients, all_hessians,
|
||||
min_hessian_to_split=-1)
|
||||
|
||||
|
||||
def test_missing_value_predict_only():
|
||||
# Make sure that missing values are supported at predict time even if they
|
||||
# were not encountered in the training data: the missing values are
|
||||
# assigned to whichever child has the most samples.
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
n_samples = 100
|
||||
X_binned = rng.randint(0, 256, size=(n_samples, 1), dtype=np.uint8)
|
||||
X_binned = np.asfortranarray(X_binned)
|
||||
|
||||
gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
|
||||
hessians = np.ones(shape=1, dtype=G_H_DTYPE)
|
||||
|
||||
grower = TreeGrower(X_binned, gradients, hessians, min_samples_leaf=5,
|
||||
has_missing_values=False)
|
||||
grower.grow()
|
||||
|
||||
predictor = grower.make_predictor()
|
||||
|
||||
# go from root to a leaf, always following node with the most samples.
|
||||
# That's the path nans are supposed to take
|
||||
node = predictor.nodes[0]
|
||||
while not node['is_leaf']:
|
||||
left = predictor.nodes[node['left']]
|
||||
right = predictor.nodes[node['right']]
|
||||
node = left if left['count'] > right['count'] else right
|
||||
|
||||
prediction_main_path = node['value']
|
||||
|
||||
# now build X_test with only nans, and make sure all predictions are equal
|
||||
# to prediction_main_path
|
||||
all_nans = np.full(shape=(n_samples, 1), fill_value=np.nan)
|
||||
assert np.all(predictor.predict(all_nans) == prediction_main_path)
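

# The loop above mirrors the routing rule stated in the comment: when no
# missing values were seen at fit time, a nan encountered at predict time is
# sent to whichever child received more training samples.  Simplified sketch
# of that decision for one non-leaf node (illustrative only; the actual
# predictor records this choice when the tree is built):
def _route_nan_sketch(nodes, node):
    left, right = nodes[node['left']], nodes[node['right']]
    return left if left['count'] > right['count'] else right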
|
||||
|
||||
|
||||
def test_split_on_nan_with_infinite_values():
|
||||
# Make sure the split on nan situations are respected even when there are
|
||||
# samples with +inf values (we set the threshold to +inf when we have a
|
||||
# split on nan so this test makes sure this does not introduce edge-case
|
||||
# bugs). We need to use the private API so that we can also test
|
||||
# predict_binned().
|
||||
|
||||
X = np.array([0, 1, np.inf, np.nan, np.nan]).reshape(-1, 1)
|
||||
# the gradient values will force a split on nan situation
|
||||
gradients = np.array([0, 0, 0, 100, 100], dtype=G_H_DTYPE)
|
||||
hessians = np.ones(shape=1, dtype=G_H_DTYPE)
|
||||
|
||||
bin_mapper = _BinMapper()
|
||||
X_binned = bin_mapper.fit_transform(X)
|
||||
|
||||
n_bins_non_missing = 3
|
||||
has_missing_values = True
|
||||
grower = TreeGrower(X_binned, gradients, hessians,
|
||||
n_bins_non_missing=n_bins_non_missing,
|
||||
has_missing_values=has_missing_values,
|
||||
min_samples_leaf=1)
|
||||
|
||||
grower.grow()
|
||||
|
||||
predictor = grower.make_predictor(
|
||||
bin_thresholds=bin_mapper.bin_thresholds_
|
||||
)
|
||||
|
||||
# sanity check: this was a split on nan
|
||||
assert predictor.nodes[0]['threshold'] == np.inf
|
||||
assert predictor.nodes[0]['bin_threshold'] == n_bins_non_missing - 1
|
||||
|
||||
# Make sure in particular that the +inf sample is mapped to the left child
|
||||
# Note that lightgbm "fails" here and will assign the inf sample to the
|
||||
# right child, even though it's a "split on nan" situation.
|
||||
predictions = predictor.predict(X)
|
||||
predictions_binned = predictor.predict_binned(
|
||||
X_binned, missing_values_bin_idx=bin_mapper.missing_values_bin_idx_)
|
||||
np.testing.assert_allclose(predictions, -gradients)
|
||||
np.testing.assert_allclose(predictions_binned, -gradients)
|
|
@@ -0,0 +1,202 @@
|
|||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from numpy.testing import assert_allclose
|
||||
from numpy.testing import assert_array_equal
|
||||
|
||||
from sklearn.ensemble._hist_gradient_boosting.histogram import (
|
||||
_build_histogram_naive,
|
||||
_build_histogram,
|
||||
_build_histogram_no_hessian,
|
||||
_build_histogram_root_no_hessian,
|
||||
_build_histogram_root,
|
||||
_subtract_histograms
|
||||
)
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'build_func', [_build_histogram_naive, _build_histogram])
|
||||
def test_build_histogram(build_func):
|
||||
binned_feature = np.array([0, 2, 0, 1, 2, 0, 2, 1], dtype=X_BINNED_DTYPE)
|
||||
|
||||
# Small sample_indices (below unrolling threshold)
|
||||
ordered_gradients = np.array([0, 1, 3], dtype=G_H_DTYPE)
|
||||
ordered_hessians = np.array([1, 1, 2], dtype=G_H_DTYPE)
|
||||
|
||||
sample_indices = np.array([0, 2, 3], dtype=np.uint32)
|
||||
hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE)
|
||||
build_func(0, sample_indices, binned_feature, ordered_gradients,
|
||||
ordered_hessians, hist)
|
||||
hist = hist[0]
|
||||
assert_array_equal(hist['count'], [2, 1, 0])
|
||||
assert_allclose(hist['sum_gradients'], [1, 3, 0])
|
||||
assert_allclose(hist['sum_hessians'], [2, 2, 0])
|
||||
|
||||
# Larger sample_indices (above unrolling threshold)
|
||||
sample_indices = np.array([0, 2, 3, 6, 7], dtype=np.uint32)
|
||||
ordered_gradients = np.array([0, 1, 3, 0, 1], dtype=G_H_DTYPE)
|
||||
ordered_hessians = np.array([1, 1, 2, 1, 0], dtype=G_H_DTYPE)
|
||||
|
||||
hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE)
|
||||
build_func(0, sample_indices, binned_feature, ordered_gradients,
|
||||
ordered_hessians, hist)
|
||||
hist = hist[0]
|
||||
assert_array_equal(hist['count'], [2, 2, 1])
|
||||
assert_allclose(hist['sum_gradients'], [1, 4, 0])
|
||||
assert_allclose(hist['sum_hessians'], [2, 2, 1])
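

# A minimal pure-Python sketch of what the naive histogram build is expected
# to do, consistent with the assertions above (illustrative only, not the
# Cython helper).  Note that the gradients/hessians are "ordered": they are
# indexed by position in sample_indices, while the bin is looked up with the
# sample index itself.  Uses the np and HISTOGRAM_DTYPE imports from the top
# of this module.
def _naive_histogram_sketch(n_bins, sample_indices, binned_feature,
                            ordered_gradients, ordered_hessians):
    hist = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE)
    for i, sample_idx in enumerate(sample_indices):
        bin_idx = binned_feature[sample_idx]
        hist['count'][bin_idx] += 1
        hist['sum_gradients'][bin_idx] += ordered_gradients[i]
        hist['sum_hessians'][bin_idx] += ordered_hessians[i]
    return hist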
|
||||
|
||||
|
||||
def test_histogram_sample_order_independence():
|
||||
# Make sure the order of the samples has no impact on the histogram
|
||||
# computations
|
||||
rng = np.random.RandomState(42)
|
||||
n_sub_samples = 100
|
||||
n_samples = 1000
|
||||
n_bins = 256
|
||||
|
||||
binned_feature = rng.randint(0, n_bins - 1, size=n_samples,
|
||||
dtype=X_BINNED_DTYPE)
|
||||
sample_indices = rng.choice(np.arange(n_samples, dtype=np.uint32),
|
||||
n_sub_samples, replace=False)
|
||||
ordered_gradients = rng.randn(n_sub_samples).astype(G_H_DTYPE)
|
||||
hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
|
||||
_build_histogram_no_hessian(0, sample_indices, binned_feature,
|
||||
ordered_gradients, hist_gc)
|
||||
|
||||
ordered_hessians = rng.exponential(size=n_sub_samples).astype(G_H_DTYPE)
|
||||
hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
|
||||
_build_histogram(0, sample_indices, binned_feature,
|
||||
ordered_gradients, ordered_hessians, hist_ghc)
|
||||
|
||||
permutation = rng.permutation(n_sub_samples)
|
||||
hist_gc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
|
||||
_build_histogram_no_hessian(0, sample_indices[permutation],
|
||||
binned_feature, ordered_gradients[permutation],
|
||||
hist_gc_perm)
|
||||
|
||||
hist_ghc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
|
||||
_build_histogram(0, sample_indices[permutation], binned_feature,
|
||||
ordered_gradients[permutation],
|
||||
ordered_hessians[permutation], hist_ghc_perm)
|
||||
|
||||
hist_gc = hist_gc[0]
|
||||
hist_ghc = hist_ghc[0]
|
||||
hist_gc_perm = hist_gc_perm[0]
|
||||
hist_ghc_perm = hist_ghc_perm[0]
|
||||
|
||||
assert_allclose(hist_gc['sum_gradients'], hist_gc_perm['sum_gradients'])
|
||||
assert_array_equal(hist_gc['count'], hist_gc_perm['count'])
|
||||
|
||||
assert_allclose(hist_ghc['sum_gradients'], hist_ghc_perm['sum_gradients'])
|
||||
assert_allclose(hist_ghc['sum_hessians'], hist_ghc_perm['sum_hessians'])
|
||||
assert_array_equal(hist_ghc['count'], hist_ghc_perm['count'])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("constant_hessian", [True, False])
|
||||
def test_unrolled_equivalent_to_naive(constant_hessian):
|
||||
# Make sure the different unrolled histogram computations give the same
|
||||
# results as the naive one.
|
||||
rng = np.random.RandomState(42)
|
||||
n_samples = 10
|
||||
n_bins = 5
|
||||
sample_indices = np.arange(n_samples).astype(np.uint32)
|
||||
binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8)
|
||||
ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
|
||||
if constant_hessian:
|
||||
ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
|
||||
else:
|
||||
ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
|
||||
|
||||
hist_gc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
|
||||
hist_ghc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
|
||||
hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
|
||||
hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
|
||||
hist_naive = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
|
||||
|
||||
_build_histogram_root_no_hessian(0, binned_feature,
|
||||
ordered_gradients, hist_gc_root)
|
||||
_build_histogram_root(0, binned_feature, ordered_gradients,
|
||||
ordered_hessians, hist_ghc_root)
|
||||
_build_histogram_no_hessian(0, sample_indices, binned_feature,
|
||||
ordered_gradients, hist_gc)
|
||||
_build_histogram(0, sample_indices, binned_feature,
|
||||
ordered_gradients, ordered_hessians, hist_ghc)
|
||||
_build_histogram_naive(0, sample_indices, binned_feature,
|
||||
ordered_gradients, ordered_hessians, hist_naive)
|
||||
|
||||
hist_naive = hist_naive[0]
|
||||
hist_gc_root = hist_gc_root[0]
|
||||
hist_ghc_root = hist_ghc_root[0]
|
||||
hist_gc = hist_gc[0]
|
||||
hist_ghc = hist_ghc[0]
|
||||
for hist in (hist_gc_root, hist_ghc_root, hist_gc, hist_ghc):
|
||||
assert_array_equal(hist['count'], hist_naive['count'])
|
||||
assert_allclose(hist['sum_gradients'], hist_naive['sum_gradients'])
|
||||
for hist in (hist_ghc_root, hist_ghc):
|
||||
assert_allclose(hist['sum_hessians'], hist_naive['sum_hessians'])
|
||||
for hist in (hist_gc_root, hist_gc):
|
||||
assert_array_equal(hist['sum_hessians'], np.zeros(n_bins))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("constant_hessian", [True, False])
|
||||
def test_hist_subtraction(constant_hessian):
|
||||
# Make sure the histogram subtraction trick gives the same result as the
|
||||
# classical method.
|
||||
rng = np.random.RandomState(42)
|
||||
n_samples = 10
|
||||
n_bins = 5
|
||||
sample_indices = np.arange(n_samples).astype(np.uint32)
|
||||
binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8)
|
||||
ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
|
||||
if constant_hessian:
|
||||
ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
|
||||
else:
|
||||
ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
|
||||
|
||||
hist_parent = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
|
||||
if constant_hessian:
|
||||
_build_histogram_no_hessian(0, sample_indices, binned_feature,
|
||||
ordered_gradients, hist_parent)
|
||||
else:
|
||||
_build_histogram(0, sample_indices, binned_feature,
|
||||
ordered_gradients, ordered_hessians, hist_parent)
|
||||
|
||||
    mask = rng.randint(0, 2, n_samples).astype(bool)
|
||||
|
||||
sample_indices_left = sample_indices[mask]
|
||||
ordered_gradients_left = ordered_gradients[mask]
|
||||
ordered_hessians_left = ordered_hessians[mask]
|
||||
hist_left = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
|
||||
if constant_hessian:
|
||||
_build_histogram_no_hessian(0, sample_indices_left,
|
||||
binned_feature, ordered_gradients_left,
|
||||
hist_left)
|
||||
else:
|
||||
_build_histogram(0, sample_indices_left, binned_feature,
|
||||
ordered_gradients_left, ordered_hessians_left,
|
||||
hist_left)
|
||||
|
||||
sample_indices_right = sample_indices[~mask]
|
||||
ordered_gradients_right = ordered_gradients[~mask]
|
||||
ordered_hessians_right = ordered_hessians[~mask]
|
||||
hist_right = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
|
||||
if constant_hessian:
|
||||
_build_histogram_no_hessian(0, sample_indices_right,
|
||||
binned_feature, ordered_gradients_right,
|
||||
hist_right)
|
||||
else:
|
||||
_build_histogram(0, sample_indices_right, binned_feature,
|
||||
ordered_gradients_right, ordered_hessians_right,
|
||||
hist_right)
|
||||
|
||||
hist_left_sub = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
|
||||
hist_right_sub = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
|
||||
_subtract_histograms(0, n_bins, hist_parent, hist_right, hist_left_sub)
|
||||
_subtract_histograms(0, n_bins, hist_parent, hist_left, hist_right_sub)
|
||||
|
||||
for key in ('count', 'sum_hessians', 'sum_gradients'):
|
||||
assert_allclose(hist_left[key], hist_left_sub[key], rtol=1e-6)
|
||||
assert_allclose(hist_right[key], hist_right_sub[key], rtol=1e-6)
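

# The subtraction trick exercised above relies on the fact that every sample
# of a parent node ends up in exactly one of its children, so the parent
# histogram is the bin-wise sum of the two child histograms.  A NumPy sketch
# of the idea (illustrative only, not the Cython _subtract_histograms
# helper):
def _subtract_histograms_sketch(hist_parent, hist_child):
    """Return the sibling histogram as parent - child, bin by bin."""
    hist_sibling = np.zeros_like(hist_parent)
    for field in ('sum_gradients', 'sum_hessians', 'count'):
        hist_sibling[field] = hist_parent[field] - hist_child[field]
    return hist_sibling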
|
|
@@ -0,0 +1,318 @@
|
|||
import numpy as np
|
||||
from numpy.testing import assert_almost_equal
|
||||
from numpy.testing import assert_allclose
|
||||
from scipy.optimize import newton
|
||||
from sklearn.utils import assert_all_finite
|
||||
from sklearn.utils.fixes import sp_version, parse_version
|
||||
import pytest
|
||||
|
||||
from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
|
||||
from sklearn.utils._testing import skip_if_32bit
|
||||
|
||||
|
||||
def get_derivatives_helper(loss):
|
||||
"""Return get_gradients() and get_hessians() functions for a given loss.
|
||||
"""
|
||||
|
||||
def get_gradients(y_true, raw_predictions):
|
||||
# create gradients and hessians array, update inplace, and return
|
||||
gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE)
|
||||
hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE)
|
||||
loss.update_gradients_and_hessians(gradients, hessians, y_true,
|
||||
raw_predictions, None)
|
||||
return gradients
|
||||
|
||||
def get_hessians(y_true, raw_predictions):
|
||||
# create gradients and hessians array, update inplace, and return
|
||||
gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE)
|
||||
hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE)
|
||||
loss.update_gradients_and_hessians(gradients, hessians, y_true,
|
||||
raw_predictions, None)
|
||||
|
||||
if loss.__class__.__name__ == 'LeastSquares':
|
||||
# hessians aren't updated because they're constant:
|
||||
            # the value is 1 (and not 2) because the loss is actually a half
|
||||
# least squares loss.
|
||||
hessians = np.full_like(raw_predictions, fill_value=1)
|
||||
elif loss.__class__.__name__ == 'LeastAbsoluteDeviation':
|
||||
# hessians aren't updated because they're constant
|
||||
hessians = np.full_like(raw_predictions, fill_value=0)
|
||||
|
||||
return hessians
|
||||
|
||||
return get_gradients, get_hessians
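

# For reference, the "half least squares" convention mentioned above means
# the pointwise loss is 0.5 * (y_true - raw) ** 2, whose gradient w.r.t. raw
# is (raw - y_true) and whose hessian is the constant 1 (hence fill_value=1
# above).  Minimal sketch (illustrative only, not the estimator's loss
# implementation):
def _half_least_squares_derivatives_sketch(y_true, raw_predictions):
    gradients = raw_predictions - y_true
    hessians = np.ones_like(raw_predictions)
    return gradients, hessians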
|
||||
|
||||
|
||||
@pytest.mark.parametrize('loss, x0, y_true', [
|
||||
('least_squares', -2., 42),
|
||||
('least_squares', 117., 1.05),
|
||||
('least_squares', 0., 0.),
|
||||
# I don't understand why but y_true == 0 fails :/
|
||||
# ('binary_crossentropy', 0.3, 0),
|
||||
('binary_crossentropy', -12, 1),
|
||||
('binary_crossentropy', 30, 1),
|
||||
('poisson', 12., 1.),
|
||||
('poisson', 0., 2.),
|
||||
('poisson', -22., 10.),
|
||||
])
|
||||
@pytest.mark.skipif(sp_version == parse_version('1.2.0'),
|
||||
reason='bug in scipy 1.2.0, see scipy issue #9608')
|
||||
@skip_if_32bit
|
||||
def test_derivatives(loss, x0, y_true):
|
||||
    # Check that gradients are zero when the loss is minimized on a 1D array
|
||||
# using Halley's method with the first and second order derivatives
|
||||
# computed by the Loss instance.
|
||||
|
||||
loss = _LOSSES[loss](sample_weight=None)
|
||||
y_true = np.array([y_true], dtype=Y_DTYPE)
|
||||
x0 = np.array([x0], dtype=Y_DTYPE).reshape(1, 1)
|
||||
get_gradients, get_hessians = get_derivatives_helper(loss)
|
||||
|
||||
def func(x):
|
||||
return loss.pointwise_loss(y_true, x)
|
||||
|
||||
def fprime(x):
|
||||
return get_gradients(y_true, x)
|
||||
|
||||
def fprime2(x):
|
||||
return get_hessians(y_true, x)
|
||||
|
||||
optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2,
|
||||
maxiter=70, tol=2e-8)
|
||||
assert np.allclose(loss.inverse_link_function(optimum), y_true)
|
||||
assert np.allclose(loss.pointwise_loss(y_true, optimum), 0)
|
||||
assert np.allclose(get_gradients(y_true, optimum), 0, atol=1e-7)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('loss, n_classes, prediction_dim', [
|
||||
('least_squares', 0, 1),
|
||||
('least_absolute_deviation', 0, 1),
|
||||
('binary_crossentropy', 2, 1),
|
||||
('categorical_crossentropy', 3, 3),
|
||||
('poisson', 0, 1),
|
||||
])
|
||||
@pytest.mark.skipif(Y_DTYPE != np.float64,
|
||||
reason='Need 64 bits float precision for numerical checks')
|
||||
def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0):
|
||||
# Make sure gradients and hessians computed in the loss are correct, by
|
||||
# comparing with their approximations computed with finite central
|
||||
# differences.
|
||||
# See https://en.wikipedia.org/wiki/Finite_difference.
|
||||
|
||||
rng = np.random.RandomState(seed)
|
||||
n_samples = 100
|
||||
if loss in ('least_squares', 'least_absolute_deviation'):
|
||||
y_true = rng.normal(size=n_samples).astype(Y_DTYPE)
|
||||
    elif loss == 'poisson':
|
||||
y_true = rng.poisson(size=n_samples).astype(Y_DTYPE)
|
||||
else:
|
||||
y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE)
|
||||
raw_predictions = rng.normal(
|
||||
size=(prediction_dim, n_samples)
|
||||
).astype(Y_DTYPE)
|
||||
loss = _LOSSES[loss](sample_weight=None)
|
||||
get_gradients, get_hessians = get_derivatives_helper(loss)
|
||||
|
||||
# only take gradients and hessians of first tree / class.
|
||||
gradients = get_gradients(y_true, raw_predictions)[0, :].ravel()
|
||||
hessians = get_hessians(y_true, raw_predictions)[0, :].ravel()
|
||||
|
||||
# Approximate gradients
|
||||
# For multiclass loss, we should only change the predictions of one tree
|
||||
# (here the first), hence the use of offset[0, :] += eps
|
||||
# As a softmax is computed, offsetting the whole array by a constant would
|
||||
# have no effect on the probabilities, and thus on the loss
|
||||
eps = 1e-9
|
||||
offset = np.zeros_like(raw_predictions)
|
||||
offset[0, :] = eps
|
||||
f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset / 2)
|
||||
f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset / 2)
|
||||
numerical_gradients = (f_plus_eps - f_minus_eps) / eps
|
||||
|
||||
# Approximate hessians
|
||||
eps = 1e-4 # need big enough eps as we divide by its square
|
||||
offset[0, :] = eps
|
||||
f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset)
|
||||
f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset)
|
||||
f = loss.pointwise_loss(y_true, raw_predictions)
|
||||
numerical_hessians = (f_plus_eps + f_minus_eps - 2 * f) / eps**2
|
||||
|
||||
assert_allclose(numerical_gradients, gradients, rtol=1e-4, atol=1e-7)
|
||||
assert_allclose(numerical_hessians, hessians, rtol=1e-4, atol=1e-7)
|
||||
|
||||
|
||||
def test_baseline_least_squares():
|
||||
rng = np.random.RandomState(0)
|
||||
|
||||
loss = _LOSSES['least_squares'](sample_weight=None)
|
||||
y_train = rng.normal(size=100)
|
||||
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
|
||||
assert baseline_prediction.shape == tuple() # scalar
|
||||
assert baseline_prediction.dtype == y_train.dtype
|
||||
# Make sure baseline prediction is the mean of all targets
|
||||
assert_almost_equal(baseline_prediction, y_train.mean())
|
||||
assert np.allclose(loss.inverse_link_function(baseline_prediction),
|
||||
baseline_prediction)
|
||||
|
||||
|
||||
def test_baseline_least_absolute_deviation():
|
||||
rng = np.random.RandomState(0)
|
||||
|
||||
loss = _LOSSES['least_absolute_deviation'](sample_weight=None)
|
||||
y_train = rng.normal(size=100)
|
||||
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
|
||||
assert baseline_prediction.shape == tuple() # scalar
|
||||
assert baseline_prediction.dtype == y_train.dtype
|
||||
# Make sure baseline prediction is the median of all targets
|
||||
assert np.allclose(loss.inverse_link_function(baseline_prediction),
|
||||
baseline_prediction)
|
||||
assert baseline_prediction == pytest.approx(np.median(y_train))
|
||||
|
||||
|
||||
def test_baseline_poisson():
|
||||
rng = np.random.RandomState(0)
|
||||
|
||||
loss = _LOSSES['poisson'](sample_weight=None)
|
||||
y_train = rng.poisson(size=100).astype(np.float64)
|
||||
# Sanity check, make sure at least one sample is non-zero so we don't take
|
||||
# log(0)
|
||||
assert y_train.sum() > 0
|
||||
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
|
||||
assert np.isscalar(baseline_prediction)
|
||||
assert baseline_prediction.dtype == y_train.dtype
|
||||
assert_all_finite(baseline_prediction)
|
||||
# Make sure baseline prediction produces the log of the mean of all targets
|
||||
assert_almost_equal(np.log(y_train.mean()), baseline_prediction)
|
||||
|
||||
# Test baseline for y_true = 0
|
||||
y_train.fill(0.)
|
||||
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
|
||||
assert_all_finite(baseline_prediction)
|
||||
|
||||
|
||||
def test_baseline_binary_crossentropy():
|
||||
rng = np.random.RandomState(0)
|
||||
|
||||
loss = _LOSSES['binary_crossentropy'](sample_weight=None)
|
||||
for y_train in (np.zeros(shape=100), np.ones(shape=100)):
|
||||
y_train = y_train.astype(np.float64)
|
||||
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
|
||||
assert_all_finite(baseline_prediction)
|
||||
assert np.allclose(loss.inverse_link_function(baseline_prediction),
|
||||
y_train[0])
|
||||
|
||||
# Make sure baseline prediction is equal to link_function(p), where p
|
||||
# is the proba of the positive class. We want predict_proba() to return p,
|
||||
# and by definition
|
||||
# p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction)
|
||||
# So we want raw_prediction = link_function(p) = log(p / (1 - p))
|
||||
y_train = rng.randint(0, 2, size=100).astype(np.float64)
|
||||
baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
|
||||
assert baseline_prediction.shape == tuple() # scalar
|
||||
assert baseline_prediction.dtype == y_train.dtype
|
||||
p = y_train.mean()
|
||||
assert np.allclose(baseline_prediction, np.log(p / (1 - p)))
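

# The assertion above encodes the log-odds relationship: if
# p = sigmoid(raw) = 1 / (1 + exp(-raw)), then raw = log(p / (1 - p)).
# Quick numerical check of that identity (illustrative only):
def _sigmoid_sketch(raw):
    return 1 / (1 + np.exp(-raw))


assert np.isclose(_sigmoid_sketch(np.log(.3 / (1 - .3))), .3)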
|
||||
|
||||
|
||||
def test_baseline_categorical_crossentropy():
|
||||
rng = np.random.RandomState(0)
|
||||
|
||||
prediction_dim = 4
|
||||
loss = _LOSSES['categorical_crossentropy'](sample_weight=None)
|
||||
for y_train in (np.zeros(shape=100), np.ones(shape=100)):
|
||||
y_train = y_train.astype(np.float64)
|
||||
baseline_prediction = loss.get_baseline_prediction(y_train, None,
|
||||
prediction_dim)
|
||||
assert baseline_prediction.dtype == y_train.dtype
|
||||
assert_all_finite(baseline_prediction)
|
||||
|
||||
    # Same logic as for the above test. Here inverse_link_function = softmax and
|
||||
# link_function = log
|
||||
y_train = rng.randint(0, prediction_dim + 1, size=100).astype(np.float32)
|
||||
baseline_prediction = loss.get_baseline_prediction(y_train, None,
|
||||
prediction_dim)
|
||||
assert baseline_prediction.shape == (prediction_dim, 1)
|
||||
for k in range(prediction_dim):
|
||||
p = (y_train == k).mean()
|
||||
assert np.allclose(baseline_prediction[k, :], np.log(p))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('loss, problem', [
|
||||
('least_squares', 'regression'),
|
||||
('least_absolute_deviation', 'regression'),
|
||||
('binary_crossentropy', 'classification'),
|
||||
('categorical_crossentropy', 'classification'),
|
||||
('poisson', 'poisson_regression'),
|
||||
])
|
||||
@pytest.mark.parametrize('sample_weight', ['ones', 'random'])
|
||||
def test_sample_weight_multiplies_gradients(loss, problem, sample_weight):
|
||||
# Make sure that passing sample weights to the gradient and hessians
|
||||
# computation methods is equivalent to multiplying by the weights.
|
||||
|
||||
rng = np.random.RandomState(42)
|
||||
n_samples = 1000
|
||||
|
||||
if loss == 'categorical_crossentropy':
|
||||
n_classes = prediction_dim = 3
|
||||
else:
|
||||
n_classes = prediction_dim = 1
|
||||
|
||||
if problem == 'regression':
|
||||
y_true = rng.normal(size=n_samples).astype(Y_DTYPE)
|
||||
elif problem == 'poisson_regression':
|
||||
y_true = rng.poisson(size=n_samples).astype(Y_DTYPE)
|
||||
else:
|
||||
y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE)
|
||||
|
||||
if sample_weight == 'ones':
|
||||
sample_weight = np.ones(shape=n_samples, dtype=Y_DTYPE)
|
||||
else:
|
||||
sample_weight = rng.normal(size=n_samples).astype(Y_DTYPE)
|
||||
|
||||
loss_ = _LOSSES[loss](sample_weight=sample_weight)
|
||||
|
||||
baseline_prediction = loss_.get_baseline_prediction(
|
||||
y_true, None, prediction_dim
|
||||
)
|
||||
raw_predictions = np.zeros(shape=(prediction_dim, n_samples),
|
||||
dtype=baseline_prediction.dtype)
|
||||
raw_predictions += baseline_prediction
|
||||
|
||||
gradients = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE)
|
||||
hessians = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE)
|
||||
loss_.update_gradients_and_hessians(gradients, hessians, y_true,
|
||||
raw_predictions, None)
|
||||
|
||||
gradients_sw = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE)
|
||||
hessians_sw = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE)
|
||||
loss_.update_gradients_and_hessians(gradients_sw, hessians_sw, y_true,
|
||||
raw_predictions, sample_weight)
|
||||
|
||||
assert np.allclose(gradients * sample_weight, gradients_sw)
|
||||
assert np.allclose(hessians * sample_weight, hessians_sw)
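

# For the half least squares loss, weighting sample i by w_i simply scales
# both its gradient and its hessian by w_i, which is exactly what the two
# assertions above check for every supported loss.  Sketch for that single
# loss (illustrative only):
def _weighted_half_ls_derivatives_sketch(y_true, raw_predictions,
                                         sample_weight):
    gradients = sample_weight * (raw_predictions - y_true)
    hessians = sample_weight * np.ones_like(raw_predictions)
    return gradients, hessians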
|
||||
|
||||
|
||||
def test_init_gradient_and_hessians_sample_weight():
|
||||
# Make sure that passing sample_weight to a loss correctly influences the
|
||||
# hessians_are_constant attribute, and consequently the shape of the
|
||||
# hessians array.
|
||||
|
||||
prediction_dim = 2
|
||||
n_samples = 5
|
||||
sample_weight = None
|
||||
loss = _LOSSES['least_squares'](sample_weight=sample_weight)
|
||||
_, hessians = loss.init_gradients_and_hessians(
|
||||
n_samples=n_samples, prediction_dim=prediction_dim,
|
||||
sample_weight=None)
|
||||
assert loss.hessians_are_constant
|
||||
assert hessians.shape == (1, 1)
|
||||
|
||||
sample_weight = np.ones(n_samples)
|
||||
loss = _LOSSES['least_squares'](sample_weight=sample_weight)
|
||||
_, hessians = loss.init_gradients_and_hessians(
|
||||
n_samples=n_samples, prediction_dim=prediction_dim,
|
||||
sample_weight=sample_weight)
|
||||
assert not loss.hessians_are_constant
|
||||
assert hessians.shape == (prediction_dim, n_samples)
|
|
@@ -0,0 +1,341 @@
|
|||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint
|
||||
from sklearn.ensemble._hist_gradient_boosting.splitting import (
|
||||
Splitter,
|
||||
compute_node_value
|
||||
)
|
||||
from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder
|
||||
from sklearn.experimental import enable_hist_gradient_boosting # noqa
|
||||
from sklearn.ensemble import HistGradientBoostingRegressor
|
||||
from sklearn.ensemble import HistGradientBoostingClassifier
|
||||
|
||||
|
||||
def is_increasing(a):
|
||||
return (np.diff(a) >= 0.0).all()
|
||||
|
||||
|
||||
def is_decreasing(a):
|
||||
return (np.diff(a) <= 0.0).all()
|
||||
|
||||
|
||||
def assert_leaves_values_monotonic(predictor, monotonic_cst):
|
||||
# make sure leaves values (from left to right) are either all increasing
|
||||
# or all decreasing (or neither) depending on the monotonic constraint.
|
||||
nodes = predictor.nodes
|
||||
|
||||
def get_leaves_values():
|
||||
"""get leaves values from left to right"""
|
||||
values = []
|
||||
|
||||
def depth_first_collect_leaf_values(node_idx):
|
||||
node = nodes[node_idx]
|
||||
if node['is_leaf']:
|
||||
values.append(node['value'])
|
||||
return
|
||||
depth_first_collect_leaf_values(node['left'])
|
||||
depth_first_collect_leaf_values(node['right'])
|
||||
|
||||
depth_first_collect_leaf_values(0) # start at root (0)
|
||||
return values
|
||||
|
||||
values = get_leaves_values()
|
||||
|
||||
if monotonic_cst == MonotonicConstraint.NO_CST:
|
||||
# some increasing, some decreasing
|
||||
assert not is_increasing(values) and not is_decreasing(values)
|
||||
elif monotonic_cst == MonotonicConstraint.POS:
|
||||
# all increasing
|
||||
assert is_increasing(values)
|
||||
else: # NEG
|
||||
# all decreasing
|
||||
assert is_decreasing(values)
|
||||
|
||||
|
||||
def assert_children_values_monotonic(predictor, monotonic_cst):
|
||||
# Make sure siblings values respect the monotonic constraints. Left should
|
||||
# be lower (resp greater) than right child if constraint is POS (resp.
|
||||
# NEG).
|
||||
# Note that this property alone isn't enough to ensure full monotonicity,
|
||||
    # since we also need to guarantee that all the descendants of the left
|
||||
# child won't be greater (resp. lower) than the right child, or its
|
||||
    # descendants. That's why we need to bound the predicted values (this is
|
||||
# tested in assert_children_values_bounded)
|
||||
nodes = predictor.nodes
|
||||
left_lower = []
|
||||
left_greater = []
|
||||
for node in nodes:
|
||||
if node['is_leaf']:
|
||||
continue
|
||||
|
||||
left_idx = node['left']
|
||||
right_idx = node['right']
|
||||
|
||||
if nodes[left_idx]['value'] < nodes[right_idx]['value']:
|
||||
left_lower.append(node)
|
||||
elif nodes[left_idx]['value'] > nodes[right_idx]['value']:
|
||||
left_greater.append(node)
|
||||
|
||||
if monotonic_cst == MonotonicConstraint.NO_CST:
|
||||
assert left_lower and left_greater
|
||||
elif monotonic_cst == MonotonicConstraint.POS:
|
||||
assert left_lower and not left_greater
|
||||
else: # NEG
|
||||
assert not left_lower and left_greater
|
||||
|
||||
|
||||
def assert_children_values_bounded(grower, monotonic_cst):
|
||||
# Make sure that the values of the children of a node are bounded by the
|
||||
# middle value between that node and its sibling (if there is a monotonic
|
||||
# constraint).
|
||||
# As a bonus, we also check that the siblings values are properly ordered
|
||||
# which is slightly redundant with assert_children_values_monotonic (but
|
||||
# this check is done on the grower nodes whereas
|
||||
# assert_children_values_monotonic is done on the predictor nodes)
|
||||
|
||||
if monotonic_cst == MonotonicConstraint.NO_CST:
|
||||
return
|
||||
|
||||
def recursively_check_children_node_values(node):
|
||||
if node.is_leaf:
|
||||
return
|
||||
if node is not grower.root and node is node.parent.left_child:
|
||||
sibling = node.sibling # on the right
|
||||
middle = (node.value + sibling.value) / 2
|
||||
if monotonic_cst == MonotonicConstraint.POS:
|
||||
assert (node.left_child.value <=
|
||||
node.right_child.value <=
|
||||
middle)
|
||||
if not sibling.is_leaf:
|
||||
assert (middle <=
|
||||
sibling.left_child.value <=
|
||||
sibling.right_child.value)
|
||||
else: # NEG
|
||||
assert (node.left_child.value >=
|
||||
node.right_child.value >=
|
||||
middle)
|
||||
if not sibling.is_leaf:
|
||||
assert (middle >=
|
||||
sibling.left_child.value >=
|
||||
sibling.right_child.value)
|
||||
|
||||
recursively_check_children_node_values(node.left_child)
|
||||
recursively_check_children_node_values(node.right_child)
|
||||
|
||||
recursively_check_children_node_values(grower.root)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('seed', range(3))
|
||||
@pytest.mark.parametrize('monotonic_cst', (
|
||||
MonotonicConstraint.NO_CST,
|
||||
MonotonicConstraint.POS,
|
||||
MonotonicConstraint.NEG,
|
||||
))
|
||||
def test_nodes_values(monotonic_cst, seed):
|
||||
# Build a single tree with only one feature, and make sure the nodes
|
||||
# values respect the monotonic constraints.
|
||||
|
||||
# Considering the following tree with a monotonic POS constraint, we
|
||||
# should have:
|
||||
#
|
||||
# root
|
||||
# / \
|
||||
# 5 10 # middle = 7.5
|
||||
# / \ / \
|
||||
# a b c d
|
||||
#
|
||||
# a <= b and c <= d (assert_children_values_monotonic)
|
||||
# a, b <= middle <= c, d (assert_children_values_bounded)
|
||||
# a <= b <= c <= d (assert_leaves_values_monotonic)
|
||||
#
|
||||
# The last one is a consequence of the others, but can't hurt to check
|
||||
|
||||
rng = np.random.RandomState(seed)
|
||||
n_samples = 1000
|
||||
n_features = 1
|
||||
X_binned = rng.randint(0, 255, size=(n_samples, n_features),
|
||||
dtype=np.uint8)
|
||||
X_binned = np.asfortranarray(X_binned)
|
||||
|
||||
gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
|
||||
hessians = np.ones(shape=1, dtype=G_H_DTYPE)
|
||||
|
||||
grower = TreeGrower(X_binned, gradients, hessians,
|
||||
monotonic_cst=[monotonic_cst],
|
||||
shrinkage=.1)
|
||||
grower.grow()
|
||||
|
||||
# grow() will shrink the leaves values at the very end. For our comparison
|
||||
# tests, we need to revert the shrinkage of the leaves, else we would
|
||||
# compare the value of a leaf (shrunk) with a node (not shrunk) and the
|
||||
# test would not be correct.
|
||||
    for leaf in grower.finalized_leaves:
|
||||
        leaf.value /= grower.shrinkage
|
||||
|
||||
# The consistency of the bounds can only be checked on the tree grower
|
||||
# as the node bounds are not copied into the predictor tree. The
|
||||
# consistency checks on the values of node children and leaves can be
|
||||
# done either on the grower tree or on the predictor tree. We only
|
||||
# do those checks on the predictor tree as the latter is derived from
|
||||
# the former.
|
||||
predictor = grower.make_predictor()
|
||||
assert_children_values_monotonic(predictor, monotonic_cst)
|
||||
assert_children_values_bounded(grower, monotonic_cst)
|
||||
assert_leaves_values_monotonic(predictor, monotonic_cst)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('seed', range(3))
|
||||
def test_predictions(seed):
|
||||
# Train a model with a POS constraint on the first feature and a NEG
|
||||
# constraint on the second feature, and make sure the constraints are
|
||||
# respected by checking the predictions.
|
||||
# test adapted from lightgbm's test_monotone_constraint(), itself inspired
|
||||
# by https://xgboost.readthedocs.io/en/latest/tutorials/monotonic.html
|
||||
|
||||
rng = np.random.RandomState(seed)
|
||||
|
||||
n_samples = 1000
|
||||
f_0 = rng.rand(n_samples) # positive correlation with y
|
||||
    f_1 = rng.rand(n_samples)  # negative correlation with y
|
||||
X = np.c_[f_0, f_1]
|
||||
noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
|
||||
y = (5 * f_0 + np.sin(10 * np.pi * f_0) -
|
||||
5 * f_1 - np.cos(10 * np.pi * f_1) +
|
||||
noise)
|
||||
|
||||
gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1])
|
||||
gbdt.fit(X, y)
|
||||
|
||||
linspace = np.linspace(0, 1, 100)
|
||||
sin = np.sin(linspace)
|
||||
constant = np.full_like(linspace, fill_value=.5)
|
||||
|
||||
# We now assert the predictions properly respect the constraints, on each
|
||||
# feature. When testing for a feature we need to set the other one to a
|
||||
    # constant, because the monotonic constraints are only an "all else being
|
||||
# equal" type of constraints:
|
||||
# a constraint on the first feature only means that
|
||||
# x0 < x0' => f(x0, x1) < f(x0', x1)
|
||||
# while x1 stays constant.
|
||||
    # The constraint does not guarantee that
|
||||
# x0 < x0' => f(x0, x1) < f(x0', x1')
|
||||
|
||||
# First feature (POS)
|
||||
# assert pred is all increasing when f_0 is all increasing
|
||||
X = np.c_[linspace, constant]
|
||||
pred = gbdt.predict(X)
|
||||
assert is_increasing(pred)
|
||||
# assert pred actually follows the variations of f_0
|
||||
X = np.c_[sin, constant]
|
||||
pred = gbdt.predict(X)
|
||||
assert np.all((np.diff(pred) >= 0) == (np.diff(sin) >= 0))
|
||||
|
||||
# Second feature (NEG)
|
||||
# assert pred is all decreasing when f_1 is all increasing
|
||||
X = np.c_[constant, linspace]
|
||||
pred = gbdt.predict(X)
|
||||
assert is_decreasing(pred)
|
||||
# assert pred actually follows the inverse variations of f_1
|
||||
X = np.c_[constant, sin]
|
||||
pred = gbdt.predict(X)
|
||||
assert ((np.diff(pred) <= 0) == (np.diff(sin) >= 0)).all()
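

# The two blocks above can be phrased as a generic "all else being equal"
# check: sweep one feature over an increasing grid while the other is held
# constant, and verify that the predictions move in the constrained
# direction.  Hypothetical helper, not part of the estimator API
# (illustrative only):
def _check_monotonic_sketch(model, feature_idx, increasing, other_value=.5,
                            n_points=100):
    grid = np.linspace(0, 1, n_points)
    X_probe = np.full((n_points, 2), other_value)
    X_probe[:, feature_idx] = grid
    diffs = np.diff(model.predict(X_probe))
    return diffs.min() >= 0 if increasing else diffs.max() <= 0
# For the model fitted above, both _check_monotonic_sketch(gbdt, 0, True)
# and _check_monotonic_sketch(gbdt, 1, False) are expected to hold.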
|
||||
|
||||
|
||||
def test_input_error():
|
||||
X = [[1, 2], [2, 3], [3, 4]]
|
||||
y = [0, 1, 2]
|
||||
|
||||
gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, 0, -1])
|
||||
with pytest.raises(ValueError,
|
||||
match='monotonic_cst has shape 3 but the input data'):
|
||||
gbdt.fit(X, y)
|
||||
|
||||
for monotonic_cst in ([1, 3], [1, -3]):
|
||||
gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
|
||||
with pytest.raises(ValueError,
|
||||
match='must be None or an array-like of '
|
||||
'-1, 0 or 1'):
|
||||
gbdt.fit(X, y)
|
||||
|
||||
gbdt = HistGradientBoostingClassifier(monotonic_cst=[0, 1])
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match='monotonic constraints are not supported '
|
||||
'for multiclass classification'
|
||||
):
|
||||
gbdt.fit(X, y)
|
||||
|
||||
|
||||
def test_bounded_value_min_gain_to_split():
|
||||
# The purpose of this test is to show that when computing the gain at a
|
||||
# given split, the value of the current node should be properly bounded to
|
||||
# respect the monotonic constraints, because it strongly interacts with
|
||||
# min_gain_to_split. We build a simple example where gradients are [1, 1,
|
||||
# 100, 1, 1] (hessians are all ones). The best split happens on the 3rd
|
||||
# bin, and depending on whether the value of the node is bounded or not,
|
||||
# the min_gain_to_split constraint is or isn't satisfied.
|
||||
l2_regularization = 0
|
||||
min_hessian_to_split = 0
|
||||
min_samples_leaf = 1
|
||||
n_bins = n_samples = 5
|
||||
X_binned = np.arange(n_samples).reshape(-1, 1).astype(X_BINNED_DTYPE)
|
||||
sample_indices = np.arange(n_samples, dtype=np.uint32)
|
||||
all_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
|
||||
all_gradients = np.array([1, 1, 100, 1, 1], dtype=G_H_DTYPE)
|
||||
sum_gradients = all_gradients.sum()
|
||||
sum_hessians = all_hessians.sum()
|
||||
hessians_are_constant = False
|
||||
|
||||
builder = HistogramBuilder(X_binned, n_bins, all_gradients,
|
||||
all_hessians, hessians_are_constant)
|
||||
n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1],
|
||||
dtype=np.uint32)
|
||||
has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
|
||||
monotonic_cst = np.array(
|
||||
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
|
||||
dtype=np.int8)
|
||||
missing_values_bin_idx = n_bins - 1
|
||||
children_lower_bound, children_upper_bound = -np.inf, np.inf
|
||||
|
||||
min_gain_to_split = 2000
|
||||
splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx,
|
||||
has_missing_values, monotonic_cst, l2_regularization,
|
||||
min_hessian_to_split, min_samples_leaf,
|
||||
min_gain_to_split, hessians_are_constant)
|
||||
|
||||
histograms = builder.compute_histograms_brute(sample_indices)
|
||||
|
||||
# Since the gradient array is [1, 1, 100, 1, 1]
|
||||
# the max possible gain happens on the 3rd bin (or equivalently in the 2nd)
|
||||
    # and is equal to about 1307, which is less than min_gain_to_split = 2000,
    # so the node is considered unsplittable (gain = -1)
|
||||
    current_lower_bound, current_upper_bound = -np.inf, np.inf
|
||||
    value = compute_node_value(sum_gradients, sum_hessians,
|
||||
                               current_lower_bound, current_upper_bound,
|
||||
                               l2_regularization)
|
||||
    # the unbounded value is equal to -sum_gradients / sum_hessians
|
||||
    assert value == pytest.approx(-104 / 5)
|
||||
    split_info = splitter.find_node_split(n_samples, histograms,
|
||||
                                          sum_gradients, sum_hessians, value,
|
||||
                                          lower_bound=children_lower_bound,
|
||||
                                          upper_bound=children_upper_bound)
|
||||
    assert split_info.gain == -1  # min_gain_to_split not respected
|
||||
|
||||
|
||||
# here again the max possible gain is on the 3rd bin but we now cap the
|
||||
# value of the node into [-10, inf].
|
||||
# This means the gain is now about 2430 which is more than the
|
||||
# min_gain_to_split constraint.
|
||||
current_lower_bound, current_upper_bound = -10, np.inf
|
||||
value = compute_node_value(sum_gradients, sum_hessians,
|
||||
current_lower_bound, current_upper_bound,
|
||||
l2_regularization)
|
||||
assert value == -10
|
||||
split_info = splitter.find_node_split(n_samples, histograms,
|
||||
sum_gradients, sum_hessians, value,
|
||||
lower_bound=children_lower_bound,
|
||||
upper_bound=children_upper_bound)
|
||||
assert split_info.gain > min_gain_to_split
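

# The "about 1307" and "about 2430" figures quoted above can be reproduced by
# hand.  With gradients [1, 1, 100, 1, 1] and unit hessians, the best split
# puts the first three samples on the left: G_L, H_L = 102, 3 and
# G_R, H_R = 2, 2.  The gain is consistent with the usual histogram-GBDT
# expression where the parent term is sum_gradients times the (possibly
# bounded) node value (back-of-the-envelope check, illustrative only):
def _gain_sketch(g_left, h_left, g_right, h_right, sum_gradients, value,
                 l2_regularization=0.):
    return (g_left ** 2 / (h_left + l2_regularization)
            + g_right ** 2 / (h_right + l2_regularization)
            + sum_gradients * value)


assert np.isclose(_gain_sketch(102, 3, 2, 2, 104, -104 / 5), 1306.8)  # < 2000
assert np.isclose(_gain_sketch(102, 3, 2, 2, 104, -10), 2430)  # > 2000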
|
|
@@ -0,0 +1,76 @@
|
|||
import numpy as np
|
||||
from sklearn.datasets import make_regression
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import r2_score
|
||||
import pytest
|
||||
|
||||
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
|
||||
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
|
||||
from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import (
|
||||
G_H_DTYPE, PREDICTOR_RECORD_DTYPE, ALMOST_INF)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('n_bins', [200, 256])
|
||||
def test_regression_dataset(n_bins):
|
||||
X, y = make_regression(n_samples=500, n_features=10, n_informative=5,
|
||||
random_state=42)
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
X, y, random_state=42)
|
||||
|
||||
mapper = _BinMapper(n_bins=n_bins, random_state=42)
|
||||
X_train_binned = mapper.fit_transform(X_train)
|
||||
|
||||
# Init gradients and hessians to that of least squares loss
|
||||
gradients = -y_train.astype(G_H_DTYPE)
|
||||
hessians = np.ones(1, dtype=G_H_DTYPE)
|
||||
|
||||
min_samples_leaf = 10
|
||||
max_leaf_nodes = 30
|
||||
grower = TreeGrower(X_train_binned, gradients, hessians,
|
||||
min_samples_leaf=min_samples_leaf,
|
||||
max_leaf_nodes=max_leaf_nodes, n_bins=n_bins,
|
||||
n_bins_non_missing=mapper.n_bins_non_missing_)
|
||||
grower.grow()
|
||||
|
||||
predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)
|
||||
|
||||
assert r2_score(y_train, predictor.predict(X_train)) > 0.82
|
||||
assert r2_score(y_test, predictor.predict(X_test)) > 0.67
|
||||
|
||||
|
||||
@pytest.mark.parametrize('threshold, expected_predictions', [
|
||||
(-np.inf, [0, 1, 1, 1]),
|
||||
(10, [0, 0, 1, 1]),
|
||||
(20, [0, 0, 0, 1]),
|
||||
(ALMOST_INF, [0, 0, 0, 1]),
|
||||
(np.inf, [0, 0, 0, 0]),
|
||||
])
|
||||
def test_infinite_values_and_thresholds(threshold, expected_predictions):
|
||||
# Make sure infinite values and infinite thresholds are handled properly.
|
||||
# In particular, if a value is +inf and the threshold is ALMOST_INF the
|
||||
# sample should go to the right child. If the threshold is inf (split on
|
||||
# nan), the +inf sample will go to the left child.
|
||||
|
||||
X = np.array([-np.inf, 10, 20, np.inf]).reshape(-1, 1)
|
||||
nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE)
|
||||
|
||||
# We just construct a simple tree with 1 root and 2 children
|
||||
# parent node
|
||||
nodes[0]['left'] = 1
|
||||
nodes[0]['right'] = 2
|
||||
nodes[0]['feature_idx'] = 0
|
||||
nodes[0]['threshold'] = threshold
|
||||
|
||||
# left child
|
||||
nodes[1]['is_leaf'] = True
|
||||
nodes[1]['value'] = 0
|
||||
|
||||
# right child
|
||||
nodes[2]['is_leaf'] = True
|
||||
nodes[2]['value'] = 1
|
||||
|
||||
predictor = TreePredictor(nodes)
|
||||
predictions = predictor.predict(X)
|
||||
|
||||
assert np.all(predictions == expected_predictions)
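

# All the expected predictions above follow from a single comparison at the
# root node: a sample goes to the left child when its value is <= threshold.
# Hence +inf goes to the right of an ALMOST_INF threshold, but to the left of
# an inf threshold (the split-on-nan case).  Sketch of that rule
# (illustrative only, nan handling omitted):
def _go_left_sketch(value, threshold):
    return value <= threshold


assert not _go_left_sketch(np.inf, ALMOST_INF)  # +inf sample -> right child
assert _go_left_sketch(np.inf, np.inf)  # split on nan -> left child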
|
|
@@ -0,0 +1,480 @@
|
|||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint
|
||||
from sklearn.ensemble._hist_gradient_boosting.splitting import (
|
||||
Splitter,
|
||||
compute_node_value
|
||||
)
|
||||
from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder
|
||||
from sklearn.utils._testing import skip_if_32bit
|
||||
|
||||
|
||||
@pytest.mark.parametrize('n_bins', [3, 32, 256])
|
||||
def test_histogram_split(n_bins):
|
||||
rng = np.random.RandomState(42)
|
||||
feature_idx = 0
|
||||
l2_regularization = 0
|
||||
min_hessian_to_split = 1e-3
|
||||
min_samples_leaf = 1
|
||||
min_gain_to_split = 0.
|
||||
X_binned = np.asfortranarray(
|
||||
rng.randint(0, n_bins - 1, size=(int(1e4), 1)), dtype=X_BINNED_DTYPE)
|
||||
binned_feature = X_binned.T[feature_idx]
|
||||
sample_indices = np.arange(binned_feature.shape[0], dtype=np.uint32)
|
||||
ordered_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE)
|
||||
all_hessians = ordered_hessians
|
||||
sum_hessians = all_hessians.sum()
|
||||
hessians_are_constant = False
|
||||
|
||||
for true_bin in range(1, n_bins - 2):
|
||||
for sign in [-1, 1]:
|
||||
ordered_gradients = np.full_like(binned_feature, sign,
|
||||
dtype=G_H_DTYPE)
|
||||
ordered_gradients[binned_feature <= true_bin] *= -1
|
||||
all_gradients = ordered_gradients
|
||||
sum_gradients = all_gradients.sum()
|
||||
|
||||
builder = HistogramBuilder(X_binned,
|
||||
n_bins,
|
||||
all_gradients,
|
||||
all_hessians,
|
||||
hessians_are_constant)
|
||||
n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1],
|
||||
dtype=np.uint32)
|
||||
has_missing_values = np.array([False] * X_binned.shape[1],
|
||||
dtype=np.uint8)
|
||||
monotonic_cst = np.array(
|
||||
[MonotonicConstraint.NO_CST] * X_binned.shape[1],
|
||||
dtype=np.int8)
|
||||
missing_values_bin_idx = n_bins - 1
|
||||
splitter = Splitter(X_binned,
|
||||
n_bins_non_missing,
|
||||
missing_values_bin_idx,
|
||||
has_missing_values,
|
||||
monotonic_cst,
|
||||
l2_regularization,
|
||||
min_hessian_to_split,
|
||||
min_samples_leaf, min_gain_to_split,
|
||||
hessians_are_constant)
|
||||
|
||||
histograms = builder.compute_histograms_brute(sample_indices)
|
||||
value = compute_node_value(sum_gradients, sum_hessians,
|
||||
-np.inf, np.inf, l2_regularization)
|
||||
split_info = splitter.find_node_split(
|
||||
sample_indices.shape[0], histograms, sum_gradients,
|
||||
sum_hessians, value)
|
||||
|
||||
assert split_info.bin_idx == true_bin
|
||||
assert split_info.gain >= 0
|
||||
assert split_info.feature_idx == feature_idx
|
||||
assert (split_info.n_samples_left + split_info.n_samples_right
|
||||
== sample_indices.shape[0])
|
||||
# Constant hessian: 1. per sample.
|
||||
assert split_info.n_samples_left == split_info.sum_hessian_left
|
||||
|
||||
|
||||
@skip_if_32bit
|
||||
@pytest.mark.parametrize('constant_hessian', [True, False])
|
||||
def test_gradient_and_hessian_sanity(constant_hessian):
|
||||
# This test checks that the values of gradients and hessians are
|
||||
# consistent in different places:
|
||||
# - in split_info: si.sum_gradient_left + si.sum_gradient_right must be
|
||||
# equal to the gradient at the node. Same for hessians.
|
||||
# - in the histograms: summing 'sum_gradients' over the bins must be
|
||||
# constant across all features, and those sums must be equal to the
|
||||
# node's gradient. Same for hessians.
|
||||
|
||||
rng = np.random.RandomState(42)
|
||||
|
||||
n_bins = 10
|
||||
n_features = 20
|
||||
n_samples = 500
|
||||
l2_regularization = 0.
|
||||
min_hessian_to_split = 1e-3
|
||||
min_samples_leaf = 1
|
||||
min_gain_to_split = 0.
|
||||
|
||||
X_binned = rng.randint(0, n_bins, size=(n_samples, n_features),
|
||||
dtype=X_BINNED_DTYPE)
|
||||
X_binned = np.asfortranarray(X_binned)
|
||||
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    if constant_hessian:
        all_hessians = np.ones(1, dtype=G_H_DTYPE)
        sum_hessians = 1 * n_samples
    else:
        all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
        sum_hessians = all_hessians.sum()

    builder = HistogramBuilder(X_binned, n_bins, all_gradients,
                               all_hessians, constant_hessian)
    n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1],
                                  dtype=np.uint32)
    has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1],
        dtype=np.int8)
    missing_values_bin_idx = n_bins - 1
    splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx,
                        has_missing_values, monotonic_cst, l2_regularization,
                        min_hessian_to_split, min_samples_leaf,
                        min_gain_to_split, constant_hessian)

    hists_parent = builder.compute_histograms_brute(sample_indices)
    value_parent = compute_node_value(sum_gradients, sum_hessians,
                                      -np.inf, np.inf, l2_regularization)
    si_parent = splitter.find_node_split(n_samples, hists_parent,
                                         sum_gradients, sum_hessians,
                                         value_parent)
    sample_indices_left, sample_indices_right, _ = splitter.split_indices(
        si_parent, sample_indices)

    hists_left = builder.compute_histograms_brute(sample_indices_left)
    value_left = compute_node_value(si_parent.sum_gradient_left,
                                    si_parent.sum_hessian_left,
                                    -np.inf, np.inf, l2_regularization)
    hists_right = builder.compute_histograms_brute(sample_indices_right)
    value_right = compute_node_value(si_parent.sum_gradient_right,
                                     si_parent.sum_hessian_right,
                                     -np.inf, np.inf, l2_regularization)
    si_left = splitter.find_node_split(n_samples, hists_left,
                                       si_parent.sum_gradient_left,
                                       si_parent.sum_hessian_left,
                                       value_left)
    si_right = splitter.find_node_split(n_samples, hists_right,
                                        si_parent.sum_gradient_right,
                                        si_parent.sum_hessian_right,
                                        value_right)

    # make sure that si.sum_gradient_left + si.sum_gradient_right have their
    # expected value, same for hessians
    for si, indices in (
            (si_parent, sample_indices),
            (si_left, sample_indices_left),
            (si_right, sample_indices_right)):
        gradient = si.sum_gradient_right + si.sum_gradient_left
        expected_gradient = all_gradients[indices].sum()
        hessian = si.sum_hessian_right + si.sum_hessian_left
        if constant_hessian:
            expected_hessian = indices.shape[0] * all_hessians[0]
        else:
            expected_hessian = all_hessians[indices].sum()

        assert np.isclose(gradient, expected_gradient)
        assert np.isclose(hessian, expected_hessian)

    # make sure sum of gradients in histograms are the same for all features,
    # and make sure they're equal to their expected value
    hists_parent = np.asarray(hists_parent, dtype=HISTOGRAM_DTYPE)
    hists_left = np.asarray(hists_left, dtype=HISTOGRAM_DTYPE)
    hists_right = np.asarray(hists_right, dtype=HISTOGRAM_DTYPE)
    for hists, indices in (
            (hists_parent, sample_indices),
            (hists_left, sample_indices_left),
            (hists_right, sample_indices_right)):
        # note: gradients and hessians have shape (n_features,),
        # we're comparing them to *scalars*. This has the benefit of also
        # making sure that all the entries are equal across features.
        gradients = hists['sum_gradients'].sum(axis=1)  # shape = (n_features,)
        expected_gradient = all_gradients[indices].sum()  # scalar
        hessians = hists['sum_hessians'].sum(axis=1)
        if constant_hessian:
            # 0 is not the actual hessian, but it's not computed in this case
            expected_hessian = 0.
        else:
            expected_hessian = all_hessians[indices].sum()

        assert np.allclose(gradients, expected_gradient)
        assert np.allclose(hessians, expected_hessian)

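# Editor's note (illustrative sketch, not part of the original test module):
# the histogram invariant checked above boils down to the fact that, for any
# node, summing a feature's histogram over its bins recovers the sum of the
# gradients of the samples sitting at that node. The helper below is a
# hypothetical, pure-numpy restatement of that invariant; the name and
# signature are ours, and only `np` (numpy), already imported in this module,
# is assumed.


def _naive_histogram_gradient_sum(x_binned_column, all_gradients, n_bins):
    # Accumulate gradients bin by bin, then check that the per-bin sums add
    # up to the plain sum of gradients over all samples of the node.
    per_bin = np.zeros(n_bins)
    for binned_value, gradient in zip(x_binned_column, all_gradients):
        per_bin[binned_value] += gradient
    assert np.isclose(per_bin.sum(), all_gradients.sum())
    return per_bin

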
def test_split_indices():
    # Check that split_indices returns the correct splits and that
    # splitter.partition is consistent with what is returned.
    rng = np.random.RandomState(421)

    n_bins = 5
    n_samples = 10
    l2_regularization = 0.
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.

    # split will happen on feature 1 and on bin 3
    X_binned = [[0, 0],
                [0, 3],
                [0, 4],
                [0, 0],
                [0, 0],
                [0, 0],
                [0, 0],
                [0, 4],
                [0, 0],
                [0, 4]]
    X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE)
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
    all_hessians = np.ones(1, dtype=G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    sum_hessians = 1 * n_samples
    hessians_are_constant = True

    builder = HistogramBuilder(X_binned, n_bins,
                               all_gradients, all_hessians,
                               hessians_are_constant)
    n_bins_non_missing = np.array([n_bins] * X_binned.shape[1],
                                  dtype=np.uint32)
    has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1],
        dtype=np.int8)
    missing_values_bin_idx = n_bins - 1
    splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx,
                        has_missing_values, monotonic_cst, l2_regularization,
                        min_hessian_to_split, min_samples_leaf,
                        min_gain_to_split, hessians_are_constant)

    assert np.all(sample_indices == splitter.partition)

    histograms = builder.compute_histograms_brute(sample_indices)
    value = compute_node_value(sum_gradients, sum_hessians,
                               -np.inf, np.inf, l2_regularization)
    si_root = splitter.find_node_split(n_samples, histograms,
                                       sum_gradients, sum_hessians, value)

    # sanity checks for best split
    assert si_root.feature_idx == 1
    assert si_root.bin_idx == 3

    samples_left, samples_right, position_right = splitter.split_indices(
        si_root, splitter.partition)
    assert set(samples_left) == set([0, 1, 3, 4, 5, 6, 8])
    assert set(samples_right) == set([2, 7, 9])

    assert list(samples_left) == list(splitter.partition[:position_right])
    assert list(samples_right) == list(splitter.partition[position_right:])

    # Check that the resulting split indices sizes are consistent with the
    # count statistics anticipated when looking for the best split.
    assert samples_left.shape[0] == si_root.n_samples_left
    assert samples_right.shape[0] == si_root.n_samples_right

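# Editor's note (illustrative sketch, not part of the original test module):
# conceptually, split_indices() partitions the node's sample indices around
# the best split's (feature_idx, bin_idx) pair; the real Cython code does this
# in place, inside splitter.partition. The hypothetical helper below ignores
# missing values and only illustrates the routing rule exercised by
# test_split_indices above.


def _naive_split_indices(X_binned, sample_indices, feature_idx, bin_idx):
    # Samples whose binned value is <= bin_idx go to the left child,
    # the others go to the right child.
    binned_column = X_binned[sample_indices, feature_idx]
    left_mask = binned_column <= bin_idx
    return sample_indices[left_mask], sample_indices[~left_mask]

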
def test_min_gain_to_split():
    # Try to split a pure node (all gradients are equal, same for hessians)
    # with min_gain_to_split = 0 and make sure that the node is not split
    # (best possible gain = -1). Note: before the strict inequality
    # comparison, this test would fail because the node would be split with a
    # gain of 0.
    rng = np.random.RandomState(42)
    l2_regularization = 0
    min_hessian_to_split = 0
    min_samples_leaf = 1
    min_gain_to_split = 0.
    n_bins = 255
    n_samples = 100
    X_binned = np.asfortranarray(
        rng.randint(0, n_bins, size=(n_samples, 1)), dtype=X_BINNED_DTYPE)
    binned_feature = X_binned[:, 0]
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE)
    all_gradients = np.ones_like(binned_feature, dtype=G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    sum_hessians = all_hessians.sum()
    hessians_are_constant = False

    builder = HistogramBuilder(X_binned, n_bins, all_gradients,
                               all_hessians, hessians_are_constant)
    n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1],
                                  dtype=np.uint32)
    has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1],
        dtype=np.int8)
    missing_values_bin_idx = n_bins - 1
    splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx,
                        has_missing_values, monotonic_cst, l2_regularization,
                        min_hessian_to_split, min_samples_leaf,
                        min_gain_to_split, hessians_are_constant)

    histograms = builder.compute_histograms_brute(sample_indices)
    value = compute_node_value(sum_gradients, sum_hessians,
                               -np.inf, np.inf, l2_regularization)
    split_info = splitter.find_node_split(n_samples, histograms,
                                          sum_gradients, sum_hessians, value)
    assert split_info.gain == -1

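# Editor's note (illustrative sketch, not part of the original test module):
# the reason a pure node cannot be split is visible in the usual second-order
# gain formula used by histogram GBDTs (the actual Cython implementation also
# handles monotonic constraints and may differ by a constant factor):
#
#   gain = G_L^2 / (H_L + l2) + G_R^2 / (H_R + l2)
#          - (G_L + G_R)^2 / (H_L + H_R + l2)
#
# When all gradients are equal, hessians are constant and l2_regularization
# is 0 (as in the test above), every candidate split yields gain == 0, so
# with a strict comparison against min_gain_to_split no split is retained and
# the reported gain stays at -1. The helper below is a hypothetical
# restatement of that formula, not the library's code.


def _naive_split_gain(g_left, h_left, g_right, h_right, l2_regularization):
    def negative_loss(g, h):
        return g ** 2 / (h + l2_regularization)

    return (negative_loss(g_left, h_left) + negative_loss(g_right, h_right)
            - negative_loss(g_left + g_right, h_left + h_right))

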
@pytest.mark.parametrize(
    'X_binned, all_gradients, has_missing_values, n_bins_non_missing, '
    ' expected_split_on_nan, expected_bin_idx, expected_go_to_left', [

        # basic sanity check with no missing values: given the gradient
        # values, the split must occur on bin_idx=3
        ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],  # X_binned
         [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],  # gradients
         False,  # no missing values
         10,  # n_bins_non_missing
         False,  # don't split on nans
         3,  # expected_bin_idx
         'not_applicable'),

        # We replace 2 samples by NaNs (bin_idx=8)
        # These 2 samples were mapped to the left node before, so they should
        # be mapped to the left node again.
        # Notice how the bin_idx threshold changes from 3 to 1.
        ([8, 0, 1, 8, 2, 3, 4, 5, 6, 7],  # 8 <=> missing
         [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
         True,  # missing values
         8,  # n_bins_non_missing
         False,  # don't split on nans
         1,  # cut on bin_idx=1
         True),  # missing values go to left

        # same as above, but with non-consecutive missing_values_bin
        ([9, 0, 1, 9, 2, 3, 4, 5, 6, 7],  # 9 <=> missing
         [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
         True,  # missing values
         8,  # n_bins_non_missing
         False,  # don't split on nans
         1,  # cut on bin_idx=1
         True),  # missing values go to left

        # this time replacing 2 samples that were on the right.
        ([0, 1, 2, 3, 8, 4, 8, 5, 6, 7],  # 8 <=> missing
         [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
         True,  # missing values
         8,  # n_bins_non_missing
         False,  # don't split on nans
         3,  # cut on bin_idx=3 (like in first case)
         False),  # missing values go to right

        # same as above, but with non-consecutive missing_values_bin
        ([0, 1, 2, 3, 9, 4, 9, 5, 6, 7],  # 9 <=> missing
         [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
         True,  # missing values
         8,  # n_bins_non_missing
         False,  # don't split on nans
         3,  # cut on bin_idx=3 (like in first case)
         False),  # missing values go to right

        # For the following cases, split_on_nans is True (we replace all of
        # the samples with nans, instead of just 2).
        ([0, 1, 2, 3, 4, 4, 4, 4, 4, 4],  # 4 <=> missing
         [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
         True,  # missing values
         4,  # n_bins_non_missing
         True,  # split on nans
         3,  # cut on bin_idx=3
         False),  # missing values go to right

        # same as above, but with non-consecutive missing_values_bin
        ([0, 1, 2, 3, 9, 9, 9, 9, 9, 9],  # 9 <=> missing
         [1, 1, 1, 1, 1, 1, 5, 5, 5, 5],
         True,  # missing values
         4,  # n_bins_non_missing
         True,  # split on nans
         3,  # cut on bin_idx=3
         False),  # missing values go to right

        ([6, 6, 6, 6, 0, 1, 2, 3, 4, 5],  # 6 <=> missing
         [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
         True,  # missing values
         6,  # n_bins_non_missing
         True,  # split on nans
         5,  # cut on bin_idx=5
         False),  # missing values go to right

        # same as above, but with non-consecutive missing_values_bin
        ([9, 9, 9, 9, 0, 1, 2, 3, 4, 5],  # 9 <=> missing
         [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
         True,  # missing values
         6,  # n_bins_non_missing
         True,  # split on nans
         5,  # cut on bin_idx=5
         False),  # missing values go to right
    ]
)
def test_splitting_missing_values(X_binned, all_gradients,
                                  has_missing_values, n_bins_non_missing,
                                  expected_split_on_nan, expected_bin_idx,
                                  expected_go_to_left):
    # Make sure missing values are properly supported.
    # We build an artificial example with gradients such that the best split
    # is on bin_idx=3, when there are no missing values.
    # Then we introduce missing values and:
    #   - make sure the chosen bin is correct (find_best_bin()): it's
    #     still the same split, even though the index of the bin may change
    #   - make sure the missing values are mapped to the correct child
    #     (split_indices())

    n_bins = max(X_binned) + 1
    n_samples = len(X_binned)
    l2_regularization = 0.
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.

    sample_indices = np.arange(n_samples, dtype=np.uint32)
    X_binned = np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1)
    X_binned = np.asfortranarray(X_binned)
    all_gradients = np.array(all_gradients, dtype=G_H_DTYPE)
    has_missing_values = np.array([has_missing_values], dtype=np.uint8)
    all_hessians = np.ones(1, dtype=G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    sum_hessians = 1 * n_samples
    hessians_are_constant = True

    builder = HistogramBuilder(X_binned, n_bins,
                               all_gradients, all_hessians,
                               hessians_are_constant)

    n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1],
        dtype=np.int8)
    missing_values_bin_idx = n_bins - 1
    splitter = Splitter(X_binned, n_bins_non_missing,
                        missing_values_bin_idx, has_missing_values,
                        monotonic_cst,
                        l2_regularization, min_hessian_to_split,
                        min_samples_leaf, min_gain_to_split,
                        hessians_are_constant)

    histograms = builder.compute_histograms_brute(sample_indices)
    value = compute_node_value(sum_gradients, sum_hessians,
                               -np.inf, np.inf, l2_regularization)
    split_info = splitter.find_node_split(n_samples, histograms,
                                          sum_gradients, sum_hessians, value)

    assert split_info.bin_idx == expected_bin_idx
    if has_missing_values:
        assert split_info.missing_go_to_left == expected_go_to_left

    split_on_nan = split_info.bin_idx == n_bins_non_missing[0] - 1
    assert split_on_nan == expected_split_on_nan

    # Make sure the split is properly computed.
    # This also makes sure missing values are properly assigned to the correct
    # child in split_indices().
    samples_left, samples_right, _ = splitter.split_indices(
        split_info, splitter.partition)

    if not expected_split_on_nan:
        # When we don't split on nans, the split should always be the same.
        assert set(samples_left) == set([0, 1, 2, 3])
        assert set(samples_right) == set([4, 5, 6, 7, 8, 9])
    else:
        # When we split on nans, samples with missing values are always mapped
        # to the right child.
        missing_samples_indices = np.flatnonzero(
            np.array(X_binned) == missing_values_bin_idx)
        non_missing_samples_indices = np.flatnonzero(
            np.array(X_binned) != missing_values_bin_idx)

        assert set(samples_right) == set(missing_samples_indices)
        assert set(samples_left) == set(non_missing_samples_indices)
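# Editor's note (illustrative sketch, not part of the original test module):
# the routing rule exercised above is that non-missing samples are compared to
# the split's bin_idx, while samples falling in the dedicated missing-values
# bin follow the learned missing_go_to_left direction. The helper below is a
# hypothetical, vectorized restatement of that rule (the real split_indices()
# works in place on splitter.partition); `binned_column` is assumed to be
# aligned with `sample_indices`.


def _naive_route_with_missing(binned_column, sample_indices, bin_idx,
                              missing_values_bin_idx, missing_go_to_left):
    is_missing = binned_column == missing_values_bin_idx
    go_left = (binned_column <= bin_idx) & ~is_missing
    if missing_go_to_left:
        go_left |= is_missing
    return sample_indices[go_left], sample_indices[~go_left]
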
@@ -0,0 +1,206 @@
import numpy as np
from numpy.testing import assert_array_equal
from numpy.testing import assert_allclose

import pytest

from sklearn.base import clone
from sklearn.datasets import make_classification, make_regression

# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import check_scoring


X_classification, y_classification = make_classification(random_state=0)
X_regression, y_regression = make_regression(random_state=0)

def _assert_predictor_equal(gb_1, gb_2, X):
    """Assert that two HistGBM instances are identical."""
    # Check identical nodes for each tree
    for (pred_ith_1, pred_ith_2) in zip(gb_1._predictors, gb_2._predictors):
        for (predictor_1, predictor_2) in zip(pred_ith_1, pred_ith_2):
            assert_array_equal(predictor_1.nodes, predictor_2.nodes)

    # Check identical predictions
    assert_allclose(gb_1.predict(X), gb_2.predict(X))

@pytest.mark.parametrize('GradientBoosting, X, y', [
    (HistGradientBoostingClassifier, X_classification, y_classification),
    (HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_max_iter_with_warm_start_validation(GradientBoosting, X, y):
    # Check that a ValueError is raised when the maximum number of iterations
    # is smaller than the number of iterations from the previous fit when warm
    # start is True.

    estimator = GradientBoosting(max_iter=10, early_stopping=False,
                                 warm_start=True)
    estimator.fit(X, y)
    estimator.set_params(max_iter=5)
    err_msg = ('max_iter=5 must be larger than or equal to n_iter_=10 '
               'when warm_start==True')
    with pytest.raises(ValueError, match=err_msg):
        estimator.fit(X, y)

@pytest.mark.parametrize('GradientBoosting, X, y', [
    (HistGradientBoostingClassifier, X_classification, y_classification),
    (HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_yields_identical_results(GradientBoosting, X, y):
    # Make sure that fitting 50 iterations and then 25 with warm start is
    # equivalent to fitting 75 iterations.

    rng = 42
    gb_warm_start = GradientBoosting(
        n_iter_no_change=100, max_iter=50, random_state=rng, warm_start=True
    )
    gb_warm_start.fit(X, y).set_params(max_iter=75).fit(X, y)

    gb_no_warm_start = GradientBoosting(
        n_iter_no_change=100, max_iter=75, random_state=rng, warm_start=False
    )
    gb_no_warm_start.fit(X, y)

    # Check that both predictors are equal
    _assert_predictor_equal(gb_warm_start, gb_no_warm_start, X)

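# Editor's note (illustrative usage sketch, not part of the original tests):
# the warm-start pattern checked above looks like this in user code. Only
# public scikit-learn APIs and the module-level data defined at the top of
# this file are used; the function name is ours.


def _warm_start_usage_example():
    # Fit 50 trees, then grow the same ensemble to 75 trees instead of
    # refitting from scratch. early_stopping is disabled so the number of
    # iterations is deterministic.
    gb = HistGradientBoostingRegressor(max_iter=50, warm_start=True,
                                       early_stopping=False, random_state=0)
    gb.fit(X_regression, y_regression)
    gb.set_params(max_iter=75)
    gb.fit(X_regression, y_regression)  # adds 25 more trees
    return gb

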
@pytest.mark.parametrize('GradientBoosting, X, y', [
    (HistGradientBoostingClassifier, X_classification, y_classification),
    (HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_max_depth(GradientBoosting, X, y):
    # Test that it is possible to fit trees of different depth in the same
    # ensemble.
    gb = GradientBoosting(max_iter=20, min_samples_leaf=1,
                          warm_start=True, max_depth=2, early_stopping=False)
    gb.fit(X, y)
    gb.set_params(max_iter=30, max_depth=3, n_iter_no_change=110)
    gb.fit(X, y)

    # First 20 trees have max_depth == 2
    for i in range(20):
        assert gb._predictors[i][0].get_max_depth() == 2
    # Last 10 trees have max_depth == 3
    for i in range(1, 11):
        assert gb._predictors[-i][0].get_max_depth() == 3

@pytest.mark.parametrize('GradientBoosting, X, y', [
    (HistGradientBoostingClassifier, X_classification, y_classification),
    (HistGradientBoostingRegressor, X_regression, y_regression)
])
@pytest.mark.parametrize('scoring', (None, 'loss'))
def test_warm_start_early_stopping(GradientBoosting, X, y, scoring):
    # Make sure that early stopping occurs after a small number of iterations
    # when fitting a second time with warm starting.

    n_iter_no_change = 5
    gb = GradientBoosting(
        n_iter_no_change=n_iter_no_change, max_iter=10000, early_stopping=True,
        random_state=42, warm_start=True, tol=1e-3, scoring=scoring,
    )
    gb.fit(X, y)
    n_iter_first_fit = gb.n_iter_
    gb.fit(X, y)
    n_iter_second_fit = gb.n_iter_
    assert 0 < n_iter_second_fit - n_iter_first_fit < n_iter_no_change

@pytest.mark.parametrize('GradientBoosting, X, y', [
    (HistGradientBoostingClassifier, X_classification, y_classification),
    (HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_equal_n_estimators(GradientBoosting, X, y):
    # Test that warm starting with an unchanged number of iterations does
    # nothing.
    gb_1 = GradientBoosting(max_depth=2, early_stopping=False)
    gb_1.fit(X, y)

    gb_2 = clone(gb_1)
    gb_2.set_params(max_iter=gb_1.max_iter, warm_start=True,
                    n_iter_no_change=5)
    gb_2.fit(X, y)

    # Check that both predictors are equal
    _assert_predictor_equal(gb_1, gb_2, X)

@pytest.mark.parametrize('GradientBoosting, X, y', [
    (HistGradientBoostingClassifier, X_classification, y_classification),
    (HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_clear(GradientBoosting, X, y):
    # Test that fit clears the previous state when warm_start is False.
    gb_1 = GradientBoosting(n_iter_no_change=5, random_state=42)
    gb_1.fit(X, y)

    gb_2 = GradientBoosting(n_iter_no_change=5, random_state=42,
                            warm_start=True)
    gb_2.fit(X, y)  # inits state
    gb_2.set_params(warm_start=False)
    gb_2.fit(X, y)  # clears the old state, so gb_2 should equal gb_1

    # Check that both predictors have the same train_score_ and
    # validation_score_ attributes
    assert_allclose(gb_1.train_score_, gb_2.train_score_)
    assert_allclose(gb_1.validation_score_, gb_2.validation_score_)

    # Check that both predictors are equal
    _assert_predictor_equal(gb_1, gb_2, X)

@pytest.mark.parametrize('GradientBoosting, X, y', [
    (HistGradientBoostingClassifier, X_classification, y_classification),
    (HistGradientBoostingRegressor, X_regression, y_regression)
])
@pytest.mark.parametrize('rng_type', ('none', 'int', 'instance'))
def test_random_seeds_warm_start(GradientBoosting, X, y, rng_type):
    # Make sure the seeds for train/val split and small trainset subsampling
    # are correctly set in a warm start context.
    def _get_rng(rng_type):
        # Helper to avoid consuming rngs
        if rng_type == 'none':
            return None
        elif rng_type == 'int':
            return 42
        else:
            return np.random.RandomState(0)

    random_state = _get_rng(rng_type)
    gb_1 = GradientBoosting(early_stopping=True, max_iter=2,
                            random_state=random_state)
    gb_1.set_params(scoring=check_scoring(gb_1))
    gb_1.fit(X, y)
    random_seed_1_1 = gb_1._random_seed

    gb_1.fit(X, y)
    random_seed_1_2 = gb_1._random_seed  # clear the old state, different seed

    random_state = _get_rng(rng_type)
    gb_2 = GradientBoosting(early_stopping=True, max_iter=2,
                            random_state=random_state, warm_start=True)
    gb_2.set_params(scoring=check_scoring(gb_2))
    gb_2.fit(X, y)  # inits state
    random_seed_2_1 = gb_2._random_seed
    gb_2.fit(X, y)  # clears old state and equals est
    random_seed_2_2 = gb_2._random_seed

    # Without warm starting, the seeds should be
    # * all different if random state is None
    # * all equal if random state is an integer
    # * different when refitting and equal with a new estimator (because
    #   the random state is mutated)
    if rng_type == 'none':
        assert random_seed_1_1 != random_seed_1_2 != random_seed_2_1
    elif rng_type == 'int':
        assert random_seed_1_1 == random_seed_1_2 == random_seed_2_1
    else:
        assert random_seed_1_1 == random_seed_2_1 != random_seed_1_2

    # With warm starting, the seeds must be equal
    assert random_seed_2_1 == random_seed_2_2
Binary file not shown.