2359 lines
93 KiB
Python
2359 lines
93 KiB
Python
"""
|
|
Forest of trees-based ensemble methods.
|
|
|
|
Those methods include random forests and extremely randomized trees.
|
|
|
|
The module structure is the following:
|
|
|
|
- The ``BaseForest`` base class implements a common ``fit`` method for all
|
|
the estimators in the module. The ``fit`` method of the base ``Forest``
|
|
class calls the ``fit`` method of each sub-estimator on random samples
|
|
(with replacement, a.k.a. bootstrap) of the training set.
|
|
|
|
The init of the sub-estimator is further delegated to the
|
|
``BaseEnsemble`` constructor.
|
|
|
|
- The ``ForestClassifier`` and ``ForestRegressor`` base classes further
|
|
implement the prediction logic by computing an average of the predicted
|
|
outcomes of the sub-estimators.
|
|
|
|
- The ``RandomForestClassifier`` and ``RandomForestRegressor`` derived
|
|
classes provide the user with concrete implementations of
|
|
the forest ensemble method using classical, deterministic
|
|
``DecisionTreeClassifier`` and ``DecisionTreeRegressor`` as
|
|
sub-estimator implementations.
|
|
|
|
- The ``ExtraTreesClassifier`` and ``ExtraTreesRegressor`` derived
|
|
classes provide the user with concrete implementations of the
|
|
forest ensemble method using the extremely randomized trees
|
|
``ExtraTreeClassifier`` and ``ExtraTreeRegressor`` as
|
|
sub-estimator implementations.
|
|
|
|
Single and multi-output problems are both handled.
|
|
"""
|
|
|
|
# Authors: Gilles Louppe <g.louppe@gmail.com>
|
|
# Brian Holt <bdholt1@gmail.com>
|
|
# Joly Arnaud <arnaud.v.joly@gmail.com>
|
|
# Fares Hedayati <fares.hedayati@gmail.com>
|
|
#
|
|
# License: BSD 3 clause
|
|
|
|
|
|
import numbers
|
|
from warnings import catch_warnings, simplefilter, warn
|
|
import threading
|
|
|
|
from abc import ABCMeta, abstractmethod
|
|
import numpy as np
|
|
from scipy.sparse import issparse
|
|
from scipy.sparse import hstack as sparse_hstack
|
|
from joblib import Parallel, delayed
|
|
|
|
from ..base import ClassifierMixin, RegressorMixin, MultiOutputMixin
|
|
from ..metrics import r2_score
|
|
from ..preprocessing import OneHotEncoder
|
|
from ..tree import (DecisionTreeClassifier, DecisionTreeRegressor,
|
|
ExtraTreeClassifier, ExtraTreeRegressor)
|
|
from ..tree._tree import DTYPE, DOUBLE
|
|
from ..utils import check_random_state, check_array, compute_sample_weight
|
|
from ..exceptions import DataConversionWarning
|
|
from ._base import BaseEnsemble, _partition_estimators
|
|
from ..utils.fixes import _joblib_parallel_args
|
|
from ..utils.multiclass import check_classification_targets
|
|
from ..utils.validation import check_is_fitted, _check_sample_weight
|
|
from ..utils.validation import _deprecate_positional_args
|
|
|
|
|
|
__all__ = ["RandomForestClassifier",
|
|
"RandomForestRegressor",
|
|
"ExtraTreesClassifier",
|
|
"ExtraTreesRegressor",
|
|
"RandomTreesEmbedding"]
|
|
|
|
MAX_INT = np.iinfo(np.int32).max
|
|
|
|
|
|
def _get_n_samples_bootstrap(n_samples, max_samples):
|
|
"""
|
|
Get the number of samples in a bootstrap sample.
|
|
|
|
Parameters
|
|
----------
|
|
n_samples : int
|
|
Number of samples in the dataset.
|
|
max_samples : int or float
|
|
The maximum number of samples to draw from the total available:
|
|
- if float, this indicates a fraction of the total and should be
|
|
the interval `(0, 1)`;
|
|
- if int, this indicates the exact number of samples;
|
|
- if None, this indicates the total number of samples.
|
|
|
|
Returns
|
|
-------
|
|
n_samples_bootstrap : int
|
|
The total number of samples to draw for the bootstrap sample.
|
|
"""
|
|
if max_samples is None:
|
|
return n_samples
|
|
|
|
if isinstance(max_samples, numbers.Integral):
|
|
if not (1 <= max_samples <= n_samples):
|
|
msg = "`max_samples` must be in range 1 to {} but got value {}"
|
|
raise ValueError(msg.format(n_samples, max_samples))
|
|
return max_samples
|
|
|
|
if isinstance(max_samples, numbers.Real):
|
|
if not (0 < max_samples < 1):
|
|
msg = "`max_samples` must be in range (0, 1) but got value {}"
|
|
raise ValueError(msg.format(max_samples))
|
|
return int(round(n_samples * max_samples))
|
|
|
|
msg = "`max_samples` should be int or float, but got type '{}'"
|
|
raise TypeError(msg.format(type(max_samples)))
|
|
|
|
|
|
def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap):
|
|
"""
|
|
Private function used to _parallel_build_trees function."""
|
|
|
|
random_instance = check_random_state(random_state)
|
|
sample_indices = random_instance.randint(0, n_samples, n_samples_bootstrap)
|
|
|
|
return sample_indices
|
|
|
|
|
|
def _generate_unsampled_indices(random_state, n_samples, n_samples_bootstrap):
|
|
"""
|
|
Private function used to forest._set_oob_score function."""
|
|
sample_indices = _generate_sample_indices(random_state, n_samples,
|
|
n_samples_bootstrap)
|
|
sample_counts = np.bincount(sample_indices, minlength=n_samples)
|
|
unsampled_mask = sample_counts == 0
|
|
indices_range = np.arange(n_samples)
|
|
unsampled_indices = indices_range[unsampled_mask]
|
|
|
|
return unsampled_indices
|
|
|
|
|
|
def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
|
|
verbose=0, class_weight=None,
|
|
n_samples_bootstrap=None):
|
|
"""
|
|
Private function used to fit a single tree in parallel."""
|
|
if verbose > 1:
|
|
print("building tree %d of %d" % (tree_idx + 1, n_trees))
|
|
|
|
if forest.bootstrap:
|
|
n_samples = X.shape[0]
|
|
if sample_weight is None:
|
|
curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
|
|
else:
|
|
curr_sample_weight = sample_weight.copy()
|
|
|
|
indices = _generate_sample_indices(tree.random_state, n_samples,
|
|
n_samples_bootstrap)
|
|
sample_counts = np.bincount(indices, minlength=n_samples)
|
|
curr_sample_weight *= sample_counts
|
|
|
|
if class_weight == 'subsample':
|
|
with catch_warnings():
|
|
simplefilter('ignore', DeprecationWarning)
|
|
curr_sample_weight *= compute_sample_weight('auto', y,
|
|
indices=indices)
|
|
elif class_weight == 'balanced_subsample':
|
|
curr_sample_weight *= compute_sample_weight('balanced', y,
|
|
indices=indices)
|
|
|
|
tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
|
|
else:
|
|
tree.fit(X, y, sample_weight=sample_weight, check_input=False)
|
|
|
|
return tree
|
|
|
|
|
|
class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta):
|
|
"""
|
|
Base class for forests of trees.
|
|
|
|
Warning: This class should not be used directly. Use derived classes
|
|
instead.
|
|
"""
|
|
|
|
@abstractmethod
|
|
def __init__(self,
|
|
base_estimator,
|
|
n_estimators=100, *,
|
|
estimator_params=tuple(),
|
|
bootstrap=False,
|
|
oob_score=False,
|
|
n_jobs=None,
|
|
random_state=None,
|
|
verbose=0,
|
|
warm_start=False,
|
|
class_weight=None,
|
|
max_samples=None):
|
|
super().__init__(
|
|
base_estimator=base_estimator,
|
|
n_estimators=n_estimators,
|
|
estimator_params=estimator_params)
|
|
|
|
self.bootstrap = bootstrap
|
|
self.oob_score = oob_score
|
|
self.n_jobs = n_jobs
|
|
self.random_state = random_state
|
|
self.verbose = verbose
|
|
self.warm_start = warm_start
|
|
self.class_weight = class_weight
|
|
self.max_samples = max_samples
|
|
|
|
def apply(self, X):
|
|
"""
|
|
Apply trees in the forest to X, return leaf indices.
|
|
|
|
Parameters
|
|
----------
|
|
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
|
The input samples. Internally, its dtype will be converted to
|
|
``dtype=np.float32``. If a sparse matrix is provided, it will be
|
|
converted into a sparse ``csr_matrix``.
|
|
|
|
Returns
|
|
-------
|
|
X_leaves : ndarray of shape (n_samples, n_estimators)
|
|
For each datapoint x in X and for each tree in the forest,
|
|
return the index of the leaf x ends up in.
|
|
"""
|
|
X = self._validate_X_predict(X)
|
|
results = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
|
|
**_joblib_parallel_args(prefer="threads"))(
|
|
delayed(tree.apply)(X, check_input=False)
|
|
for tree in self.estimators_)
|
|
|
|
return np.array(results).T
|
|
|
|
def decision_path(self, X):
|
|
"""
|
|
Return the decision path in the forest.
|
|
|
|
.. versionadded:: 0.18
|
|
|
|
Parameters
|
|
----------
|
|
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
|
The input samples. Internally, its dtype will be converted to
|
|
``dtype=np.float32``. If a sparse matrix is provided, it will be
|
|
converted into a sparse ``csr_matrix``.
|
|
|
|
Returns
|
|
-------
|
|
indicator : sparse matrix of shape (n_samples, n_nodes)
|
|
Return a node indicator matrix where non zero elements indicates
|
|
that the samples goes through the nodes. The matrix is of CSR
|
|
format.
|
|
|
|
n_nodes_ptr : ndarray of shape (n_estimators + 1,)
|
|
The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
|
|
gives the indicator value for the i-th estimator.
|
|
|
|
"""
|
|
X = self._validate_X_predict(X)
|
|
indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
|
|
**_joblib_parallel_args(prefer='threads'))(
|
|
delayed(tree.decision_path)(X, check_input=False)
|
|
for tree in self.estimators_)
|
|
|
|
n_nodes = [0]
|
|
n_nodes.extend([i.shape[1] for i in indicators])
|
|
n_nodes_ptr = np.array(n_nodes).cumsum()
|
|
|
|
return sparse_hstack(indicators).tocsr(), n_nodes_ptr
|
|
|
|
def fit(self, X, y, sample_weight=None):
|
|
"""
|
|
Build a forest of trees from the training set (X, y).
|
|
|
|
Parameters
|
|
----------
|
|
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
|
The training input samples. Internally, its dtype will be converted
|
|
to ``dtype=np.float32``. If a sparse matrix is provided, it will be
|
|
converted into a sparse ``csc_matrix``.
|
|
|
|
y : array-like of shape (n_samples,) or (n_samples, n_outputs)
|
|
The target values (class labels in classification, real numbers in
|
|
regression).
|
|
|
|
sample_weight : array-like of shape (n_samples,), default=None
|
|
Sample weights. If None, then samples are equally weighted. Splits
|
|
that would create child nodes with net zero or negative weight are
|
|
ignored while searching for a split in each node. In the case of
|
|
classification, splits are also ignored if they would result in any
|
|
single class carrying a negative weight in either child node.
|
|
|
|
Returns
|
|
-------
|
|
self : object
|
|
"""
|
|
# Validate or convert input data
|
|
if issparse(y):
|
|
raise ValueError(
|
|
"sparse multilabel-indicator for y is not supported."
|
|
)
|
|
X, y = self._validate_data(X, y, multi_output=True,
|
|
accept_sparse="csc", dtype=DTYPE)
|
|
if sample_weight is not None:
|
|
sample_weight = _check_sample_weight(sample_weight, X)
|
|
|
|
if issparse(X):
|
|
# Pre-sort indices to avoid that each individual tree of the
|
|
# ensemble sorts the indices.
|
|
X.sort_indices()
|
|
|
|
# Remap output
|
|
self.n_features_ = X.shape[1]
|
|
|
|
y = np.atleast_1d(y)
|
|
if y.ndim == 2 and y.shape[1] == 1:
|
|
warn("A column-vector y was passed when a 1d array was"
|
|
" expected. Please change the shape of y to "
|
|
"(n_samples,), for example using ravel().",
|
|
DataConversionWarning, stacklevel=2)
|
|
|
|
if y.ndim == 1:
|
|
# reshape is necessary to preserve the data contiguity against vs
|
|
# [:, np.newaxis] that does not.
|
|
y = np.reshape(y, (-1, 1))
|
|
|
|
self.n_outputs_ = y.shape[1]
|
|
|
|
y, expanded_class_weight = self._validate_y_class_weight(y)
|
|
|
|
if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
|
|
y = np.ascontiguousarray(y, dtype=DOUBLE)
|
|
|
|
if expanded_class_weight is not None:
|
|
if sample_weight is not None:
|
|
sample_weight = sample_weight * expanded_class_weight
|
|
else:
|
|
sample_weight = expanded_class_weight
|
|
|
|
# Get bootstrap sample size
|
|
n_samples_bootstrap = _get_n_samples_bootstrap(
|
|
n_samples=X.shape[0],
|
|
max_samples=self.max_samples
|
|
)
|
|
|
|
# Check parameters
|
|
self._validate_estimator()
|
|
|
|
if not self.bootstrap and self.oob_score:
|
|
raise ValueError("Out of bag estimation only available"
|
|
" if bootstrap=True")
|
|
|
|
random_state = check_random_state(self.random_state)
|
|
|
|
if not self.warm_start or not hasattr(self, "estimators_"):
|
|
# Free allocated memory, if any
|
|
self.estimators_ = []
|
|
|
|
n_more_estimators = self.n_estimators - len(self.estimators_)
|
|
|
|
if n_more_estimators < 0:
|
|
raise ValueError('n_estimators=%d must be larger or equal to '
|
|
'len(estimators_)=%d when warm_start==True'
|
|
% (self.n_estimators, len(self.estimators_)))
|
|
|
|
elif n_more_estimators == 0:
|
|
warn("Warm-start fitting without increasing n_estimators does not "
|
|
"fit new trees.")
|
|
else:
|
|
if self.warm_start and len(self.estimators_) > 0:
|
|
# We draw from the random state to get the random state we
|
|
# would have got if we hadn't used a warm_start.
|
|
random_state.randint(MAX_INT, size=len(self.estimators_))
|
|
|
|
trees = [self._make_estimator(append=False,
|
|
random_state=random_state)
|
|
for i in range(n_more_estimators)]
|
|
|
|
# Parallel loop: we prefer the threading backend as the Cython code
|
|
# for fitting the trees is internally releasing the Python GIL
|
|
# making threading more efficient than multiprocessing in
|
|
# that case. However, for joblib 0.12+ we respect any
|
|
# parallel_backend contexts set at a higher level,
|
|
# since correctness does not rely on using threads.
|
|
trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
|
|
**_joblib_parallel_args(prefer='threads'))(
|
|
delayed(_parallel_build_trees)(
|
|
t, self, X, y, sample_weight, i, len(trees),
|
|
verbose=self.verbose, class_weight=self.class_weight,
|
|
n_samples_bootstrap=n_samples_bootstrap)
|
|
for i, t in enumerate(trees))
|
|
|
|
# Collect newly grown trees
|
|
self.estimators_.extend(trees)
|
|
|
|
if self.oob_score:
|
|
self._set_oob_score(X, y)
|
|
|
|
# Decapsulate classes_ attributes
|
|
if hasattr(self, "classes_") and self.n_outputs_ == 1:
|
|
self.n_classes_ = self.n_classes_[0]
|
|
self.classes_ = self.classes_[0]
|
|
|
|
return self
|
|
|
|
@abstractmethod
|
|
def _set_oob_score(self, X, y):
|
|
"""
|
|
Calculate out of bag predictions and score."""
|
|
|
|
def _validate_y_class_weight(self, y):
|
|
# Default implementation
|
|
return y, None
|
|
|
|
def _validate_X_predict(self, X):
|
|
"""
|
|
Validate X whenever one tries to predict, apply, predict_proba."""
|
|
check_is_fitted(self)
|
|
|
|
return self.estimators_[0]._validate_X_predict(X, check_input=True)
|
|
|
|
@property
|
|
def feature_importances_(self):
|
|
"""
|
|
The impurity-based feature importances.
|
|
|
|
The higher, the more important the feature.
|
|
The importance of a feature is computed as the (normalized)
|
|
total reduction of the criterion brought by that feature. It is also
|
|
known as the Gini importance.
|
|
|
|
Warning: impurity-based feature importances can be misleading for
|
|
high cardinality features (many unique values). See
|
|
:func:`sklearn.inspection.permutation_importance` as an alternative.
|
|
|
|
Returns
|
|
-------
|
|
feature_importances_ : ndarray of shape (n_features,)
|
|
The values of this array sum to 1, unless all trees are single node
|
|
trees consisting of only the root node, in which case it will be an
|
|
array of zeros.
|
|
"""
|
|
check_is_fitted(self)
|
|
|
|
all_importances = Parallel(n_jobs=self.n_jobs,
|
|
**_joblib_parallel_args(prefer='threads'))(
|
|
delayed(getattr)(tree, 'feature_importances_')
|
|
for tree in self.estimators_ if tree.tree_.node_count > 1)
|
|
|
|
if not all_importances:
|
|
return np.zeros(self.n_features_, dtype=np.float64)
|
|
|
|
all_importances = np.mean(all_importances,
|
|
axis=0, dtype=np.float64)
|
|
return all_importances / np.sum(all_importances)
|
|
|
|
|
|
def _accumulate_prediction(predict, X, out, lock):
|
|
"""
|
|
This is a utility function for joblib's Parallel.
|
|
|
|
It can't go locally in ForestClassifier or ForestRegressor, because joblib
|
|
complains that it cannot pickle it when placed there.
|
|
"""
|
|
prediction = predict(X, check_input=False)
|
|
with lock:
|
|
if len(out) == 1:
|
|
out[0] += prediction
|
|
else:
|
|
for i in range(len(out)):
|
|
out[i] += prediction[i]
|
|
|
|
|
|
class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta):
|
|
"""
|
|
Base class for forest of trees-based classifiers.
|
|
|
|
Warning: This class should not be used directly. Use derived classes
|
|
instead.
|
|
"""
|
|
|
|
@abstractmethod
|
|
def __init__(self,
|
|
base_estimator,
|
|
n_estimators=100, *,
|
|
estimator_params=tuple(),
|
|
bootstrap=False,
|
|
oob_score=False,
|
|
n_jobs=None,
|
|
random_state=None,
|
|
verbose=0,
|
|
warm_start=False,
|
|
class_weight=None,
|
|
max_samples=None):
|
|
super().__init__(
|
|
base_estimator,
|
|
n_estimators=n_estimators,
|
|
estimator_params=estimator_params,
|
|
bootstrap=bootstrap,
|
|
oob_score=oob_score,
|
|
n_jobs=n_jobs,
|
|
random_state=random_state,
|
|
verbose=verbose,
|
|
warm_start=warm_start,
|
|
class_weight=class_weight,
|
|
max_samples=max_samples)
|
|
|
|
def _set_oob_score(self, X, y):
|
|
"""
|
|
Compute out-of-bag score."""
|
|
X = check_array(X, dtype=DTYPE, accept_sparse='csr')
|
|
|
|
n_classes_ = self.n_classes_
|
|
n_samples = y.shape[0]
|
|
|
|
oob_decision_function = []
|
|
oob_score = 0.0
|
|
predictions = [np.zeros((n_samples, n_classes_[k]))
|
|
for k in range(self.n_outputs_)]
|
|
|
|
n_samples_bootstrap = _get_n_samples_bootstrap(
|
|
n_samples, self.max_samples
|
|
)
|
|
|
|
for estimator in self.estimators_:
|
|
unsampled_indices = _generate_unsampled_indices(
|
|
estimator.random_state, n_samples, n_samples_bootstrap)
|
|
p_estimator = estimator.predict_proba(X[unsampled_indices, :],
|
|
check_input=False)
|
|
|
|
if self.n_outputs_ == 1:
|
|
p_estimator = [p_estimator]
|
|
|
|
for k in range(self.n_outputs_):
|
|
predictions[k][unsampled_indices, :] += p_estimator[k]
|
|
|
|
for k in range(self.n_outputs_):
|
|
if (predictions[k].sum(axis=1) == 0).any():
|
|
warn("Some inputs do not have OOB scores. "
|
|
"This probably means too few trees were used "
|
|
"to compute any reliable oob estimates.")
|
|
|
|
decision = (predictions[k] /
|
|
predictions[k].sum(axis=1)[:, np.newaxis])
|
|
oob_decision_function.append(decision)
|
|
oob_score += np.mean(y[:, k] ==
|
|
np.argmax(predictions[k], axis=1), axis=0)
|
|
|
|
if self.n_outputs_ == 1:
|
|
self.oob_decision_function_ = oob_decision_function[0]
|
|
else:
|
|
self.oob_decision_function_ = oob_decision_function
|
|
|
|
self.oob_score_ = oob_score / self.n_outputs_
|
|
|
|
def _validate_y_class_weight(self, y):
|
|
check_classification_targets(y)
|
|
|
|
y = np.copy(y)
|
|
expanded_class_weight = None
|
|
|
|
if self.class_weight is not None:
|
|
y_original = np.copy(y)
|
|
|
|
self.classes_ = []
|
|
self.n_classes_ = []
|
|
|
|
y_store_unique_indices = np.zeros(y.shape, dtype=np.int)
|
|
for k in range(self.n_outputs_):
|
|
classes_k, y_store_unique_indices[:, k] = \
|
|
np.unique(y[:, k], return_inverse=True)
|
|
self.classes_.append(classes_k)
|
|
self.n_classes_.append(classes_k.shape[0])
|
|
y = y_store_unique_indices
|
|
|
|
if self.class_weight is not None:
|
|
valid_presets = ('balanced', 'balanced_subsample')
|
|
if isinstance(self.class_weight, str):
|
|
if self.class_weight not in valid_presets:
|
|
raise ValueError('Valid presets for class_weight include '
|
|
'"balanced" and "balanced_subsample".'
|
|
'Given "%s".'
|
|
% self.class_weight)
|
|
if self.warm_start:
|
|
warn('class_weight presets "balanced" or '
|
|
'"balanced_subsample" are '
|
|
'not recommended for warm_start if the fitted data '
|
|
'differs from the full dataset. In order to use '
|
|
'"balanced" weights, use compute_class_weight '
|
|
'("balanced", classes, y). In place of y you can use '
|
|
'a large enough sample of the full training set '
|
|
'target to properly estimate the class frequency '
|
|
'distributions. Pass the resulting weights as the '
|
|
'class_weight parameter.')
|
|
|
|
if (self.class_weight != 'balanced_subsample' or
|
|
not self.bootstrap):
|
|
if self.class_weight == "balanced_subsample":
|
|
class_weight = "balanced"
|
|
else:
|
|
class_weight = self.class_weight
|
|
expanded_class_weight = compute_sample_weight(class_weight,
|
|
y_original)
|
|
|
|
return y, expanded_class_weight
|
|
|
|
def predict(self, X):
|
|
"""
|
|
Predict class for X.
|
|
|
|
The predicted class of an input sample is a vote by the trees in
|
|
the forest, weighted by their probability estimates. That is,
|
|
the predicted class is the one with highest mean probability
|
|
estimate across the trees.
|
|
|
|
Parameters
|
|
----------
|
|
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
|
The input samples. Internally, its dtype will be converted to
|
|
``dtype=np.float32``. If a sparse matrix is provided, it will be
|
|
converted into a sparse ``csr_matrix``.
|
|
|
|
Returns
|
|
-------
|
|
y : ndarray of shape (n_samples,) or (n_samples, n_outputs)
|
|
The predicted classes.
|
|
"""
|
|
proba = self.predict_proba(X)
|
|
|
|
if self.n_outputs_ == 1:
|
|
return self.classes_.take(np.argmax(proba, axis=1), axis=0)
|
|
|
|
else:
|
|
n_samples = proba[0].shape[0]
|
|
# all dtypes should be the same, so just take the first
|
|
class_type = self.classes_[0].dtype
|
|
predictions = np.empty((n_samples, self.n_outputs_),
|
|
dtype=class_type)
|
|
|
|
for k in range(self.n_outputs_):
|
|
predictions[:, k] = self.classes_[k].take(np.argmax(proba[k],
|
|
axis=1),
|
|
axis=0)
|
|
|
|
return predictions
|
|
|
|
def predict_proba(self, X):
|
|
"""
|
|
Predict class probabilities for X.
|
|
|
|
The predicted class probabilities of an input sample are computed as
|
|
the mean predicted class probabilities of the trees in the forest.
|
|
The class probability of a single tree is the fraction of samples of
|
|
the same class in a leaf.
|
|
|
|
Parameters
|
|
----------
|
|
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
|
The input samples. Internally, its dtype will be converted to
|
|
``dtype=np.float32``. If a sparse matrix is provided, it will be
|
|
converted into a sparse ``csr_matrix``.
|
|
|
|
Returns
|
|
-------
|
|
p : ndarray of shape (n_samples, n_classes), or a list of n_outputs
|
|
such arrays if n_outputs > 1.
|
|
The class probabilities of the input samples. The order of the
|
|
classes corresponds to that in the attribute :term:`classes_`.
|
|
"""
|
|
check_is_fitted(self)
|
|
# Check data
|
|
X = self._validate_X_predict(X)
|
|
|
|
# Assign chunk of trees to jobs
|
|
n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)
|
|
|
|
# avoid storing the output of every estimator by summing them here
|
|
all_proba = [np.zeros((X.shape[0], j), dtype=np.float64)
|
|
for j in np.atleast_1d(self.n_classes_)]
|
|
lock = threading.Lock()
|
|
Parallel(n_jobs=n_jobs, verbose=self.verbose,
|
|
**_joblib_parallel_args(require="sharedmem"))(
|
|
delayed(_accumulate_prediction)(e.predict_proba, X, all_proba,
|
|
lock)
|
|
for e in self.estimators_)
|
|
|
|
for proba in all_proba:
|
|
proba /= len(self.estimators_)
|
|
|
|
if len(all_proba) == 1:
|
|
return all_proba[0]
|
|
else:
|
|
return all_proba
|
|
|
|
def predict_log_proba(self, X):
|
|
"""
|
|
Predict class log-probabilities for X.
|
|
|
|
The predicted class log-probabilities of an input sample is computed as
|
|
the log of the mean predicted class probabilities of the trees in the
|
|
forest.
|
|
|
|
Parameters
|
|
----------
|
|
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
|
The input samples. Internally, its dtype will be converted to
|
|
``dtype=np.float32``. If a sparse matrix is provided, it will be
|
|
converted into a sparse ``csr_matrix``.
|
|
|
|
Returns
|
|
-------
|
|
p : ndarray of shape (n_samples, n_classes), or a list of n_outputs
|
|
such arrays if n_outputs > 1.
|
|
The class probabilities of the input samples. The order of the
|
|
classes corresponds to that in the attribute :term:`classes_`.
|
|
"""
|
|
proba = self.predict_proba(X)
|
|
|
|
if self.n_outputs_ == 1:
|
|
return np.log(proba)
|
|
|
|
else:
|
|
for k in range(self.n_outputs_):
|
|
proba[k] = np.log(proba[k])
|
|
|
|
return proba
|
|
|
|
|
|
class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta):
|
|
"""
|
|
Base class for forest of trees-based regressors.
|
|
|
|
Warning: This class should not be used directly. Use derived classes
|
|
instead.
|
|
"""
|
|
|
|
@abstractmethod
|
|
def __init__(self,
|
|
base_estimator,
|
|
n_estimators=100, *,
|
|
estimator_params=tuple(),
|
|
bootstrap=False,
|
|
oob_score=False,
|
|
n_jobs=None,
|
|
random_state=None,
|
|
verbose=0,
|
|
warm_start=False,
|
|
max_samples=None):
|
|
super().__init__(
|
|
base_estimator,
|
|
n_estimators=n_estimators,
|
|
estimator_params=estimator_params,
|
|
bootstrap=bootstrap,
|
|
oob_score=oob_score,
|
|
n_jobs=n_jobs,
|
|
random_state=random_state,
|
|
verbose=verbose,
|
|
warm_start=warm_start,
|
|
max_samples=max_samples)
|
|
|
|
def predict(self, X):
|
|
"""
|
|
Predict regression target for X.
|
|
|
|
The predicted regression target of an input sample is computed as the
|
|
mean predicted regression targets of the trees in the forest.
|
|
|
|
Parameters
|
|
----------
|
|
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
|
The input samples. Internally, its dtype will be converted to
|
|
``dtype=np.float32``. If a sparse matrix is provided, it will be
|
|
converted into a sparse ``csr_matrix``.
|
|
|
|
Returns
|
|
-------
|
|
y : ndarray of shape (n_samples,) or (n_samples, n_outputs)
|
|
The predicted values.
|
|
"""
|
|
check_is_fitted(self)
|
|
# Check data
|
|
X = self._validate_X_predict(X)
|
|
|
|
# Assign chunk of trees to jobs
|
|
n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)
|
|
|
|
# avoid storing the output of every estimator by summing them here
|
|
if self.n_outputs_ > 1:
|
|
y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64)
|
|
else:
|
|
y_hat = np.zeros((X.shape[0]), dtype=np.float64)
|
|
|
|
# Parallel loop
|
|
lock = threading.Lock()
|
|
Parallel(n_jobs=n_jobs, verbose=self.verbose,
|
|
**_joblib_parallel_args(require="sharedmem"))(
|
|
delayed(_accumulate_prediction)(e.predict, X, [y_hat], lock)
|
|
for e in self.estimators_)
|
|
|
|
y_hat /= len(self.estimators_)
|
|
|
|
return y_hat
|
|
|
|
def _set_oob_score(self, X, y):
|
|
"""
|
|
Compute out-of-bag scores."""
|
|
X = check_array(X, dtype=DTYPE, accept_sparse='csr')
|
|
|
|
n_samples = y.shape[0]
|
|
|
|
predictions = np.zeros((n_samples, self.n_outputs_))
|
|
n_predictions = np.zeros((n_samples, self.n_outputs_))
|
|
|
|
n_samples_bootstrap = _get_n_samples_bootstrap(
|
|
n_samples, self.max_samples
|
|
)
|
|
|
|
for estimator in self.estimators_:
|
|
unsampled_indices = _generate_unsampled_indices(
|
|
estimator.random_state, n_samples, n_samples_bootstrap)
|
|
p_estimator = estimator.predict(
|
|
X[unsampled_indices, :], check_input=False)
|
|
|
|
if self.n_outputs_ == 1:
|
|
p_estimator = p_estimator[:, np.newaxis]
|
|
|
|
predictions[unsampled_indices, :] += p_estimator
|
|
n_predictions[unsampled_indices, :] += 1
|
|
|
|
if (n_predictions == 0).any():
|
|
warn("Some inputs do not have OOB scores. "
|
|
"This probably means too few trees were used "
|
|
"to compute any reliable oob estimates.")
|
|
n_predictions[n_predictions == 0] = 1
|
|
|
|
predictions /= n_predictions
|
|
self.oob_prediction_ = predictions
|
|
|
|
if self.n_outputs_ == 1:
|
|
self.oob_prediction_ = \
|
|
self.oob_prediction_.reshape((n_samples, ))
|
|
|
|
self.oob_score_ = 0.0
|
|
|
|
for k in range(self.n_outputs_):
|
|
self.oob_score_ += r2_score(y[:, k],
|
|
predictions[:, k])
|
|
|
|
self.oob_score_ /= self.n_outputs_
|
|
|
|
def _compute_partial_dependence_recursion(self, grid, target_features):
|
|
"""Fast partial dependence computation.
|
|
|
|
Parameters
|
|
----------
|
|
grid : ndarray of shape (n_samples, n_target_features)
|
|
The grid points on which the partial dependence should be
|
|
evaluated.
|
|
target_features : ndarray of shape (n_target_features)
|
|
The set of target features for which the partial dependence
|
|
should be evaluated.
|
|
|
|
Returns
|
|
-------
|
|
averaged_predictions : ndarray of shape (n_samples,)
|
|
The value of the partial dependence function on each grid point.
|
|
"""
|
|
grid = np.asarray(grid, dtype=DTYPE, order='C')
|
|
averaged_predictions = np.zeros(shape=grid.shape[0],
|
|
dtype=np.float64, order='C')
|
|
|
|
for tree in self.estimators_:
|
|
# Note: we don't sum in parallel because the GIL isn't released in
|
|
# the fast method.
|
|
tree.tree_.compute_partial_dependence(
|
|
grid, target_features, averaged_predictions)
|
|
# Average over the forest
|
|
averaged_predictions /= len(self.estimators_)
|
|
|
|
return averaged_predictions
|
|
|
|
class RandomForestClassifier(ForestClassifier):
|
|
"""
|
|
A random forest classifier.
|
|
|
|
A random forest is a meta estimator that fits a number of decision tree
|
|
classifiers on various sub-samples of the dataset and uses averaging to
|
|
improve the predictive accuracy and control over-fitting.
|
|
The sub-sample size is controlled with the `max_samples` parameter if
|
|
`bootstrap=True` (default), otherwise the whole dataset is used to build
|
|
each tree.
|
|
|
|
Read more in the :ref:`User Guide <forest>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_estimators : int, default=100
|
|
The number of trees in the forest.
|
|
|
|
.. versionchanged:: 0.22
|
|
The default value of ``n_estimators`` changed from 10 to 100
|
|
in 0.22.
|
|
|
|
criterion : {"gini", "entropy"}, default="gini"
|
|
The function to measure the quality of a split. Supported criteria are
|
|
"gini" for the Gini impurity and "entropy" for the information gain.
|
|
Note: this parameter is tree-specific.
|
|
|
|
max_depth : int, default=None
|
|
The maximum depth of the tree. If None, then nodes are expanded until
|
|
all leaves are pure or until all leaves contain less than
|
|
min_samples_split samples.
|
|
|
|
min_samples_split : int or float, default=2
|
|
The minimum number of samples required to split an internal node:
|
|
|
|
- If int, then consider `min_samples_split` as the minimum number.
|
|
- If float, then `min_samples_split` is a fraction and
|
|
`ceil(min_samples_split * n_samples)` are the minimum
|
|
number of samples for each split.
|
|
|
|
.. versionchanged:: 0.18
|
|
Added float values for fractions.
|
|
|
|
min_samples_leaf : int or float, default=1
|
|
The minimum number of samples required to be at a leaf node.
|
|
A split point at any depth will only be considered if it leaves at
|
|
least ``min_samples_leaf`` training samples in each of the left and
|
|
right branches. This may have the effect of smoothing the model,
|
|
especially in regression.
|
|
|
|
- If int, then consider `min_samples_leaf` as the minimum number.
|
|
- If float, then `min_samples_leaf` is a fraction and
|
|
`ceil(min_samples_leaf * n_samples)` are the minimum
|
|
number of samples for each node.
|
|
|
|
.. versionchanged:: 0.18
|
|
Added float values for fractions.
|
|
|
|
min_weight_fraction_leaf : float, default=0.0
|
|
The minimum weighted fraction of the sum total of weights (of all
|
|
the input samples) required to be at a leaf node. Samples have
|
|
equal weight when sample_weight is not provided.
|
|
|
|
max_features : {"auto", "sqrt", "log2"}, int or float, default="auto"
|
|
The number of features to consider when looking for the best split:
|
|
|
|
- If int, then consider `max_features` features at each split.
|
|
- If float, then `max_features` is a fraction and
|
|
`int(max_features * n_features)` features are considered at each
|
|
split.
|
|
- If "auto", then `max_features=sqrt(n_features)`.
|
|
- If "sqrt", then `max_features=sqrt(n_features)` (same as "auto").
|
|
- If "log2", then `max_features=log2(n_features)`.
|
|
- If None, then `max_features=n_features`.
|
|
|
|
Note: the search for a split does not stop until at least one
|
|
valid partition of the node samples is found, even if it requires to
|
|
effectively inspect more than ``max_features`` features.
|
|
|
|
max_leaf_nodes : int, default=None
|
|
Grow trees with ``max_leaf_nodes`` in best-first fashion.
|
|
Best nodes are defined as relative reduction in impurity.
|
|
If None then unlimited number of leaf nodes.
|
|
|
|
min_impurity_decrease : float, default=0.0
|
|
A node will be split if this split induces a decrease of the impurity
|
|
greater than or equal to this value.
|
|
|
|
The weighted impurity decrease equation is the following::
|
|
|
|
N_t / N * (impurity - N_t_R / N_t * right_impurity
|
|
- N_t_L / N_t * left_impurity)
|
|
|
|
where ``N`` is the total number of samples, ``N_t`` is the number of
|
|
samples at the current node, ``N_t_L`` is the number of samples in the
|
|
left child, and ``N_t_R`` is the number of samples in the right child.
|
|
|
|
``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
|
|
if ``sample_weight`` is passed.
|
|
|
|
.. versionadded:: 0.19
|
|
|
|
min_impurity_split : float, default=None
|
|
Threshold for early stopping in tree growth. A node will split
|
|
if its impurity is above the threshold, otherwise it is a leaf.
|
|
|
|
.. deprecated:: 0.19
|
|
``min_impurity_split`` has been deprecated in favor of
|
|
``min_impurity_decrease`` in 0.19. The default value of
|
|
``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it
|
|
will be removed in 0.25. Use ``min_impurity_decrease`` instead.
|
|
|
|
|
|
bootstrap : bool, default=True
|
|
Whether bootstrap samples are used when building trees. If False, the
|
|
whole dataset is used to build each tree.
|
|
|
|
oob_score : bool, default=False
|
|
Whether to use out-of-bag samples to estimate
|
|
the generalization accuracy.
|
|
|
|
n_jobs : int, default=None
|
|
The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,
|
|
:meth:`decision_path` and :meth:`apply` are all parallelized over the
|
|
trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
|
|
context. ``-1`` means using all processors. See :term:`Glossary
|
|
<n_jobs>` for more details.
|
|
|
|
random_state : int or RandomState, default=None
|
|
Controls both the randomness of the bootstrapping of the samples used
|
|
when building trees (if ``bootstrap=True``) and the sampling of the
|
|
features to consider when looking for the best split at each node
|
|
(if ``max_features < n_features``).
|
|
See :term:`Glossary <random_state>` for details.
|
|
|
|
verbose : int, default=0
|
|
Controls the verbosity when fitting and predicting.
|
|
|
|
warm_start : bool, default=False
|
|
When set to ``True``, reuse the solution of the previous call to fit
|
|
and add more estimators to the ensemble, otherwise, just fit a whole
|
|
new forest. See :term:`the Glossary <warm_start>`.
|
|
|
|
class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, \
|
|
default=None
|
|
Weights associated with classes in the form ``{class_label: weight}``.
|
|
If not given, all classes are supposed to have weight one. For
|
|
multi-output problems, a list of dicts can be provided in the same
|
|
order as the columns of y.
|
|
|
|
Note that for multioutput (including multilabel) weights should be
|
|
defined for each class of every column in its own dict. For example,
|
|
for four-class multilabel classification weights should be
|
|
[{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
|
|
[{1:1}, {2:5}, {3:1}, {4:1}].
|
|
|
|
The "balanced" mode uses the values of y to automatically adjust
|
|
weights inversely proportional to class frequencies in the input data
|
|
as ``n_samples / (n_classes * np.bincount(y))``
|
|
|
|
The "balanced_subsample" mode is the same as "balanced" except that
|
|
weights are computed based on the bootstrap sample for every tree
|
|
grown.
|
|
|
|
For multi-output, the weights of each column of y will be multiplied.
|
|
|
|
Note that these weights will be multiplied with sample_weight (passed
|
|
through the fit method) if sample_weight is specified.
|
|
|
|
ccp_alpha : non-negative float, default=0.0
|
|
Complexity parameter used for Minimal Cost-Complexity Pruning. The
|
|
subtree with the largest cost complexity that is smaller than
|
|
``ccp_alpha`` will be chosen. By default, no pruning is performed. See
|
|
:ref:`minimal_cost_complexity_pruning` for details.
|
|
|
|
.. versionadded:: 0.22
|
|
|
|
max_samples : int or float, default=None
|
|
If bootstrap is True, the number of samples to draw from X
|
|
to train each base estimator.
|
|
|
|
- If None (default), then draw `X.shape[0]` samples.
|
|
- If int, then draw `max_samples` samples.
|
|
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
|
|
`max_samples` should be in the interval `(0, 1)`.
|
|
|
|
.. versionadded:: 0.22
|
|
|
|
Attributes
|
|
----------
|
|
base_estimator_ : DecisionTreeClassifier
|
|
The child estimator template used to create the collection of fitted
|
|
sub-estimators.
|
|
|
|
estimators_ : list of DecisionTreeClassifier
|
|
The collection of fitted sub-estimators.
|
|
|
|
classes_ : ndarray of shape (n_classes,) or a list of such arrays
|
|
The classes labels (single output problem), or a list of arrays of
|
|
class labels (multi-output problem).
|
|
|
|
n_classes_ : int or list
|
|
The number of classes (single output problem), or a list containing the
|
|
number of classes for each output (multi-output problem).
|
|
|
|
n_features_ : int
|
|
The number of features when ``fit`` is performed.
|
|
|
|
n_outputs_ : int
|
|
The number of outputs when ``fit`` is performed.
|
|
|
|
feature_importances_ : ndarray of shape (n_features,)
|
|
The impurity-based feature importances.
|
|
The higher, the more important the feature.
|
|
The importance of a feature is computed as the (normalized)
|
|
total reduction of the criterion brought by that feature. It is also
|
|
known as the Gini importance.
|
|
|
|
Warning: impurity-based feature importances can be misleading for
|
|
high cardinality features (many unique values). See
|
|
:func:`sklearn.inspection.permutation_importance` as an alternative.
|
|
|
|
oob_score_ : float
|
|
Score of the training dataset obtained using an out-of-bag estimate.
|
|
This attribute exists only when ``oob_score`` is True.
|
|
|
|
oob_decision_function_ : ndarray of shape (n_samples, n_classes)
|
|
Decision function computed with out-of-bag estimate on the training
|
|
set. If n_estimators is small it might be possible that a data point
|
|
was never left out during the bootstrap. In this case,
|
|
`oob_decision_function_` might contain NaN. This attribute exists
|
|
only when ``oob_score`` is True.
|
|
|
|
See Also
|
|
--------
|
|
DecisionTreeClassifier, ExtraTreesClassifier
|
|
|
|
Notes
|
|
-----
|
|
The default values for the parameters controlling the size of the trees
|
|
(e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
|
|
unpruned trees which can potentially be very large on some data sets. To
|
|
reduce memory consumption, the complexity and size of the trees should be
|
|
controlled by setting those parameter values.
|
|
|
|
The features are always randomly permuted at each split. Therefore,
|
|
the best found split may vary, even with the same training data,
|
|
``max_features=n_features`` and ``bootstrap=False``, if the improvement
|
|
of the criterion is identical for several splits enumerated during the
|
|
search of the best split. To obtain a deterministic behaviour during
|
|
fitting, ``random_state`` has to be fixed.
|
|
|
|
References
|
|
----------
|
|
.. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.ensemble import RandomForestClassifier
|
|
>>> from sklearn.datasets import make_classification
|
|
>>> X, y = make_classification(n_samples=1000, n_features=4,
|
|
... n_informative=2, n_redundant=0,
|
|
... random_state=0, shuffle=False)
|
|
>>> clf = RandomForestClassifier(max_depth=2, random_state=0)
|
|
>>> clf.fit(X, y)
|
|
RandomForestClassifier(...)
|
|
>>> print(clf.predict([[0, 0, 0, 0]]))
|
|
[1]
|
|
"""
|
|
@_deprecate_positional_args
|
|
def __init__(self,
|
|
n_estimators=100, *,
|
|
criterion="gini",
|
|
max_depth=None,
|
|
min_samples_split=2,
|
|
min_samples_leaf=1,
|
|
min_weight_fraction_leaf=0.,
|
|
max_features="auto",
|
|
max_leaf_nodes=None,
|
|
min_impurity_decrease=0.,
|
|
min_impurity_split=None,
|
|
bootstrap=True,
|
|
oob_score=False,
|
|
n_jobs=None,
|
|
random_state=None,
|
|
verbose=0,
|
|
warm_start=False,
|
|
class_weight=None,
|
|
ccp_alpha=0.0,
|
|
max_samples=None):
|
|
super().__init__(
|
|
base_estimator=DecisionTreeClassifier(),
|
|
n_estimators=n_estimators,
|
|
estimator_params=("criterion", "max_depth", "min_samples_split",
|
|
"min_samples_leaf", "min_weight_fraction_leaf",
|
|
"max_features", "max_leaf_nodes",
|
|
"min_impurity_decrease", "min_impurity_split",
|
|
"random_state", "ccp_alpha"),
|
|
bootstrap=bootstrap,
|
|
oob_score=oob_score,
|
|
n_jobs=n_jobs,
|
|
random_state=random_state,
|
|
verbose=verbose,
|
|
warm_start=warm_start,
|
|
class_weight=class_weight,
|
|
max_samples=max_samples)
|
|
|
|
self.criterion = criterion
|
|
self.max_depth = max_depth
|
|
self.min_samples_split = min_samples_split
|
|
self.min_samples_leaf = min_samples_leaf
|
|
self.min_weight_fraction_leaf = min_weight_fraction_leaf
|
|
self.max_features = max_features
|
|
self.max_leaf_nodes = max_leaf_nodes
|
|
self.min_impurity_decrease = min_impurity_decrease
|
|
self.min_impurity_split = min_impurity_split
|
|
self.ccp_alpha = ccp_alpha
|
|
|
|
|
|
class RandomForestRegressor(ForestRegressor):
|
|
"""
|
|
A random forest regressor.
|
|
|
|
A random forest is a meta estimator that fits a number of classifying
|
|
decision trees on various sub-samples of the dataset and uses averaging
|
|
to improve the predictive accuracy and control over-fitting.
|
|
The sub-sample size is controlled with the `max_samples` parameter if
|
|
`bootstrap=True` (default), otherwise the whole dataset is used to build
|
|
each tree.
|
|
|
|
Read more in the :ref:`User Guide <forest>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_estimators : int, default=100
|
|
The number of trees in the forest.
|
|
|
|
.. versionchanged:: 0.22
|
|
The default value of ``n_estimators`` changed from 10 to 100
|
|
in 0.22.
|
|
|
|
criterion : {"mse", "mae"}, default="mse"
|
|
The function to measure the quality of a split. Supported criteria
|
|
are "mse" for the mean squared error, which is equal to variance
|
|
reduction as feature selection criterion, and "mae" for the mean
|
|
absolute error.
|
|
|
|
.. versionadded:: 0.18
|
|
Mean Absolute Error (MAE) criterion.
|
|
|
|
max_depth : int, default=None
|
|
The maximum depth of the tree. If None, then nodes are expanded until
|
|
all leaves are pure or until all leaves contain less than
|
|
min_samples_split samples.
|
|
|
|
min_samples_split : int or float, default=2
|
|
The minimum number of samples required to split an internal node:
|
|
|
|
- If int, then consider `min_samples_split` as the minimum number.
|
|
- If float, then `min_samples_split` is a fraction and
|
|
`ceil(min_samples_split * n_samples)` are the minimum
|
|
number of samples for each split.
|
|
|
|
.. versionchanged:: 0.18
|
|
Added float values for fractions.
|
|
|
|
min_samples_leaf : int or float, default=1
|
|
The minimum number of samples required to be at a leaf node.
|
|
A split point at any depth will only be considered if it leaves at
|
|
least ``min_samples_leaf`` training samples in each of the left and
|
|
right branches. This may have the effect of smoothing the model,
|
|
especially in regression.
|
|
|
|
- If int, then consider `min_samples_leaf` as the minimum number.
|
|
- If float, then `min_samples_leaf` is a fraction and
|
|
`ceil(min_samples_leaf * n_samples)` are the minimum
|
|
number of samples for each node.
|
|
|
|
.. versionchanged:: 0.18
|
|
Added float values for fractions.
|
|
|
|
min_weight_fraction_leaf : float, default=0.0
|
|
The minimum weighted fraction of the sum total of weights (of all
|
|
the input samples) required to be at a leaf node. Samples have
|
|
equal weight when sample_weight is not provided.
|
|
|
|
max_features : {"auto", "sqrt", "log2"}, int or float, default="auto"
|
|
The number of features to consider when looking for the best split:
|
|
|
|
- If int, then consider `max_features` features at each split.
|
|
- If float, then `max_features` is a fraction and
|
|
`int(max_features * n_features)` features are considered at each
|
|
split.
|
|
- If "auto", then `max_features=n_features`.
|
|
- If "sqrt", then `max_features=sqrt(n_features)`.
|
|
- If "log2", then `max_features=log2(n_features)`.
|
|
- If None, then `max_features=n_features`.
|
|
|
|
Note: the search for a split does not stop until at least one
|
|
valid partition of the node samples is found, even if it requires to
|
|
effectively inspect more than ``max_features`` features.
|
|
|
|
max_leaf_nodes : int, default=None
|
|
Grow trees with ``max_leaf_nodes`` in best-first fashion.
|
|
Best nodes are defined as relative reduction in impurity.
|
|
If None then unlimited number of leaf nodes.
|
|
|
|
min_impurity_decrease : float, default=0.0
|
|
A node will be split if this split induces a decrease of the impurity
|
|
greater than or equal to this value.
|
|
|
|
The weighted impurity decrease equation is the following::
|
|
|
|
N_t / N * (impurity - N_t_R / N_t * right_impurity
|
|
- N_t_L / N_t * left_impurity)
|
|
|
|
where ``N`` is the total number of samples, ``N_t`` is the number of
|
|
samples at the current node, ``N_t_L`` is the number of samples in the
|
|
left child, and ``N_t_R`` is the number of samples in the right child.
|
|
|
|
``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
|
|
if ``sample_weight`` is passed.
|
|
|
|
.. versionadded:: 0.19
|
|
|
|
min_impurity_split : float, default=None
|
|
Threshold for early stopping in tree growth. A node will split
|
|
if its impurity is above the threshold, otherwise it is a leaf.
|
|
|
|
.. deprecated:: 0.19
|
|
``min_impurity_split`` has been deprecated in favor of
|
|
``min_impurity_decrease`` in 0.19. The default value of
|
|
``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it
|
|
will be removed in 0.25. Use ``min_impurity_decrease`` instead.
|
|
|
|
bootstrap : bool, default=True
|
|
Whether bootstrap samples are used when building trees. If False, the
|
|
whole dataset is used to build each tree.
|
|
|
|
oob_score : bool, default=False
|
|
whether to use out-of-bag samples to estimate
|
|
the R^2 on unseen data.
|
|
|
|
n_jobs : int, default=None
|
|
The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,
|
|
:meth:`decision_path` and :meth:`apply` are all parallelized over the
|
|
trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
|
|
context. ``-1`` means using all processors. See :term:`Glossary
|
|
<n_jobs>` for more details.
|
|
|
|
random_state : int or RandomState, default=None
|
|
Controls both the randomness of the bootstrapping of the samples used
|
|
when building trees (if ``bootstrap=True``) and the sampling of the
|
|
features to consider when looking for the best split at each node
|
|
(if ``max_features < n_features``).
|
|
See :term:`Glossary <random_state>` for details.
|
|
|
|
verbose : int, default=0
|
|
Controls the verbosity when fitting and predicting.
|
|
|
|
warm_start : bool, default=False
|
|
When set to ``True``, reuse the solution of the previous call to fit
|
|
and add more estimators to the ensemble, otherwise, just fit a whole
|
|
new forest. See :term:`the Glossary <warm_start>`.
|
|
|
|
ccp_alpha : non-negative float, default=0.0
|
|
Complexity parameter used for Minimal Cost-Complexity Pruning. The
|
|
subtree with the largest cost complexity that is smaller than
|
|
``ccp_alpha`` will be chosen. By default, no pruning is performed. See
|
|
:ref:`minimal_cost_complexity_pruning` for details.
|
|
|
|
.. versionadded:: 0.22
|
|
|
|
max_samples : int or float, default=None
|
|
If bootstrap is True, the number of samples to draw from X
|
|
to train each base estimator.
|
|
|
|
- If None (default), then draw `X.shape[0]` samples.
|
|
- If int, then draw `max_samples` samples.
|
|
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
|
|
`max_samples` should be in the interval `(0, 1)`.
|
|
|
|
.. versionadded:: 0.22
|
|
|
|
Attributes
|
|
----------
|
|
base_estimator_ : DecisionTreeRegressor
|
|
The child estimator template used to create the collection of fitted
|
|
sub-estimators.
|
|
|
|
estimators_ : list of DecisionTreeRegressor
|
|
The collection of fitted sub-estimators.
|
|
|
|
feature_importances_ : ndarray of shape (n_features,)
|
|
The impurity-based feature importances.
|
|
The higher, the more important the feature.
|
|
The importance of a feature is computed as the (normalized)
|
|
total reduction of the criterion brought by that feature. It is also
|
|
known as the Gini importance.
|
|
|
|
Warning: impurity-based feature importances can be misleading for
|
|
high cardinality features (many unique values). See
|
|
:func:`sklearn.inspection.permutation_importance` as an alternative.
|
|
|
|
n_features_ : int
|
|
The number of features when ``fit`` is performed.
|
|
|
|
n_outputs_ : int
|
|
The number of outputs when ``fit`` is performed.
|
|
|
|
oob_score_ : float
|
|
Score of the training dataset obtained using an out-of-bag estimate.
|
|
This attribute exists only when ``oob_score`` is True.
|
|
|
|
oob_prediction_ : ndarray of shape (n_samples,)
|
|
Prediction computed with out-of-bag estimate on the training set.
|
|
This attribute exists only when ``oob_score`` is True.
|
|
|
|
See Also
|
|
--------
|
|
DecisionTreeRegressor, ExtraTreesRegressor
|
|
|
|
Notes
|
|
-----
|
|
The default values for the parameters controlling the size of the trees
|
|
(e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
|
|
unpruned trees which can potentially be very large on some data sets. To
|
|
reduce memory consumption, the complexity and size of the trees should be
|
|
controlled by setting those parameter values.
|
|
|
|
The features are always randomly permuted at each split. Therefore,
|
|
the best found split may vary, even with the same training data,
|
|
``max_features=n_features`` and ``bootstrap=False``, if the improvement
|
|
of the criterion is identical for several splits enumerated during the
|
|
search of the best split. To obtain a deterministic behaviour during
|
|
fitting, ``random_state`` has to be fixed.
|
|
|
|
The default value ``max_features="auto"`` uses ``n_features``
|
|
rather than ``n_features / 3``. The latter was originally suggested in
|
|
[1], whereas the former was more recently justified empirically in [2].
|
|
|
|
References
|
|
----------
|
|
.. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.
|
|
|
|
.. [2] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized
|
|
trees", Machine Learning, 63(1), 3-42, 2006.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.ensemble import RandomForestRegressor
|
|
>>> from sklearn.datasets import make_regression
|
|
>>> X, y = make_regression(n_features=4, n_informative=2,
|
|
... random_state=0, shuffle=False)
|
|
>>> regr = RandomForestRegressor(max_depth=2, random_state=0)
|
|
>>> regr.fit(X, y)
|
|
RandomForestRegressor(...)
|
|
>>> print(regr.predict([[0, 0, 0, 0]]))
|
|
[-8.32987858]
|
|
"""
|
|
@_deprecate_positional_args
|
|
def __init__(self,
|
|
n_estimators=100, *,
|
|
criterion="mse",
|
|
max_depth=None,
|
|
min_samples_split=2,
|
|
min_samples_leaf=1,
|
|
min_weight_fraction_leaf=0.,
|
|
max_features="auto",
|
|
max_leaf_nodes=None,
|
|
min_impurity_decrease=0.,
|
|
min_impurity_split=None,
|
|
bootstrap=True,
|
|
oob_score=False,
|
|
n_jobs=None,
|
|
random_state=None,
|
|
verbose=0,
|
|
warm_start=False,
|
|
ccp_alpha=0.0,
|
|
max_samples=None):
|
|
super().__init__(
|
|
base_estimator=DecisionTreeRegressor(),
|
|
n_estimators=n_estimators,
|
|
estimator_params=("criterion", "max_depth", "min_samples_split",
|
|
"min_samples_leaf", "min_weight_fraction_leaf",
|
|
"max_features", "max_leaf_nodes",
|
|
"min_impurity_decrease", "min_impurity_split",
|
|
"random_state", "ccp_alpha"),
|
|
bootstrap=bootstrap,
|
|
oob_score=oob_score,
|
|
n_jobs=n_jobs,
|
|
random_state=random_state,
|
|
verbose=verbose,
|
|
warm_start=warm_start,
|
|
max_samples=max_samples)
|
|
|
|
self.criterion = criterion
|
|
self.max_depth = max_depth
|
|
self.min_samples_split = min_samples_split
|
|
self.min_samples_leaf = min_samples_leaf
|
|
self.min_weight_fraction_leaf = min_weight_fraction_leaf
|
|
self.max_features = max_features
|
|
self.max_leaf_nodes = max_leaf_nodes
|
|
self.min_impurity_decrease = min_impurity_decrease
|
|
self.min_impurity_split = min_impurity_split
|
|
self.ccp_alpha = ccp_alpha
|
|
|
|
|
|
class ExtraTreesClassifier(ForestClassifier):
|
|
"""
|
|
An extra-trees classifier.
|
|
|
|
This class implements a meta estimator that fits a number of
|
|
randomized decision trees (a.k.a. extra-trees) on various sub-samples
|
|
of the dataset and uses averaging to improve the predictive accuracy
|
|
and control over-fitting.
|
|
|
|
Read more in the :ref:`User Guide <forest>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_estimators : int, default=100
|
|
The number of trees in the forest.
|
|
|
|
.. versionchanged:: 0.22
|
|
The default value of ``n_estimators`` changed from 10 to 100
|
|
in 0.22.
|
|
|
|
criterion : {"gini", "entropy"}, default="gini"
|
|
The function to measure the quality of a split. Supported criteria are
|
|
"gini" for the Gini impurity and "entropy" for the information gain.
|
|
|
|
max_depth : int, default=None
|
|
The maximum depth of the tree. If None, then nodes are expanded until
|
|
all leaves are pure or until all leaves contain less than
|
|
min_samples_split samples.
|
|
|
|
min_samples_split : int or float, default=2
|
|
The minimum number of samples required to split an internal node:
|
|
|
|
- If int, then consider `min_samples_split` as the minimum number.
|
|
- If float, then `min_samples_split` is a fraction and
|
|
`ceil(min_samples_split * n_samples)` are the minimum
|
|
number of samples for each split.
|
|
|
|
.. versionchanged:: 0.18
|
|
Added float values for fractions.
|
|
|
|
min_samples_leaf : int or float, default=1
|
|
The minimum number of samples required to be at a leaf node.
|
|
A split point at any depth will only be considered if it leaves at
|
|
least ``min_samples_leaf`` training samples in each of the left and
|
|
right branches. This may have the effect of smoothing the model,
|
|
especially in regression.
|
|
|
|
- If int, then consider `min_samples_leaf` as the minimum number.
|
|
- If float, then `min_samples_leaf` is a fraction and
|
|
`ceil(min_samples_leaf * n_samples)` are the minimum
|
|
number of samples for each node.
|
|
|
|
.. versionchanged:: 0.18
|
|
Added float values for fractions.
|
|
|
|
min_weight_fraction_leaf : float, default=0.0
|
|
The minimum weighted fraction of the sum total of weights (of all
|
|
the input samples) required to be at a leaf node. Samples have
|
|
equal weight when sample_weight is not provided.
|
|
|
|
max_features : {"auto", "sqrt", "log2"}, int or float, default="auto"
|
|
The number of features to consider when looking for the best split:
|
|
|
|
- If int, then consider `max_features` features at each split.
|
|
- If float, then `max_features` is a fraction and
|
|
`int(max_features * n_features)` features are considered at each
|
|
split.
|
|
- If "auto", then `max_features=sqrt(n_features)`.
|
|
- If "sqrt", then `max_features=sqrt(n_features)`.
|
|
- If "log2", then `max_features=log2(n_features)`.
|
|
- If None, then `max_features=n_features`.
|
|
|
|
Note: the search for a split does not stop until at least one
|
|
valid partition of the node samples is found, even if it requires to
|
|
effectively inspect more than ``max_features`` features.
|
|
|
|
max_leaf_nodes : int, default=None
|
|
Grow trees with ``max_leaf_nodes`` in best-first fashion.
|
|
Best nodes are defined as relative reduction in impurity.
|
|
If None then unlimited number of leaf nodes.
|
|
|
|
min_impurity_decrease : float, default=0.0
|
|
A node will be split if this split induces a decrease of the impurity
|
|
greater than or equal to this value.
|
|
|
|
The weighted impurity decrease equation is the following::
|
|
|
|
N_t / N * (impurity - N_t_R / N_t * right_impurity
|
|
- N_t_L / N_t * left_impurity)
|
|
|
|
where ``N`` is the total number of samples, ``N_t`` is the number of
|
|
samples at the current node, ``N_t_L`` is the number of samples in the
|
|
left child, and ``N_t_R`` is the number of samples in the right child.
|
|
|
|
``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
|
|
if ``sample_weight`` is passed.
|
|
|
|
.. versionadded:: 0.19
|
|
|
|
min_impurity_split : float, default=None
|
|
Threshold for early stopping in tree growth. A node will split
|
|
if its impurity is above the threshold, otherwise it is a leaf.
|
|
|
|
.. deprecated:: 0.19
|
|
``min_impurity_split`` has been deprecated in favor of
|
|
``min_impurity_decrease`` in 0.19. The default value of
|
|
``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it
|
|
will be removed in 0.25. Use ``min_impurity_decrease`` instead.
|
|
|
|
bootstrap : bool, default=False
|
|
Whether bootstrap samples are used when building trees. If False, the
|
|
whole dataset is used to build each tree.
|
|
|
|
oob_score : bool, default=False
|
|
Whether to use out-of-bag samples to estimate
|
|
the generalization accuracy.
|
|
|
|
n_jobs : int, default=None
|
|
The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,
|
|
:meth:`decision_path` and :meth:`apply` are all parallelized over the
|
|
trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
|
|
context. ``-1`` means using all processors. See :term:`Glossary
|
|
<n_jobs>` for more details.
|
|
|
|
random_state : int, RandomState, default=None
|
|
Controls 3 sources of randomness:
|
|
|
|
- the bootstrapping of the samples used when building trees
|
|
(if ``bootstrap=True``)
|
|
- the sampling of the features to consider when looking for the best
|
|
split at each node (if ``max_features < n_features``)
|
|
- the draw of the splits for each of the `max_features`
|
|
|
|
See :term:`Glossary <random_state>` for details.
|
|
|
|
verbose : int, default=0
|
|
Controls the verbosity when fitting and predicting.
|
|
|
|
warm_start : bool, default=False
|
|
When set to ``True``, reuse the solution of the previous call to fit
|
|
and add more estimators to the ensemble, otherwise, just fit a whole
|
|
new forest. See :term:`the Glossary <warm_start>`.
|
|
|
|
class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, \
|
|
default=None
|
|
Weights associated with classes in the form ``{class_label: weight}``.
|
|
If not given, all classes are supposed to have weight one. For
|
|
multi-output problems, a list of dicts can be provided in the same
|
|
order as the columns of y.
|
|
|
|
Note that for multioutput (including multilabel) weights should be
|
|
defined for each class of every column in its own dict. For example,
|
|
for four-class multilabel classification weights should be
|
|
[{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
|
|
[{1:1}, {2:5}, {3:1}, {4:1}].
|
|
|
|
The "balanced" mode uses the values of y to automatically adjust
|
|
weights inversely proportional to class frequencies in the input data
|
|
as ``n_samples / (n_classes * np.bincount(y))``
|
|
|
|
The "balanced_subsample" mode is the same as "balanced" except that
|
|
weights are computed based on the bootstrap sample for every tree
|
|
grown.
|
|
|
|
For multi-output, the weights of each column of y will be multiplied.
|
|
|
|
Note that these weights will be multiplied with sample_weight (passed
|
|
through the fit method) if sample_weight is specified.
|
|
|
|
ccp_alpha : non-negative float, default=0.0
|
|
Complexity parameter used for Minimal Cost-Complexity Pruning. The
|
|
subtree with the largest cost complexity that is smaller than
|
|
``ccp_alpha`` will be chosen. By default, no pruning is performed. See
|
|
:ref:`minimal_cost_complexity_pruning` for details.
|
|
|
|
.. versionadded:: 0.22
|
|
|
|
max_samples : int or float, default=None
|
|
If bootstrap is True, the number of samples to draw from X
|
|
to train each base estimator.
|
|
|
|
- If None (default), then draw `X.shape[0]` samples.
|
|
- If int, then draw `max_samples` samples.
|
|
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
|
|
`max_samples` should be in the interval `(0, 1)`.
|
|
|
|
.. versionadded:: 0.22
|
|
|
|
Attributes
|
|
----------
|
|
base_estimator_ : ExtraTreesClassifier
|
|
The child estimator template used to create the collection of fitted
|
|
sub-estimators.
|
|
|
|
estimators_ : list of DecisionTreeClassifier
|
|
The collection of fitted sub-estimators.
|
|
|
|
classes_ : ndarray of shape (n_classes,) or a list of such arrays
|
|
The classes labels (single output problem), or a list of arrays of
|
|
class labels (multi-output problem).
|
|
|
|
n_classes_ : int or list
|
|
The number of classes (single output problem), or a list containing the
|
|
number of classes for each output (multi-output problem).
|
|
|
|
feature_importances_ : ndarray of shape (n_features,)
|
|
The impurity-based feature importances.
|
|
The higher, the more important the feature.
|
|
The importance of a feature is computed as the (normalized)
|
|
total reduction of the criterion brought by that feature. It is also
|
|
known as the Gini importance.
|
|
|
|
Warning: impurity-based feature importances can be misleading for
|
|
high cardinality features (many unique values). See
|
|
:func:`sklearn.inspection.permutation_importance` as an alternative.
|
|
|
|
n_features_ : int
|
|
The number of features when ``fit`` is performed.
|
|
|
|
n_outputs_ : int
|
|
The number of outputs when ``fit`` is performed.
|
|
|
|
oob_score_ : float
|
|
Score of the training dataset obtained using an out-of-bag estimate.
|
|
This attribute exists only when ``oob_score`` is True.
|
|
|
|
oob_decision_function_ : ndarray of shape (n_samples, n_classes)
|
|
Decision function computed with out-of-bag estimate on the training
|
|
set. If n_estimators is small it might be possible that a data point
|
|
was never left out during the bootstrap. In this case,
|
|
`oob_decision_function_` might contain NaN. This attribute exists
|
|
only when ``oob_score`` is True.
|
|
|
|
See Also
|
|
--------
|
|
sklearn.tree.ExtraTreeClassifier : Base classifier for this ensemble.
|
|
RandomForestClassifier : Ensemble Classifier based on trees with optimal
|
|
splits.
|
|
|
|
Notes
|
|
-----
|
|
The default values for the parameters controlling the size of the trees
|
|
(e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
|
|
unpruned trees which can potentially be very large on some data sets. To
|
|
reduce memory consumption, the complexity and size of the trees should be
|
|
controlled by setting those parameter values.
|
|
|
|
References
|
|
----------
|
|
.. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized
|
|
trees", Machine Learning, 63(1), 3-42, 2006.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.ensemble import ExtraTreesClassifier
|
|
>>> from sklearn.datasets import make_classification
|
|
>>> X, y = make_classification(n_features=4, random_state=0)
|
|
>>> clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
|
|
>>> clf.fit(X, y)
|
|
ExtraTreesClassifier(random_state=0)
|
|
>>> clf.predict([[0, 0, 0, 0]])
|
|
array([1])
|
|
"""
|
|
@_deprecate_positional_args
|
|
def __init__(self,
|
|
n_estimators=100, *,
|
|
criterion="gini",
|
|
max_depth=None,
|
|
min_samples_split=2,
|
|
min_samples_leaf=1,
|
|
min_weight_fraction_leaf=0.,
|
|
max_features="auto",
|
|
max_leaf_nodes=None,
|
|
min_impurity_decrease=0.,
|
|
min_impurity_split=None,
|
|
bootstrap=False,
|
|
oob_score=False,
|
|
n_jobs=None,
|
|
random_state=None,
|
|
verbose=0,
|
|
warm_start=False,
|
|
class_weight=None,
|
|
ccp_alpha=0.0,
|
|
max_samples=None):
|
|
super().__init__(
|
|
base_estimator=ExtraTreeClassifier(),
|
|
n_estimators=n_estimators,
|
|
estimator_params=("criterion", "max_depth", "min_samples_split",
|
|
"min_samples_leaf", "min_weight_fraction_leaf",
|
|
"max_features", "max_leaf_nodes",
|
|
"min_impurity_decrease", "min_impurity_split",
|
|
"random_state", "ccp_alpha"),
|
|
bootstrap=bootstrap,
|
|
oob_score=oob_score,
|
|
n_jobs=n_jobs,
|
|
random_state=random_state,
|
|
verbose=verbose,
|
|
warm_start=warm_start,
|
|
class_weight=class_weight,
|
|
max_samples=max_samples)
|
|
|
|
self.criterion = criterion
|
|
self.max_depth = max_depth
|
|
self.min_samples_split = min_samples_split
|
|
self.min_samples_leaf = min_samples_leaf
|
|
self.min_weight_fraction_leaf = min_weight_fraction_leaf
|
|
self.max_features = max_features
|
|
self.max_leaf_nodes = max_leaf_nodes
|
|
self.min_impurity_decrease = min_impurity_decrease
|
|
self.min_impurity_split = min_impurity_split
|
|
self.ccp_alpha = ccp_alpha
|
|
|
|
|
|
class ExtraTreesRegressor(ForestRegressor):
|
|
"""
|
|
An extra-trees regressor.
|
|
|
|
This class implements a meta estimator that fits a number of
|
|
randomized decision trees (a.k.a. extra-trees) on various sub-samples
|
|
of the dataset and uses averaging to improve the predictive accuracy
|
|
and control over-fitting.
|
|
|
|
Read more in the :ref:`User Guide <forest>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_estimators : int, default=100
|
|
The number of trees in the forest.
|
|
|
|
.. versionchanged:: 0.22
|
|
The default value of ``n_estimators`` changed from 10 to 100
|
|
in 0.22.
|
|
|
|
criterion : {"mse", "mae"}, default="mse"
|
|
The function to measure the quality of a split. Supported criteria
|
|
are "mse" for the mean squared error, which is equal to variance
|
|
reduction as feature selection criterion, and "mae" for the mean
|
|
absolute error.
|
|
|
|
.. versionadded:: 0.18
|
|
Mean Absolute Error (MAE) criterion.
|
|
|
|
max_depth : int, default=None
|
|
The maximum depth of the tree. If None, then nodes are expanded until
|
|
all leaves are pure or until all leaves contain less than
|
|
min_samples_split samples.
|
|
|
|
min_samples_split : int or float, default=2
|
|
The minimum number of samples required to split an internal node:
|
|
|
|
- If int, then consider `min_samples_split` as the minimum number.
|
|
- If float, then `min_samples_split` is a fraction and
|
|
`ceil(min_samples_split * n_samples)` are the minimum
|
|
number of samples for each split.
|
|
|
|
.. versionchanged:: 0.18
|
|
Added float values for fractions.
|
|
|
|
min_samples_leaf : int or float, default=1
|
|
The minimum number of samples required to be at a leaf node.
|
|
A split point at any depth will only be considered if it leaves at
|
|
least ``min_samples_leaf`` training samples in each of the left and
|
|
right branches. This may have the effect of smoothing the model,
|
|
especially in regression.
|
|
|
|
- If int, then consider `min_samples_leaf` as the minimum number.
|
|
- If float, then `min_samples_leaf` is a fraction and
|
|
`ceil(min_samples_leaf * n_samples)` are the minimum
|
|
number of samples for each node.
|
|
|
|
.. versionchanged:: 0.18
|
|
Added float values for fractions.
|
|
|
|
min_weight_fraction_leaf : float, default=0.0
|
|
The minimum weighted fraction of the sum total of weights (of all
|
|
the input samples) required to be at a leaf node. Samples have
|
|
equal weight when sample_weight is not provided.
|
|
|
|
max_features : {"auto", "sqrt", "log2"} int or float, default="auto"
|
|
The number of features to consider when looking for the best split:
|
|
|
|
- If int, then consider `max_features` features at each split.
|
|
- If float, then `max_features` is a fraction and
|
|
`int(max_features * n_features)` features are considered at each
|
|
split.
|
|
- If "auto", then `max_features=n_features`.
|
|
- If "sqrt", then `max_features=sqrt(n_features)`.
|
|
- If "log2", then `max_features=log2(n_features)`.
|
|
- If None, then `max_features=n_features`.
|
|
|
|
Note: the search for a split does not stop until at least one
|
|
valid partition of the node samples is found, even if it requires to
|
|
effectively inspect more than ``max_features`` features.
|
|
|
|
max_leaf_nodes : int, default=None
|
|
Grow trees with ``max_leaf_nodes`` in best-first fashion.
|
|
Best nodes are defined as relative reduction in impurity.
|
|
If None then unlimited number of leaf nodes.
|
|
|
|
min_impurity_decrease : float, default=0.0
|
|
A node will be split if this split induces a decrease of the impurity
|
|
greater than or equal to this value.
|
|
|
|
The weighted impurity decrease equation is the following::
|
|
|
|
N_t / N * (impurity - N_t_R / N_t * right_impurity
|
|
- N_t_L / N_t * left_impurity)
|
|
|
|
where ``N`` is the total number of samples, ``N_t`` is the number of
|
|
samples at the current node, ``N_t_L`` is the number of samples in the
|
|
left child, and ``N_t_R`` is the number of samples in the right child.
|
|
|
|
``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
|
|
if ``sample_weight`` is passed.
|
|
|
|
.. versionadded:: 0.19
|
|
|
|
min_impurity_split : float, default=None
|
|
Threshold for early stopping in tree growth. A node will split
|
|
if its impurity is above the threshold, otherwise it is a leaf.
|
|
|
|
.. deprecated:: 0.19
|
|
``min_impurity_split`` has been deprecated in favor of
|
|
``min_impurity_decrease`` in 0.19. The default value of
|
|
``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it
|
|
will be removed in 0.25. Use ``min_impurity_decrease`` instead.
|
|
|
|
bootstrap : bool, default=False
|
|
Whether bootstrap samples are used when building trees. If False, the
|
|
whole dataset is used to build each tree.
|
|
|
|
oob_score : bool, default=False
|
|
Whether to use out-of-bag samples to estimate the R^2 on unseen data.
|
|
|
|
n_jobs : int, default=None
|
|
The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,
|
|
:meth:`decision_path` and :meth:`apply` are all parallelized over the
|
|
trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
|
|
context. ``-1`` means using all processors. See :term:`Glossary
|
|
<n_jobs>` for more details.
|
|
|
|
random_state : int or RandomState, default=None
|
|
Controls 3 sources of randomness:
|
|
|
|
- the bootstrapping of the samples used when building trees
|
|
(if ``bootstrap=True``)
|
|
- the sampling of the features to consider when looking for the best
|
|
split at each node (if ``max_features < n_features``)
|
|
- the draw of the splits for each of the `max_features`
|
|
|
|
See :term:`Glossary <random_state>` for details.
|
|
|
|
verbose : int, default=0
|
|
Controls the verbosity when fitting and predicting.
|
|
|
|
warm_start : bool, default=False
|
|
When set to ``True``, reuse the solution of the previous call to fit
|
|
and add more estimators to the ensemble, otherwise, just fit a whole
|
|
new forest. See :term:`the Glossary <warm_start>`.
|
|
|
|
ccp_alpha : non-negative float, default=0.0
|
|
Complexity parameter used for Minimal Cost-Complexity Pruning. The
|
|
subtree with the largest cost complexity that is smaller than
|
|
``ccp_alpha`` will be chosen. By default, no pruning is performed. See
|
|
:ref:`minimal_cost_complexity_pruning` for details.
|
|
|
|
.. versionadded:: 0.22
|
|
|
|
max_samples : int or float, default=None
|
|
If bootstrap is True, the number of samples to draw from X
|
|
to train each base estimator.
|
|
|
|
- If None (default), then draw `X.shape[0]` samples.
|
|
- If int, then draw `max_samples` samples.
|
|
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
|
|
`max_samples` should be in the interval `(0, 1)`.
|
|
|
|
.. versionadded:: 0.22
|
|
|
|
Attributes
|
|
----------
|
|
base_estimator_ : ExtraTreeRegressor
|
|
The child estimator template used to create the collection of fitted
|
|
sub-estimators.
|
|
|
|
estimators_ : list of DecisionTreeRegressor
|
|
The collection of fitted sub-estimators.
|
|
|
|
feature_importances_ : ndarray of shape (n_features,)
|
|
The impurity-based feature importances.
|
|
The higher, the more important the feature.
|
|
The importance of a feature is computed as the (normalized)
|
|
total reduction of the criterion brought by that feature. It is also
|
|
known as the Gini importance.
|
|
|
|
Warning: impurity-based feature importances can be misleading for
|
|
high cardinality features (many unique values). See
|
|
:func:`sklearn.inspection.permutation_importance` as an alternative.
|
|
|
|
n_features_ : int
|
|
The number of features.
|
|
|
|
n_outputs_ : int
|
|
The number of outputs.
|
|
|
|
oob_score_ : float
|
|
Score of the training dataset obtained using an out-of-bag estimate.
|
|
This attribute exists only when ``oob_score`` is True.
|
|
|
|
oob_prediction_ : ndarray of shape (n_samples,)
|
|
Prediction computed with out-of-bag estimate on the training set.
|
|
This attribute exists only when ``oob_score`` is True.
|
|
|
|
See Also
|
|
--------
|
|
sklearn.tree.ExtraTreeRegressor: Base estimator for this ensemble.
|
|
RandomForestRegressor: Ensemble regressor using trees with optimal splits.
|
|
|
|
Notes
|
|
-----
|
|
The default values for the parameters controlling the size of the trees
|
|
(e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
|
|
unpruned trees which can potentially be very large on some data sets. To
|
|
reduce memory consumption, the complexity and size of the trees should be
|
|
controlled by setting those parameter values.
|
|
|
|
References
|
|
----------
|
|
.. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
|
|
Machine Learning, 63(1), 3-42, 2006.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import load_diabetes
|
|
>>> from sklearn.model_selection import train_test_split
|
|
>>> from sklearn.ensemble import ExtraTreesRegressor
|
|
>>> X, y = load_diabetes(return_X_y=True)
|
|
>>> X_train, X_test, y_train, y_test = train_test_split(
|
|
... X, y, random_state=0)
|
|
>>> reg = ExtraTreesRegressor(n_estimators=100, random_state=0).fit(
|
|
... X_train, y_train)
|
|
>>> reg.score(X_test, y_test)
|
|
0.2708...
|
|
"""
|
|
@_deprecate_positional_args
|
|
def __init__(self,
|
|
n_estimators=100, *,
|
|
criterion="mse",
|
|
max_depth=None,
|
|
min_samples_split=2,
|
|
min_samples_leaf=1,
|
|
min_weight_fraction_leaf=0.,
|
|
max_features="auto",
|
|
max_leaf_nodes=None,
|
|
min_impurity_decrease=0.,
|
|
min_impurity_split=None,
|
|
bootstrap=False,
|
|
oob_score=False,
|
|
n_jobs=None,
|
|
random_state=None,
|
|
verbose=0,
|
|
warm_start=False,
|
|
ccp_alpha=0.0,
|
|
max_samples=None):
|
|
super().__init__(
|
|
base_estimator=ExtraTreeRegressor(),
|
|
n_estimators=n_estimators,
|
|
estimator_params=("criterion", "max_depth", "min_samples_split",
|
|
"min_samples_leaf", "min_weight_fraction_leaf",
|
|
"max_features", "max_leaf_nodes",
|
|
"min_impurity_decrease", "min_impurity_split",
|
|
"random_state", "ccp_alpha"),
|
|
bootstrap=bootstrap,
|
|
oob_score=oob_score,
|
|
n_jobs=n_jobs,
|
|
random_state=random_state,
|
|
verbose=verbose,
|
|
warm_start=warm_start,
|
|
max_samples=max_samples)
|
|
|
|
self.criterion = criterion
|
|
self.max_depth = max_depth
|
|
self.min_samples_split = min_samples_split
|
|
self.min_samples_leaf = min_samples_leaf
|
|
self.min_weight_fraction_leaf = min_weight_fraction_leaf
|
|
self.max_features = max_features
|
|
self.max_leaf_nodes = max_leaf_nodes
|
|
self.min_impurity_decrease = min_impurity_decrease
|
|
self.min_impurity_split = min_impurity_split
|
|
self.ccp_alpha = ccp_alpha
|
|
|
|
|
|
class RandomTreesEmbedding(BaseForest):
|
|
"""
|
|
An ensemble of totally random trees.
|
|
|
|
An unsupervised transformation of a dataset to a high-dimensional
|
|
sparse representation. A datapoint is coded according to which leaf of
|
|
each tree it is sorted into. Using a one-hot encoding of the leaves,
|
|
this leads to a binary coding with as many ones as there are trees in
|
|
the forest.
|
|
|
|
The dimensionality of the resulting representation is
|
|
``n_out <= n_estimators * max_leaf_nodes``. If ``max_leaf_nodes == None``,
|
|
the number of leaf nodes is at most ``n_estimators * 2 ** max_depth``.
|
|
|
|
Read more in the :ref:`User Guide <random_trees_embedding>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_estimators : int, default=100
|
|
Number of trees in the forest.
|
|
|
|
.. versionchanged:: 0.22
|
|
The default value of ``n_estimators`` changed from 10 to 100
|
|
in 0.22.
|
|
|
|
max_depth : int, default=5
|
|
The maximum depth of each tree. If None, then nodes are expanded until
|
|
all leaves are pure or until all leaves contain less than
|
|
min_samples_split samples.
|
|
|
|
min_samples_split : int or float, default=2
|
|
The minimum number of samples required to split an internal node:
|
|
|
|
- If int, then consider `min_samples_split` as the minimum number.
|
|
- If float, then `min_samples_split` is a fraction and
|
|
`ceil(min_samples_split * n_samples)` is the minimum
|
|
number of samples for each split.
|
|
|
|
.. versionchanged:: 0.18
|
|
Added float values for fractions.
|
|
|
|
min_samples_leaf : int or float, default=1
|
|
The minimum number of samples required to be at a leaf node.
|
|
A split point at any depth will only be considered if it leaves at
|
|
least ``min_samples_leaf`` training samples in each of the left and
|
|
right branches. This may have the effect of smoothing the model,
|
|
especially in regression.
|
|
|
|
- If int, then consider `min_samples_leaf` as the minimum number.
|
|
- If float, then `min_samples_leaf` is a fraction and
|
|
`ceil(min_samples_leaf * n_samples)` is the minimum
|
|
number of samples for each node.
|
|
|
|
.. versionchanged:: 0.18
|
|
Added float values for fractions.
|
|
|
|
min_weight_fraction_leaf : float, default=0.0
|
|
The minimum weighted fraction of the sum total of weights (of all
|
|
the input samples) required to be at a leaf node. Samples have
|
|
equal weight when sample_weight is not provided.
|
|
|
|
max_leaf_nodes : int, default=None
|
|
Grow trees with ``max_leaf_nodes`` in best-first fashion.
|
|
Best nodes are defined as relative reduction in impurity.
|
|
If None then unlimited number of leaf nodes.
|
|
|
|
min_impurity_decrease : float, default=0.0
|
|
A node will be split if this split induces a decrease of the impurity
|
|
greater than or equal to this value.
|
|
|
|
The weighted impurity decrease equation is the following::
|
|
|
|
N_t / N * (impurity - N_t_R / N_t * right_impurity
|
|
- N_t_L / N_t * left_impurity)
|
|
|
|
where ``N`` is the total number of samples, ``N_t`` is the number of
|
|
samples at the current node, ``N_t_L`` is the number of samples in the
|
|
left child, and ``N_t_R`` is the number of samples in the right child.
|
|
|
|
``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
|
|
if ``sample_weight`` is passed.
|
|
|
|
.. versionadded:: 0.19
|
|
|
|
min_impurity_split : float, default=None
|
|
Threshold for early stopping in tree growth. A node will split
|
|
if its impurity is above the threshold, otherwise it is a leaf.
|
|
|
|
.. deprecated:: 0.19
|
|
``min_impurity_split`` has been deprecated in favor of
|
|
``min_impurity_decrease`` in 0.19. The default value of
|
|
``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it
|
|
will be removed in 0.25. Use ``min_impurity_decrease`` instead.
|
|
|
|
sparse_output : bool, default=True
|
|
Whether or not to return a sparse CSR matrix, as default behavior,
|
|
or to return a dense array compatible with dense pipeline operators.
|
|
|
|
n_jobs : int, default=None
|
|
The number of jobs to run in parallel. :meth:`fit`, :meth:`transform`,
|
|
:meth:`decision_path` and :meth:`apply` are all parallelized over the
|
|
trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
|
|
context. ``-1`` means using all processors. See :term:`Glossary
|
|
<n_jobs>` for more details.
|
|
|
|
random_state : int or RandomState, default=None
|
|
Controls the generation of the random `y` used to fit the trees
|
|
and the draw of the splits for each feature at the trees' nodes.
|
|
See :term:`Glossary <random_state>` for details.
|
|
|
|
verbose : int, default=0
|
|
Controls the verbosity when fitting and predicting.
|
|
|
|
warm_start : bool, default=False
|
|
When set to ``True``, reuse the solution of the previous call to fit
|
|
and add more estimators to the ensemble, otherwise, just fit a whole
|
|
new forest. See :term:`the Glossary <warm_start>`.
|
|
|
|
Attributes
|
|
----------
|
|
estimators_ : list of DecisionTreeClassifier
|
|
The collection of fitted sub-estimators.
|
|
|
|
References
|
|
----------
|
|
.. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
|
|
Machine Learning, 63(1), 3-42, 2006.
|
|
.. [2] Moosmann, F. and Triggs, B. and Jurie, F. "Fast discriminative
|
|
visual codebooks using randomized clustering forests"
|
|
NIPS 2007
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.ensemble import RandomTreesEmbedding
|
|
>>> X = [[0,0], [1,0], [0,1], [-1,0], [0,-1]]
|
|
>>> random_trees = RandomTreesEmbedding(
|
|
... n_estimators=5, random_state=0, max_depth=1).fit(X)
|
|
>>> X_sparse_embedding = random_trees.transform(X)
|
|
>>> X_sparse_embedding.toarray()
|
|
array([[0., 1., 1., 0., 1., 0., 0., 1., 1., 0.],
|
|
[0., 1., 1., 0., 1., 0., 0., 1., 1., 0.],
|
|
[0., 1., 0., 1., 0., 1., 0., 1., 0., 1.],
|
|
[1., 0., 1., 0., 1., 0., 1., 0., 1., 0.],
|
|
[0., 1., 1., 0., 1., 0., 0., 1., 1., 0.]])
|
|
"""
|
|
|
|
criterion = 'mse'
|
|
max_features = 1
|
|
|
|
@_deprecate_positional_args
|
|
def __init__(self,
|
|
n_estimators=100, *,
|
|
max_depth=5,
|
|
min_samples_split=2,
|
|
min_samples_leaf=1,
|
|
min_weight_fraction_leaf=0.,
|
|
max_leaf_nodes=None,
|
|
min_impurity_decrease=0.,
|
|
min_impurity_split=None,
|
|
sparse_output=True,
|
|
n_jobs=None,
|
|
random_state=None,
|
|
verbose=0,
|
|
warm_start=False):
|
|
super().__init__(
|
|
base_estimator=ExtraTreeRegressor(),
|
|
n_estimators=n_estimators,
|
|
estimator_params=("criterion", "max_depth", "min_samples_split",
|
|
"min_samples_leaf", "min_weight_fraction_leaf",
|
|
"max_features", "max_leaf_nodes",
|
|
"min_impurity_decrease", "min_impurity_split",
|
|
"random_state"),
|
|
bootstrap=False,
|
|
oob_score=False,
|
|
n_jobs=n_jobs,
|
|
random_state=random_state,
|
|
verbose=verbose,
|
|
warm_start=warm_start,
|
|
max_samples=None)
|
|
|
|
self.max_depth = max_depth
|
|
self.min_samples_split = min_samples_split
|
|
self.min_samples_leaf = min_samples_leaf
|
|
self.min_weight_fraction_leaf = min_weight_fraction_leaf
|
|
self.max_leaf_nodes = max_leaf_nodes
|
|
self.min_impurity_decrease = min_impurity_decrease
|
|
self.min_impurity_split = min_impurity_split
|
|
self.sparse_output = sparse_output
|
|
|
|
def _set_oob_score(self, X, y):
|
|
raise NotImplementedError("OOB score not supported by tree embedding")
|
|
|
|
def fit(self, X, y=None, sample_weight=None):
|
|
"""
|
|
Fit estimator.
|
|
|
|
Parameters
|
|
----------
|
|
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
|
The input samples. Use ``dtype=np.float32`` for maximum
|
|
efficiency. Sparse matrices are also supported, use sparse
|
|
``csc_matrix`` for maximum efficiency.
|
|
|
|
y : Ignored
|
|
Not used, present for API consistency by convention.
|
|
|
|
sample_weight : array-like of shape (n_samples,), default=None
|
|
Sample weights. If None, then samples are equally weighted. Splits
|
|
that would create child nodes with net zero or negative weight are
|
|
ignored while searching for a split in each node. In the case of
|
|
classification, splits are also ignored if they would result in any
|
|
single class carrying a negative weight in either child node.
|
|
|
|
Returns
|
|
-------
|
|
self : object
|
|
|
|
"""
|
|
self.fit_transform(X, y, sample_weight=sample_weight)
|
|
return self
|
|
|
|
def fit_transform(self, X, y=None, sample_weight=None):
|
|
"""
|
|
Fit estimator and transform dataset.
|
|
|
|
Parameters
|
|
----------
|
|
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
|
Input data used to build forests. Use ``dtype=np.float32`` for
|
|
maximum efficiency.
|
|
|
|
y : Ignored
|
|
Not used, present for API consistency by convention.
|
|
|
|
sample_weight : array-like of shape (n_samples,), default=None
|
|
Sample weights. If None, then samples are equally weighted. Splits
|
|
that would create child nodes with net zero or negative weight are
|
|
ignored while searching for a split in each node. In the case of
|
|
classification, splits are also ignored if they would result in any
|
|
single class carrying a negative weight in either child node.
|
|
|
|
Returns
|
|
-------
|
|
X_transformed : sparse matrix of shape (n_samples, n_out)
|
|
Transformed dataset.
|
|
"""
|
|
X = check_array(X, accept_sparse=['csc'])
|
|
if issparse(X):
|
|
# Pre-sort indices to avoid that each individual tree of the
|
|
# ensemble sorts the indices.
|
|
X.sort_indices()
|
|
|
|
rnd = check_random_state(self.random_state)
|
|
y = rnd.uniform(size=X.shape[0])
|
|
super().fit(X, y, sample_weight=sample_weight)
|
|
|
|
self.one_hot_encoder_ = OneHotEncoder(sparse=self.sparse_output)
|
|
return self.one_hot_encoder_.fit_transform(self.apply(X))
|
|
|
|
def transform(self, X):
|
|
"""
|
|
Transform dataset.
|
|
|
|
Parameters
|
|
----------
|
|
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
|
Input data to be transformed. Use ``dtype=np.float32`` for maximum
|
|
efficiency. Sparse matrices are also supported, use sparse
|
|
``csr_matrix`` for maximum efficiency.
|
|
|
|
Returns
|
|
-------
|
|
X_transformed : sparse matrix of shape (n_samples, n_out)
|
|
Transformed dataset.
|
|
"""
|
|
check_is_fitted(self)
|
|
return self.one_hot_encoder_.transform(self.apply(X))
|