Uploaded Test files

Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions

@@ -0,0 +1,26 @@
"""The :mod:`sklearn.inspection` module includes tools for model inspection."""
# TODO: remove me in 0.24 (as well as the noqa markers) and
# import the partial_dependence func directly from the
# ._partial_dependence module instead.
# Pre-cache the import of the deprecated module so that import
# sklearn.inspection.partial_dependence returns the function as in
# 0.21, instead of the module
# https://github.com/scikit-learn/scikit-learn/issues/15842
import warnings
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning)
from .partial_dependence import partial_dependence
from ._permutation_importance import permutation_importance # noqa
from ._plot.partial_dependence import plot_partial_dependence # noqa
from ._plot.partial_dependence import PartialDependenceDisplay # noqa
__all__ = [
'partial_dependence',
'plot_partial_dependence',
'permutation_importance',
'PartialDependenceDisplay'
]
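
A quick usage sketch of the public API re-exported above (illustrative only: the classifier and synthetic data are examples, not part of this commit):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import partial_dependence, permutation_importance

X, y = make_classification(n_samples=100, random_state=0)
clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)

# Average model response over a grid of values for feature 0
avg_preds, values = partial_dependence(clf, X, features=[0])

# Mean drop in score over 3 shuffles of each feature
result = permutation_importance(clf, X, y, n_repeats=3, random_state=0)
print(avg_preds.shape, result.importances_mean.shape)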

@@ -0,0 +1,421 @@
"""Partial dependence plots for regression and classification models."""
# Authors: Peter Prettenhofer
# Trevor Stephens
# Nicolas Hug
# License: BSD 3 clause
from collections.abc import Iterable
import numpy as np
from scipy import sparse
from scipy.stats.mstats import mquantiles
from ..base import is_classifier, is_regressor
from ..pipeline import Pipeline
from ..utils.extmath import cartesian
from ..utils import check_array
from ..utils import check_matplotlib_support # noqa
from ..utils import _safe_indexing
from ..utils import _determine_key_type
from ..utils import _get_column_indices
from ..utils.validation import check_is_fitted
from ..utils.validation import _deprecate_positional_args
from ..tree import DecisionTreeRegressor
from ..ensemble import RandomForestRegressor
from ..exceptions import NotFittedError
from ..ensemble._gb import BaseGradientBoosting
from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import (
BaseHistGradientBoosting)
__all__ = [
'partial_dependence',
]
def _grid_from_X(X, percentiles, grid_resolution):
"""Generate a grid of points based on the percentiles of X.
The grid is a cartesian product between the columns of ``values``. The
ith column of ``values`` consists of ``grid_resolution`` equally-spaced
points between the percentiles of the ith column of X.
If ``grid_resolution`` is bigger than the number of unique values in the
ith column of X, then those unique values will be used instead.
Parameters
----------
X : ndarray, shape (n_samples, n_target_features)
The data
percentiles : tuple of floats
The percentiles which are used to construct the extreme values of
the grid. Must be in [0, 1].
grid_resolution : int
The number of equally spaced points to be placed on the grid for each
feature.
Returns
-------
grid : ndarray, shape (n_points, n_target_features)
A value for each feature at each point in the grid. ``n_points`` is
always ``<= grid_resolution ** X.shape[1]``.
values : list of 1d ndarrays
The values with which the grid has been created. The size of each
array ``values[j]`` is either ``grid_resolution``, or the number of
unique values in ``X[:, j]``, whichever is smaller.
"""
if not isinstance(percentiles, Iterable) or len(percentiles) != 2:
raise ValueError("'percentiles' must be a sequence of 2 elements.")
if not all(0 <= x <= 1 for x in percentiles):
raise ValueError("'percentiles' values must be in [0, 1].")
if percentiles[0] >= percentiles[1]:
raise ValueError('percentiles[0] must be strictly less '
'than percentiles[1].')
if grid_resolution <= 1:
raise ValueError("'grid_resolution' must be strictly greater than 1.")
values = []
for feature in range(X.shape[1]):
uniques = np.unique(_safe_indexing(X, feature, axis=1))
if uniques.shape[0] < grid_resolution:
# feature has low resolution: use its unique values directly
axis = uniques
else:
# create axis based on percentiles and grid resolution
emp_percentiles = mquantiles(
_safe_indexing(X, feature, axis=1), prob=percentiles, axis=0
)
if np.allclose(emp_percentiles[0], emp_percentiles[1]):
raise ValueError(
'percentiles are too close to each other, '
'unable to build the grid. Please choose percentiles '
'that are further apart.')
axis = np.linspace(emp_percentiles[0],
emp_percentiles[1],
num=grid_resolution, endpoint=True)
values.append(axis)
return cartesian(values), values
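# Illustrative sketch (not part of the module): with 4 unique values and
# grid_resolution=3, the percentile branch above is taken and the axis is
# a linspace between the requested percentiles:
#
#     >>> import numpy as np
#     >>> X = np.array([[0.], [1.], [2.], [3.]])
#     >>> grid, values = _grid_from_X(X, percentiles=(0, 1), grid_resolution=3)
#     >>> values[0]
#     array([0. , 1.5, 3. ])
#     >>> grid.shape  # cartesian product over a single feature
#     (3, 1)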
def _partial_dependence_recursion(est, grid, features):
averaged_predictions = est._compute_partial_dependence_recursion(grid,
features)
if averaged_predictions.ndim == 1:
# reshape to (1, n_points) for consistency with
# _partial_dependence_brute
averaged_predictions = averaged_predictions.reshape(1, -1)
return averaged_predictions
def _partial_dependence_brute(est, grid, features, X, response_method):
averaged_predictions = []
# define the prediction_method (predict, predict_proba, decision_function).
if is_regressor(est):
prediction_method = est.predict
else:
predict_proba = getattr(est, 'predict_proba', None)
decision_function = getattr(est, 'decision_function', None)
if response_method == 'auto':
# try predict_proba, then decision_function if it doesn't exist
prediction_method = predict_proba or decision_function
else:
prediction_method = (predict_proba if response_method ==
'predict_proba' else decision_function)
if prediction_method is None:
if response_method == 'auto':
raise ValueError(
'The estimator has no predict_proba and no '
'decision_function method.'
)
elif response_method == 'predict_proba':
raise ValueError('The estimator has no predict_proba method.')
else:
raise ValueError(
'The estimator has no decision_function method.')
for new_values in grid:
X_eval = X.copy()
for i, variable in enumerate(features):
if hasattr(X_eval, 'iloc'):
X_eval.iloc[:, variable] = new_values[i]
else:
X_eval[:, variable] = new_values[i]
try:
predictions = prediction_method(X_eval)
except NotFittedError:
raise ValueError(
"'estimator' parameter must be a fitted estimator")
# Note: predictions is of shape
# (n_points,) for non-multioutput regressors
# (n_points, n_tasks) for multioutput regressors
# (n_points, 1) for the regressors in cross_decomposition (I think)
# (n_points, 2) for binary classification
# (n_points, n_classes) for multiclass classification
# average over samples
averaged_predictions.append(np.mean(predictions, axis=0))
# reshape to (n_targets, n_points) where n_targets is:
# - 1 for non-multioutput regression and binary classification (shape is
# already correct in those cases)
# - n_tasks for multi-output regression
# - n_classes for multiclass classification.
averaged_predictions = np.array(averaged_predictions).T
if is_regressor(est) and averaged_predictions.ndim == 1:
# non-multioutput regression, shape is (n_points,)
averaged_predictions = averaged_predictions.reshape(1, -1)
elif is_classifier(est) and averaged_predictions.shape[0] == 2:
# Binary classification, shape is (2, n_points).
# we output the effect of **positive** class
averaged_predictions = averaged_predictions[1]
averaged_predictions = averaged_predictions.reshape(1, -1)
return averaged_predictions
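# Illustrative summary (not part of the module): for target features S and a
# grid point v, the loop above computes the brute-force estimate
#
#     pd(v) = (1 / n_samples) * sum_i f(x_i with x_i[S] set to v)
#
# i.e. the features in S are overridden for every sample and the model's
# responses are averaged, one grid point at a time.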
@_deprecate_positional_args
def partial_dependence(estimator, X, features, *, response_method='auto',
percentiles=(0.05, 0.95), grid_resolution=100,
method='auto'):
"""Partial dependence of ``features``.
Partial dependence of a feature (or a set of features) corresponds to
the average response of an estimator for each possible value of the
feature.
Read more in the :ref:`User Guide <partial_dependence>`.
.. warning::
For :class:`~sklearn.ensemble.GradientBoostingClassifier` and
:class:`~sklearn.ensemble.GradientBoostingRegressor`, the
'recursion' method (used by default) will not account for the `init`
predictor of the boosting process. In practice, this will produce
the same values as 'brute' up to a constant offset in the target
response, provided that `init` is a constant estimator (which is the
default). However, if `init` is not a constant estimator, the
partial dependence values are incorrect for 'recursion' because the
offset will be sample-dependent. It is preferable to use the 'brute'
method. Note that this only applies to
:class:`~sklearn.ensemble.GradientBoostingClassifier` and
:class:`~sklearn.ensemble.GradientBoostingRegressor`, not to
:class:`~sklearn.ensemble.HistGradientBoostingClassifier` and
:class:`~sklearn.ensemble.HistGradientBoostingRegressor`.
Parameters
----------
estimator : BaseEstimator
A fitted estimator object implementing :term:`predict`,
:term:`predict_proba`, or :term:`decision_function`.
Multioutput-multiclass classifiers are not supported.
X : {array-like or dataframe} of shape (n_samples, n_features)
``X`` is used to generate a grid of values for the target
``features`` (where the partial dependence will be evaluated), and
also to generate values for the complement features when the
`method` is 'brute'.
features : array-like of {int, str}
The feature (e.g. `[0]`) or pair of interacting features
(e.g. `[(0, 1)]`) for which the partial dependency should be computed.
response_method : 'auto', 'predict_proba' or 'decision_function', \
optional (default='auto')
Specifies whether to use :term:`predict_proba` or
:term:`decision_function` as the target response. For regressors
this parameter is ignored and the response is always the output of
:term:`predict`. By default, :term:`predict_proba` is tried first
and we revert to :term:`decision_function` if it doesn't exist. If
``method`` is 'recursion', the response is always the output of
:term:`decision_function`.
percentiles : tuple of float, optional (default=(0.05, 0.95))
The lower and upper percentile used to create the extreme values
for the grid. Must be in [0, 1].
grid_resolution : int, optional (default=100)
The number of equally spaced points on the grid, for each target
feature.
method : str, optional (default='auto')
The method used to calculate the averaged predictions:
- 'recursion' is only supported for some tree-based estimators (namely
:class:`~sklearn.ensemble.GradientBoostingClassifier`,
:class:`~sklearn.ensemble.GradientBoostingRegressor`,
:class:`~sklearn.ensemble.HistGradientBoostingClassifier`,
:class:`~sklearn.ensemble.HistGradientBoostingRegressor`,
:class:`~sklearn.tree.DecisionTreeRegressor`,
:class:`~sklearn.ensemble.RandomForestRegressor`)
but is more efficient in terms of speed.
With this method, the target response of a
classifier is always the decision function, not the predicted
probabilities.
- 'brute' is supported for any estimator, but is more
computationally intensive.
- 'auto': the 'recursion' is used for estimators that support it,
and 'brute' is used otherwise.
Please see :ref:`this note <pdp_method_differences>` for
differences between the 'brute' and 'recursion' method.
Returns
-------
averaged_predictions : ndarray, \
shape (n_outputs, len(values[0]), len(values[1]), ...)
The predictions for all the points in the grid, averaged over all
samples in X (or over the training data if ``method`` is
'recursion'). ``n_outputs`` corresponds to the number of classes in
a multi-class setting, or to the number of tasks for multi-output
regression. For classical regression and binary classification
``n_outputs==1``. ``n_values_feature_j`` corresponds to the size of
``values[j]``.
values : seq of 1d ndarrays
The values with which the grid has been created. The generated grid
is a cartesian product of the arrays in ``values``. ``len(values) ==
len(features)``. The size of each array ``values[j]`` is either
``grid_resolution``, or the number of unique values in ``X[:, j]``,
whichever is smaller.
Examples
--------
>>> X = [[0, 0, 2], [1, 0, 0]]
>>> y = [0, 1]
>>> from sklearn.ensemble import GradientBoostingClassifier
>>> gb = GradientBoostingClassifier(random_state=0).fit(X, y)
>>> partial_dependence(gb, features=[0], X=X, percentiles=(0, 1),
... grid_resolution=2) # doctest: +SKIP
(array([[-4.52..., 4.52...]]), [array([ 0., 1.])])
See also
--------
sklearn.inspection.plot_partial_dependence: Plot partial dependence
"""
if not (is_classifier(estimator) or is_regressor(estimator)):
raise ValueError(
"'estimator' must be a fitted regressor or classifier."
)
if isinstance(estimator, Pipeline):
# TODO: to be removed if/when Pipeline gets a `steps_` attribute
# assuming Pipeline is the only estimator that does not store a new
# attribute
for est in estimator:
# FIXME: remove the None option once it has been deprecated
if est not in (None, 'drop'):
check_is_fitted(est)
else:
check_is_fitted(estimator)
if (is_classifier(estimator) and
isinstance(estimator.classes_[0], np.ndarray)):
raise ValueError(
'Multiclass-multioutput estimators are not supported'
)
# Use check_array only on lists and other non-array-likes / sparse. Do not
# convert DataFrame into a NumPy array.
if not(hasattr(X, '__array__') or sparse.issparse(X)):
X = check_array(X, force_all_finite='allow-nan', dtype=np.object)
accepted_responses = ('auto', 'predict_proba', 'decision_function')
if response_method not in accepted_responses:
raise ValueError(
'response_method {} is invalid. Accepted response_method names '
'are {}.'.format(response_method, ', '.join(accepted_responses)))
if is_regressor(estimator) and response_method != 'auto':
raise ValueError(
"The response_method parameter is ignored for regressors and "
"must be 'auto'."
)
accepted_methods = ('brute', 'recursion', 'auto')
if method not in accepted_methods:
raise ValueError(
'method {} is invalid. Accepted method names are {}.'.format(
method, ', '.join(accepted_methods)))
if method == 'auto':
if (isinstance(estimator, BaseGradientBoosting) and
estimator.init is None):
method = 'recursion'
elif isinstance(estimator, (BaseHistGradientBoosting,
DecisionTreeRegressor,
RandomForestRegressor)):
method = 'recursion'
else:
method = 'brute'
if method == 'recursion':
if not isinstance(estimator,
(BaseGradientBoosting, BaseHistGradientBoosting,
DecisionTreeRegressor, RandomForestRegressor)):
supported_classes_recursion = (
'GradientBoostingClassifier',
'GradientBoostingRegressor',
'HistGradientBoostingClassifier',
'HistGradientBoostingRegressor',
'DecisionTreeRegressor',
'RandomForestRegressor',
)
raise ValueError(
"Only the following estimators support the 'recursion' "
"method: {}. Try using method='brute'."
.format(', '.join(supported_classes_recursion)))
if response_method == 'auto':
response_method = 'decision_function'
if response_method != 'decision_function':
raise ValueError(
"With the 'recursion' method, the response_method must be "
"'decision_function'. Got {}.".format(response_method)
)
if _determine_key_type(features, accept_slice=False) == 'int':
# _get_column_indices() supports negative indexing. Here, we limit
# the indexing to be positive. The upper bound will be checked
# by _get_column_indices()
if np.any(np.less(features, 0)):
raise ValueError(
'all features must be in [0, {}]'.format(X.shape[1] - 1)
)
features_indices = np.asarray(
_get_column_indices(X, features), dtype=np.int32, order='C'
).ravel()
grid, values = _grid_from_X(
_safe_indexing(X, features_indices, axis=1), percentiles,
grid_resolution
)
if method == 'brute':
averaged_predictions = _partial_dependence_brute(
estimator, grid, features_indices, X, response_method
)
else:
averaged_predictions = _partial_dependence_recursion(
estimator, grid, features_indices
)
# reshape averaged_predictions to
# (n_outputs, n_values_feature_0, n_values_feature_1, ...)
averaged_predictions = averaged_predictions.reshape(
-1, *[val.shape[0] for val in values])
return averaged_predictions, values
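
A short usage sketch of `partial_dependence` (illustrative; the estimator and data are examples, and the `(averaged_predictions, values)` return matches this version of the code):

from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import partial_dependence

X, y = make_regression(n_samples=100, n_features=3, random_state=0)
est = GradientBoostingRegressor(random_state=0).fit(X, y)

# method='auto' resolves to 'recursion' here because init is None (default)
avg_rec, values = partial_dependence(est, X, [0])
avg_brute, _ = partial_dependence(est, X, [0], method='brute')
print(avg_rec.shape, avg_brute.shape)  # both (1, n_grid_points)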

@@ -0,0 +1,142 @@
"""Permutation importance for estimators"""
import numpy as np
from joblib import Parallel
from joblib import delayed
from ..metrics import check_scoring
from ..utils import Bunch
from ..utils import check_random_state
from ..utils import check_array
from ..utils.validation import _deprecate_positional_args
def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
n_repeats, scorer):
"""Calculate score when `col_idx` is permuted."""
random_state = check_random_state(random_state)
# Work on a copy of X to ensure thread-safety in case of threading-based
# parallelism. Furthermore, making a copy is also useful when the joblib
# backend is 'loky' (default) or the old 'multiprocessing': in those cases,
# if X is large it will automatically be backed by a read-only memory map
# (memmap). X.copy() on the other hand is always guaranteed to return a
# writable data structure whose columns can be shuffled in place.
X_permuted = X.copy()
scores = np.zeros(n_repeats)
shuffling_idx = np.arange(X.shape[0])
for n_round in range(n_repeats):
random_state.shuffle(shuffling_idx)
if hasattr(X_permuted, "iloc"):
col = X_permuted.iloc[shuffling_idx, col_idx]
col.index = X_permuted.index
X_permuted.iloc[:, col_idx] = col
else:
X_permuted[:, col_idx] = X_permuted[shuffling_idx, col_idx]
feature_score = scorer(estimator, X_permuted, y)
scores[n_round] = feature_score
return scores
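# Illustrative sketch (not part of the module): the core of one repeat for
# one column, in plain NumPy:
#
#     idx = rng.permutation(X.shape[0])
#     X_perm = X.copy()
#     X_perm[:, col_idx] = X_perm[idx, col_idx]  # break column/target link
#     score = scorer(estimator, X_perm, y)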
@_deprecate_positional_args
def permutation_importance(estimator, X, y, *, scoring=None, n_repeats=5,
n_jobs=None, random_state=None):
"""Permutation importance for feature evaluation [BRE]_.
The :term:`estimator` is required to be a fitted estimator. `X` can be the
data set used to train the estimator or a hold-out set. The permutation
importance of a feature is calculated as follows. First, a baseline metric,
defined by :term:`scoring`, is evaluated on a (potentially different)
dataset defined by `X`. Next, a feature column from the validation set
is permuted and the metric is evaluated again. The permutation importance
is defined to be the difference between the baseline metric and the metric
from permuting the feature column.
Read more in the :ref:`User Guide <permutation_importance>`.
Parameters
----------
estimator : object
An estimator that has already been :term:`fitted` and is compatible
with :term:`scorer`.
X : ndarray or DataFrame, shape (n_samples, n_features)
Data on which permutation importance will be computed.
y : array-like or None, shape (n_samples, ) or (n_samples, n_classes)
Targets for supervised or `None` for unsupervised.
scoring : string, callable or None, default=None
Scorer to use. It can be a single
string (see :ref:`scoring_parameter`) or a callable (see
:ref:`scoring`). If None, the estimator's default scorer is used.
n_repeats : int, default=5
Number of times to permute a feature.
n_jobs : int or None, default=None
The number of jobs to use for the computation.
`None` means 1 unless in a :obj:`joblib.parallel_backend` context.
`-1` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
random_state : int, RandomState instance, default=None
Pseudo-random number generator to control the permutations of each
feature.
Pass an int to get reproducible results across function calls.
See :term:`Glossary <random_state>`.
Returns
-------
result : :class:`~sklearn.utils.Bunch`
Dictionary-like object, with the following attributes.
importances_mean : ndarray, shape (n_features, )
Mean of feature importance over `n_repeats`.
importances_std : ndarray, shape (n_features, )
Standard deviation over `n_repeats`.
importances : ndarray, shape (n_features, n_repeats)
Raw permutation importance scores.
References
----------
.. [BRE] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32,
2001. https://doi.org/10.1023/A:1010933404324
Examples
--------
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.inspection import permutation_importance
>>> X = [[1, 9, 9],[1, 9, 9],[1, 9, 9],
... [0, 9, 9],[0, 9, 9],[0, 9, 9]]
>>> y = [1, 1, 1, 0, 0, 0]
>>> clf = LogisticRegression().fit(X, y)
>>> result = permutation_importance(clf, X, y, n_repeats=10,
... random_state=0)
>>> result.importances_mean
array([0.4666..., 0. , 0. ])
>>> result.importances_std
array([0.2211..., 0. , 0. ])
"""
if not hasattr(X, "iloc"):
X = check_array(X, force_all_finite='allow-nan', dtype=None)
# Precompute random seed from the random state to be used
# to get a fresh independent RandomState instance for each
# parallel call to _calculate_permutation_scores, irrespective of
# the fact that variables are shared or not depending on the active
# joblib backend (sequential, thread-based or process-based).
random_state = check_random_state(random_state)
random_seed = random_state.randint(np.iinfo(np.int32).max + 1)
scorer = check_scoring(estimator, scoring=scoring)
baseline_score = scorer(estimator, X, y)
scores = Parallel(n_jobs=n_jobs)(delayed(_calculate_permutation_scores)(
estimator, X, y, col_idx, random_seed, n_repeats, scorer
) for col_idx in range(X.shape[1]))
importances = baseline_score - np.array(scores)
return Bunch(importances_mean=np.mean(importances, axis=1),
importances_std=np.std(importances, axis=1),
importances=importances)
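
A usage sketch of the returned `Bunch` (illustrative; the model, data and the two-sigma cutoff are examples, not part of this commit): rank features by mean importance and keep those clearly above zero:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
clf = LogisticRegression().fit(X, y)
r = permutation_importance(clf, X, y, n_repeats=10, random_state=0)
for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        print("feature %d: %.3f +/- %.3f"
              % (i, r.importances_mean[i], r.importances_std[i]))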

@@ -0,0 +1,593 @@
import numbers
from itertools import chain
from itertools import count
import warnings
import numpy as np
from scipy import sparse
from scipy.stats.mstats import mquantiles
from joblib import Parallel, delayed
from .. import partial_dependence
from ...base import is_regressor
from ...utils import check_array
from ...utils import check_matplotlib_support # noqa
from ...utils import _safe_indexing
from ...utils.validation import _deprecate_positional_args
@_deprecate_positional_args
def plot_partial_dependence(estimator, X, features, *, feature_names=None,
target=None, response_method='auto', n_cols=3,
grid_resolution=100, percentiles=(0.05, 0.95),
method='auto', n_jobs=None, verbose=0, fig=None,
line_kw=None, contour_kw=None, ax=None):
"""Partial dependence plots.
The ``len(features)`` plots are arranged in a grid with ``n_cols``
columns. Two-way partial dependence plots are plotted as contour plots. The
deciles of the feature values will be shown with tick marks on the x-axes
for one-way plots, and on both axes for two-way plots.
Read more in the :ref:`User Guide <partial_dependence>`.
.. note::
:func:`plot_partial_dependence` does not support using the same axes
with multiple calls. To plot the partial dependence for multiple
estimators, please pass the axes created by the first call to the
second call::
>>> from sklearn.inspection import plot_partial_dependence
>>> from sklearn.datasets import make_friedman1
>>> from sklearn.linear_model import LinearRegression
>>> X, y = make_friedman1()
>>> est = LinearRegression().fit(X, y)
>>> disp1 = plot_partial_dependence(est, X) # doctest: +SKIP
>>> disp2 = plot_partial_dependence(est, X,
... ax=disp1.axes_) # doctest: +SKIP
.. warning::
For :class:`~sklearn.ensemble.GradientBoostingClassifier` and
:class:`~sklearn.ensemble.GradientBoostingRegressor`, the
'recursion' method (used by default) will not account for the `init`
predictor of the boosting process. In practice, this will produce
the same values as 'brute' up to a constant offset in the target
response, provided that `init` is a constant estimator (which is the
default). However, if `init` is not a constant estimator, the
partial dependence values are incorrect for 'recursion' because the
offset will be sample-dependent. It is preferable to use the 'brute'
method. Note that this only applies to
:class:`~sklearn.ensemble.GradientBoostingClassifier` and
:class:`~sklearn.ensemble.GradientBoostingRegressor`, not to
:class:`~sklearn.ensemble.HistGradientBoostingClassifier` and
:class:`~sklearn.ensemble.HistGradientBoostingRegressor`.
Parameters
----------
estimator : BaseEstimator
A fitted estimator object implementing :term:`predict`,
:term:`predict_proba`, or :term:`decision_function`.
Multioutput-multiclass classifiers are not supported.
X : {array-like or dataframe} of shape (n_samples, n_features)
``X`` is used to generate a grid of values for the target
``features`` (where the partial dependence will be evaluated), and
also to generate values for the complement features when the
`method` is 'brute'.
features : list of {int, str, pair of int, pair of str}
The target features for which to create the PDPs.
If features[i] is an int or a string, a one-way PDP is created; if
features[i] is a tuple, a two-way PDP is created. Each tuple must be
of size 2.
If any entry is a string, then it must be in ``feature_names``.
feature_names : array-like of shape (n_features,), dtype=str, default=None
Name of each feature; feature_names[i] holds the name of the feature
with index i.
By default, the name of a feature corresponds to its numerical
index for a NumPy array and to its column name for a pandas dataframe.
target : int, optional (default=None)
- In a multiclass setting, specifies the class for which the PDPs
should be computed. Note that for binary classification, the
positive class (index 1) is always used.
- In a multioutput setting, specifies the task for which the PDPs
should be computed.
Ignored in binary classification or classical regression settings.
response_method : 'auto', 'predict_proba' or 'decision_function', \
optional (default='auto')
Specifies whether to use :term:`predict_proba` or
:term:`decision_function` as the target response. For regressors
this parameter is ignored and the response is always the output of
:term:`predict`. By default, :term:`predict_proba` is tried first
and we revert to :term:`decision_function` if it doesn't exist. If
``method`` is 'recursion', the response is always the output of
:term:`decision_function`.
n_cols : int, optional (default=3)
The maximum number of columns in the grid plot. Only active when `ax`
is a single axis or `None`.
grid_resolution : int, optional (default=100)
The number of equally spaced points on the axes of the plots, for each
target feature.
percentiles : tuple of float, optional (default=(0.05, 0.95))
The lower and upper percentile used to create the extreme values
for the PDP axes. Must be in [0, 1].
method : str, optional (default='auto')
The method used to calculate the averaged predictions:
- 'recursion' is only supported for some tree-based estimators (namely
:class:`~sklearn.ensemble.GradientBoostingClassifier`,
:class:`~sklearn.ensemble.GradientBoostingRegressor`,
:class:`~sklearn.ensemble.HistGradientBoostingClassifier`,
:class:`~sklearn.ensemble.HistGradientBoostingRegressor`,
:class:`~sklearn.tree.DecisionTreeRegressor`,
:class:`~sklearn.ensemble.RandomForestRegressor`)
but is more efficient in terms of speed.
With this method, the target response of a
classifier is always the decision function, not the predicted
probabilities.
- 'brute' is supported for any estimator, but is more
computationally intensive.
- 'auto': the 'recursion' is used for estimators that support it,
and 'brute' is used otherwise.
Please see :ref:`this note <pdp_method_differences>` for
differences between the 'brute' and 'recursion' method.
n_jobs : int, optional (default=None)
The number of CPUs to use to compute the partial dependences.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
verbose : int, optional (default=0)
Verbose output during PD computations.
fig : Matplotlib figure object, optional (default=None)
A figure object onto which the plots will be drawn, after the figure
has been cleared. By default, a new one is created.
.. deprecated:: 0.22
``fig`` will be removed in 0.24.
line_kw : dict, optional
Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.
For one-way partial dependence plots.
contour_kw : dict, optional
Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call.
For two-way partial dependence plots.
ax : Matplotlib axes or array-like of Matplotlib axes, default=None
- If a single axis is passed in, it is treated as a bounding axes
and a grid of partial dependence plots will be drawn within
these bounds. The `n_cols` parameter controls the number of
columns in the grid.
- If an array-like of axes are passed in, the partial dependence
plots will be drawn directly into these axes.
- If `None`, a figure and a bounding axes is created and treated
as the single axes case.
.. versionadded:: 0.22
Returns
-------
display: :class:`~sklearn.inspection.PartialDependenceDisplay`
Examples
--------
>>> from sklearn.datasets import make_friedman1
>>> from sklearn.ensemble import GradientBoostingRegressor
>>> X, y = make_friedman1()
>>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)
>>> plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP
See also
--------
sklearn.inspection.partial_dependence: Return raw partial
dependence values
"""
check_matplotlib_support('plot_partial_dependence') # noqa
import matplotlib.pyplot as plt # noqa
from matplotlib import transforms # noqa
from matplotlib.ticker import MaxNLocator # noqa
from matplotlib.ticker import ScalarFormatter # noqa
# set target_idx for multi-class estimators
if hasattr(estimator, 'classes_') and np.size(estimator.classes_) > 2:
if target is None:
raise ValueError('target must be specified for multi-class')
target_idx = np.searchsorted(estimator.classes_, target)
if (not (0 <= target_idx < len(estimator.classes_)) or
estimator.classes_[target_idx] != target):
raise ValueError('target not in est.classes_, got {}'.format(
target))
else:
# regression and binary classification
target_idx = 0
# Use check_array only on lists and other non-array-likes / sparse. Do not
# convert DataFrame into a NumPy array.
if not(hasattr(X, '__array__') or sparse.issparse(X)):
X = check_array(X, force_all_finite='allow-nan', dtype=np.object)
n_features = X.shape[1]
# convert feature_names to list
if feature_names is None:
if hasattr(X, "loc"):
# get the column names for a pandas dataframe
feature_names = X.columns.tolist()
else:
# define a list of numbered indices for a numpy array
feature_names = [str(i) for i in range(n_features)]
elif hasattr(feature_names, "tolist"):
# convert numpy array or pandas index to a list
feature_names = feature_names.tolist()
if len(set(feature_names)) != len(feature_names):
raise ValueError('feature_names should not contain duplicates.')
def convert_feature(fx):
if isinstance(fx, str):
try:
fx = feature_names.index(fx)
except ValueError:
raise ValueError('Feature %s not in feature_names' % fx)
return int(fx)
# convert features into a seq of int tuples
tmp_features = []
for fxs in features:
if isinstance(fxs, (numbers.Integral, str)):
fxs = (fxs,)
try:
fxs = tuple(convert_feature(fx) for fx in fxs)
except TypeError:
raise ValueError('Each entry in features must be either an int, '
'a string, or an iterable of size at most 2.')
if not 1 <= np.size(fxs) <= 2:
raise ValueError('Each entry in features must be either an int, '
'a string, or an iterable of size at most 2.')
tmp_features.append(fxs)
features = tmp_features
# Early exit if the axes does not have the correct number of axes
if ax is not None and not isinstance(ax, plt.Axes):
axes = np.asarray(ax, dtype=object)
if axes.size != len(features):
raise ValueError("Expected ax to have {} axes, got {}".format(
len(features), axes.size))
for i in chain.from_iterable(features):
if i >= len(feature_names):
raise ValueError('All entries of features must be less than '
'len(feature_names) = {0}, got {1}.'
.format(len(feature_names), i))
# compute averaged predictions
pd_results = Parallel(n_jobs=n_jobs, verbose=verbose)(
delayed(partial_dependence)(estimator, X, fxs,
response_method=response_method,
method=method,
grid_resolution=grid_resolution,
percentiles=percentiles)
for fxs in features)
# For multioutput regression, we can only check the validity of target
# now that we have the predictions.
# Also note: as multiclass-multioutput classifiers are not supported,
# multiclass and multioutput scenario are mutually exclusive. So there is
# no risk of overwriting target_idx here.
avg_preds, _ = pd_results[0] # checking the first result is enough
if is_regressor(estimator) and avg_preds.shape[0] > 1:
if target is None:
raise ValueError(
'target must be specified for multi-output regressors')
if not 0 <= target <= avg_preds.shape[0]:
raise ValueError(
'target must be in [0, n_tasks], got {}.'.format(target))
target_idx = target
# get global min and max average predictions of PD grouped by plot type
pdp_lim = {}
for avg_preds, values in pd_results:
min_pd = avg_preds[target_idx].min()
max_pd = avg_preds[target_idx].max()
n_fx = len(values)
old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd))
min_pd = min(min_pd, old_min_pd)
max_pd = max(max_pd, old_max_pd)
pdp_lim[n_fx] = (min_pd, max_pd)
deciles = {}
for fx in chain.from_iterable(features):
if fx not in deciles:
X_col = _safe_indexing(X, fx, axis=1)
deciles[fx] = mquantiles(X_col, prob=np.arange(0.1, 1.0, 0.1))
if fig is not None:
warnings.warn("The fig parameter is deprecated in version "
"0.22 and will be removed in version 0.24",
FutureWarning)
fig.clear()
ax = fig.gca()
display = PartialDependenceDisplay(pd_results=pd_results,
features=features,
feature_names=feature_names,
target_idx=target_idx,
pdp_lim=pdp_lim,
deciles=deciles)
return display.plot(ax=ax, n_cols=n_cols, line_kw=line_kw,
contour_kw=contour_kw)
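# Usage sketch (illustrative; `est`, `other_est` and `X` stand for fitted
# estimators and data): draw the plots once, then reuse the created axes
# for a second estimator, as recommended in the docstring note above:
#
#     disp = plot_partial_dependence(est, X, [0, (0, 1)])
#     plot_partial_dependence(other_est, X, [0, (0, 1)], ax=disp.axes_)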
class PartialDependenceDisplay:
"""Partial Dependence Plot (PDP) visualization.
It is recommended to use
:func:`~sklearn.inspection.plot_partial_dependence` to create a
:class:`~sklearn.inspection.PartialDependenceDisplay`. All parameters are
stored as attributes.
Read more in
:ref:`sphx_glr_auto_examples_miscellaneous_plot_partial_dependence_visualization_api.py`
and the :ref:`User Guide <visualizations>`.
.. versionadded:: 0.22
Parameters
----------
pd_results : list of (ndarray, ndarray)
Results of :func:`~sklearn.inspection.partial_dependence` for
``features``. Each tuple corresponds to a (averaged_predictions, grid).
features : list of (int,) or list of (int, int)
Indices of features for a given plot. A tuple of one integer will plot
a partial dependence curve of one feature. A tuple of two integers will
plot a two-way partial dependence curve as a contour plot.
feature_names : list of str
Feature names corresponding to the indices in ``features``.
target_idx : int
- In a multiclass setting, specifies the class for which the PDPs
should be computed. Note that for binary classification, the
positive class (index 1) is always used.
- In a multioutput setting, specifies the task for which the PDPs
should be computed.
Ignored in binary classification or classical regression settings.
pdp_lim : dict
Global min and max average predictions, such that all plots will have
the same scale and y limits. `pdp_lim[1]` is the global min and max for
single partial dependence curves. `pdp_lim[2]` is the global min and
max for two-way partial dependence curves.
deciles : dict
Deciles for feature indices in ``features``.
Attributes
----------
bounding_ax_ : matplotlib Axes or None
If `ax` is an axes or None, the `bounding_ax_` is the axes where the
grid of partial dependence plots are drawn. If `ax` is a list of axes
or a numpy array of axes, `bounding_ax_` is None.
axes_ : ndarray of matplotlib Axes
If `ax` is an axes or None, `axes_[i, j]` is the axes on the i-th row
and j-th column. If `ax` is a list of axes, `axes_[i]` is the i-th item
in `ax`. Elements that are None correspond to a nonexisting axes in
that position.
lines_ : ndarray of matplotlib Artists
If `ax` is an axes or None, `lines_[i, j]` is the partial dependence
curve on the i-th row and j-th column. If `ax` is a list of axes,
`lines_[i]` is the partial dependence curve corresponding to the i-th
item in `ax`. Elements that are None correspond to a nonexisting axes
or an axes that does not include a line plot.
deciles_vlines_ : ndarray of matplotlib LineCollection
If `ax` is an axes or None, `vlines_[i, j]` is the line collection
representing the x axis deciles of the i-th row and j-th column. If
`ax` is a list of axes, `vlines_[i]` corresponds to the i-th item in
`ax`. Elements that are None correspond to a nonexisting axes or an
axes that does not include a PDP plot.
.. versionadded:: 0.23
deciles_hlines_ : ndarray of matplotlib LineCollection
If `ax` is an axes or None, `vlines_[i, j]` is the line collection
representing the y axis deciles of the i-th row and j-th column. If
`ax` is a list of axes, `vlines_[i]` corresponds to the i-th item in
`ax`. Elements that are None correspond to a nonexisting axes or an
axes that does not include a 2-way plot.
.. versionadded:: 0.23
contours_ : ndarray of matplotlib Artists
If `ax` is an axes or None, `contours_[i, j]` is the partial dependence
plot on the i-th row and j-th column. If `ax` is a list of axes,
`contours_[i]` is the partial dependence plot corresponding to the i-th
item in `ax`. Elements that are None correspond to a nonexisting axes
or an axes that does not include a contour plot.
figure_ : matplotlib Figure
Figure containing partial dependence plots.
"""
@_deprecate_positional_args
def __init__(self, pd_results, *, features, feature_names, target_idx,
pdp_lim, deciles):
self.pd_results = pd_results
self.features = features
self.feature_names = feature_names
self.target_idx = target_idx
self.pdp_lim = pdp_lim
self.deciles = deciles
def plot(self, ax=None, n_cols=3, line_kw=None, contour_kw=None):
"""Plot partial dependence plots.
Parameters
----------
ax : Matplotlib axes or array-like of Matplotlib axes, default=None
- If a single axis is passed in, it is treated as a bounding axes
and a grid of partial dependence plots will be drawn within
these bounds. The `n_cols` parameter controls the number of
columns in the grid.
- If an array-like of axes are passed in, the partial dependence
plots will be drawn directly into these axes.
- If `None`, a figure and a bounding axes is created and treated
as the single axes case.
n_cols : int, default=3
The maximum number of columns in the grid plot. Only active when
`ax` is a single axes or `None`.
line_kw : dict, default=None
Dict with keywords passed to the `matplotlib.pyplot.plot` call.
For one-way partial dependence plots.
contour_kw : dict, default=None
Dict with keywords passed to the `matplotlib.pyplot.contourf`
call for two-way partial dependence plots.
Returns
-------
display: :class:`~sklearn.inspection.PartialDependenceDisplay`
"""
check_matplotlib_support("plot_partial_dependence")
import matplotlib.pyplot as plt # noqa
from matplotlib import transforms # noqa
from matplotlib.ticker import MaxNLocator # noqa
from matplotlib.ticker import ScalarFormatter # noqa
from matplotlib.gridspec import GridSpecFromSubplotSpec # noqa
if line_kw is None:
line_kw = {}
if contour_kw is None:
contour_kw = {}
if ax is None:
_, ax = plt.subplots()
default_contour_kws = {"alpha": 0.75}
contour_kw = {**default_contour_kws, **contour_kw}
n_features = len(self.features)
if isinstance(ax, plt.Axes):
# If the axes are already off, they have most likely been turned off
# by a previous call to plot.
if not ax.axison:
raise ValueError("The ax was already used in another plot "
"function, please set ax=display.axes_ "
"instead")
ax.set_axis_off()
self.bounding_ax_ = ax
self.figure_ = ax.figure
n_cols = min(n_cols, n_features)
n_rows = int(np.ceil(n_features / float(n_cols)))
self.axes_ = np.empty((n_rows, n_cols), dtype=np.object)
axes_ravel = self.axes_.ravel()
gs = GridSpecFromSubplotSpec(n_rows, n_cols,
subplot_spec=ax.get_subplotspec())
for i, spec in zip(range(n_features), gs):
axes_ravel[i] = self.figure_.add_subplot(spec)
else: # array-like
ax = np.asarray(ax, dtype=object)
if ax.size != n_features:
raise ValueError("Expected ax to have {} axes, got {}"
.format(n_features, ax.size))
if ax.ndim == 2:
n_cols = ax.shape[1]
else:
n_cols = None
self.bounding_ax_ = None
self.figure_ = ax.ravel()[0].figure
self.axes_ = ax
# create contour levels for two-way plots
if 2 in self.pdp_lim:
Z_level = np.linspace(*self.pdp_lim[2], num=8)
self.lines_ = np.empty_like(self.axes_, dtype=np.object)
self.contours_ = np.empty_like(self.axes_, dtype=np.object)
self.deciles_vlines_ = np.empty_like(self.axes_, dtype=np.object)
self.deciles_hlines_ = np.empty_like(self.axes_, dtype=np.object)
# Create 1d views of these 2d arrays for easy indexing
lines_ravel = self.lines_.ravel(order='C')
contours_ravel = self.contours_.ravel(order='C')
vlines_ravel = self.deciles_vlines_.ravel(order='C')
hlines_ravel = self.deciles_hlines_.ravel(order='C')
for i, axi, fx, (avg_preds, values) in zip(count(),
self.axes_.ravel(),
self.features,
self.pd_results):
if len(values) == 1:
lines_ravel[i] = axi.plot(values[0],
avg_preds[self.target_idx].ravel(),
**line_kw)[0]
else:
# contour plot
XX, YY = np.meshgrid(values[0], values[1])
Z = avg_preds[self.target_idx].T
CS = axi.contour(XX, YY, Z, levels=Z_level, linewidths=0.5,
colors='k')
contours_ravel[i] = axi.contourf(XX, YY, Z, levels=Z_level,
vmax=Z_level[-1],
vmin=Z_level[0],
**contour_kw)
axi.clabel(CS, fmt='%2.2f', colors='k', fontsize=10,
inline=True)
trans = transforms.blended_transform_factory(axi.transData,
axi.transAxes)
ylim = axi.get_ylim()
vlines_ravel[i] = axi.vlines(self.deciles[fx[0]], 0, 0.05,
transform=trans, color='k')
axi.set_ylim(ylim)
# Set xlabel if it is not already set
if not axi.get_xlabel():
axi.set_xlabel(self.feature_names[fx[0]])
if len(values) == 1:
if n_cols is None or i % n_cols == 0:
axi.set_ylabel('Partial dependence')
else:
axi.set_yticklabels([])
axi.set_ylim(self.pdp_lim[1])
else:
# contour plot
trans = transforms.blended_transform_factory(axi.transAxes,
axi.transData)
xlim = axi.get_xlim()
hlines_ravel[i] = axi.hlines(self.deciles[fx[1]], 0, 0.05,
transform=trans, color='k')
# hline erases xlim
axi.set_ylabel(self.feature_names[fx[1]])
axi.set_xlim(xlim)
return self
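
A brief sketch of the Display API above (illustrative, assuming matplotlib is available): the object returned by `plot_partial_dependence` stores the computed `pd_results`, so it can be re-rendered onto fresh axes without recomputing the partial dependences:

import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.inspection import plot_partial_dependence

X, y = make_regression(n_samples=100, n_features=2, random_state=0)
est = LinearRegression().fit(X, y)

disp = plot_partial_dependence(est, X, [0, 1])
fig, axs = plt.subplots(1, 2)
disp.plot(ax=axs, line_kw={"color": "red"})  # replot, no recomputation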

@@ -0,0 +1,474 @@
import numpy as np
from scipy.stats.mstats import mquantiles
import pytest
from numpy.testing import assert_allclose
from sklearn.datasets import load_boston
from sklearn.datasets import load_iris
from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.utils._testing import _convert_container
from sklearn.inspection import plot_partial_dependence
# TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved
pytestmark = pytest.mark.filterwarnings(
"ignore:In future, it will be an error for 'np.bool_':DeprecationWarning:"
"matplotlib.*")
@pytest.fixture(scope="module")
def boston():
return load_boston()
@pytest.fixture(scope="module")
def clf_boston(boston):
clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
clf.fit(boston.data, boston.target)
return clf
@pytest.mark.parametrize("grid_resolution", [10, 20])
def test_plot_partial_dependence(grid_resolution, pyplot, clf_boston, boston):
# Test partial dependence plot function.
feature_names = boston.feature_names
disp = plot_partial_dependence(clf_boston, boston.data,
[0, 1, (0, 1)],
grid_resolution=grid_resolution,
feature_names=feature_names,
contour_kw={"cmap": "jet"})
fig = pyplot.gcf()
axs = fig.get_axes()
assert disp.figure_ is fig
assert len(axs) == 4
assert disp.bounding_ax_ is not None
assert disp.axes_.shape == (1, 3)
assert disp.lines_.shape == (1, 3)
assert disp.contours_.shape == (1, 3)
assert disp.deciles_vlines_.shape == (1, 3)
assert disp.deciles_hlines_.shape == (1, 3)
assert disp.lines_[0, 2] is None
assert disp.contours_[0, 0] is None
assert disp.contours_[0, 1] is None
# deciles lines: always show on xaxis, only show on yaxis if 2-way PDP
for i in range(3):
assert disp.deciles_vlines_[0, i] is not None
assert disp.deciles_hlines_[0, 0] is None
assert disp.deciles_hlines_[0, 1] is None
assert disp.deciles_hlines_[0, 2] is not None
assert disp.features == [(0, ), (1, ), (0, 1)]
assert np.all(disp.feature_names == feature_names)
assert len(disp.deciles) == 2
for i in [0, 1]:
assert_allclose(disp.deciles[i],
mquantiles(boston.data[:, i],
prob=np.arange(0.1, 1.0, 0.1)))
single_feature_positions = [(0, 0), (0, 1)]
expected_ylabels = ["Partial dependence", ""]
for i, pos in enumerate(single_feature_positions):
ax = disp.axes_[pos]
assert ax.get_ylabel() == expected_ylabels[i]
assert ax.get_xlabel() == boston.feature_names[i]
assert_allclose(ax.get_ylim(), disp.pdp_lim[1])
line = disp.lines_[pos]
avg_preds, values = disp.pd_results[i]
assert avg_preds.shape == (1, grid_resolution)
target_idx = disp.target_idx
line_data = line.get_data()
assert_allclose(line_data[0], values[0])
assert_allclose(line_data[1], avg_preds[target_idx].ravel())
# two feature position
ax = disp.axes_[0, 2]
contour = disp.contours_[0, 2]
expected_levels = np.linspace(*disp.pdp_lim[2], num=8)
assert_allclose(contour.levels, expected_levels)
assert contour.get_cmap().name == "jet"
assert ax.get_xlabel() == boston.feature_names[0]
assert ax.get_ylabel() == boston.feature_names[1]
@pytest.mark.parametrize(
"input_type, feature_names_type",
[('dataframe', None),
('dataframe', 'list'), ('list', 'list'), ('array', 'list'),
('dataframe', 'array'), ('list', 'array'), ('array', 'array'),
('dataframe', 'series'), ('list', 'series'), ('array', 'series'),
('dataframe', 'index'), ('list', 'index'), ('array', 'index')]
)
def test_plot_partial_dependence_str_features(pyplot, clf_boston, boston,
input_type, feature_names_type):
if input_type == 'dataframe':
pd = pytest.importorskip("pandas")
X = pd.DataFrame(boston.data, columns=boston.feature_names)
elif input_type == 'list':
X = boston.data.tolist()
else:
X = boston.data
if feature_names_type is None:
feature_names = None
else:
feature_names = _convert_container(boston.feature_names,
feature_names_type)
grid_resolution = 25
# check with str features and array feature names and single column
disp = plot_partial_dependence(clf_boston, X,
[('CRIM', 'ZN'), 'ZN'],
grid_resolution=grid_resolution,
feature_names=feature_names,
n_cols=1, line_kw={"alpha": 0.8})
fig = pyplot.gcf()
axs = fig.get_axes()
assert len(axs) == 3
assert disp.figure_ is fig
assert disp.axes_.shape == (2, 1)
assert disp.lines_.shape == (2, 1)
assert disp.contours_.shape == (2, 1)
assert disp.deciles_vlines_.shape == (2, 1)
assert disp.deciles_hlines_.shape == (2, 1)
assert disp.lines_[0, 0] is None
assert disp.deciles_vlines_[0, 0] is not None
assert disp.deciles_hlines_[0, 0] is not None
assert disp.contours_[1, 0] is None
assert disp.deciles_hlines_[1, 0] is None
assert disp.deciles_vlines_[1, 0] is not None
# line
ax = disp.axes_[1, 0]
assert ax.get_xlabel() == "ZN"
assert ax.get_ylabel() == "Partial dependence"
line = disp.lines_[1, 0]
avg_preds, values = disp.pd_results[1]
target_idx = disp.target_idx
assert line.get_alpha() == 0.8
line_data = line.get_data()
assert_allclose(line_data[0], values[0])
assert_allclose(line_data[1], avg_preds[target_idx].ravel())
# contour
ax = disp.axes_[0, 0]
contour = disp.contours_[0, 0]
expected_levels = np.linspace(*disp.pdp_lim[2], num=8)
assert_allclose(contour.levels, expected_levels)
assert ax.get_xlabel() == "CRIM"
assert ax.get_ylabel() == "ZN"
def test_plot_partial_dependence_custom_axes(pyplot, clf_boston, boston):
grid_resolution = 25
fig, (ax1, ax2) = pyplot.subplots(1, 2)
feature_names = boston.feature_names.tolist()
disp = plot_partial_dependence(clf_boston, boston.data,
['CRIM', ('CRIM', 'ZN')],
grid_resolution=grid_resolution,
feature_names=feature_names, ax=[ax1, ax2])
assert fig is disp.figure_
assert disp.bounding_ax_ is None
assert disp.axes_.shape == (2, )
assert disp.axes_[0] is ax1
assert disp.axes_[1] is ax2
ax = disp.axes_[0]
assert ax.get_xlabel() == "CRIM"
assert ax.get_ylabel() == "Partial dependence"
line = disp.lines_[0]
avg_preds, values = disp.pd_results[0]
target_idx = disp.target_idx
line_data = line.get_data()
assert_allclose(line_data[0], values[0])
assert_allclose(line_data[1], avg_preds[target_idx].ravel())
# contour
ax = disp.axes_[1]
contour = disp.contours_[1]
expected_levels = np.linspace(*disp.pdp_lim[2], num=8)
assert_allclose(contour.levels, expected_levels)
assert ax.get_xlabel() == "CRIM"
assert ax.get_ylabel() == "ZN"
def test_plot_partial_dependence_passing_numpy_axes(pyplot, clf_boston,
boston):
grid_resolution = 25
feature_names = boston.feature_names.tolist()
disp1 = plot_partial_dependence(clf_boston, boston.data,
['CRIM', 'ZN'],
grid_resolution=grid_resolution,
feature_names=feature_names)
assert disp1.axes_.shape == (1, 2)
assert disp1.axes_[0, 0].get_ylabel() == "Partial dependence"
assert disp1.axes_[0, 1].get_ylabel() == ""
assert len(disp1.axes_[0, 0].get_lines()) == 1
assert len(disp1.axes_[0, 1].get_lines()) == 1
lr = LinearRegression()
lr.fit(boston.data, boston.target)
disp2 = plot_partial_dependence(lr, boston.data,
['CRIM', 'ZN'],
grid_resolution=grid_resolution,
feature_names=feature_names,
ax=disp1.axes_)
assert np.all(disp1.axes_ == disp2.axes_)
assert len(disp2.axes_[0, 0].get_lines()) == 2
assert len(disp2.axes_[0, 1].get_lines()) == 2
@pytest.mark.parametrize("nrows, ncols", [(2, 2), (3, 1)])
def test_plot_partial_dependence_incorrect_num_axes(pyplot, clf_boston,
boston, nrows, ncols):
grid_resolution = 5
fig, axes = pyplot.subplots(nrows, ncols)
axes_formats = [list(axes.ravel()), tuple(axes.ravel()), axes]
msg = "Expected ax to have 2 axes, got {}".format(nrows * ncols)
disp = plot_partial_dependence(clf_boston, boston.data,
['CRIM', 'ZN'],
grid_resolution=grid_resolution,
feature_names=boston.feature_names)
for ax_format in axes_formats:
with pytest.raises(ValueError, match=msg):
plot_partial_dependence(clf_boston, boston.data,
['CRIM', 'ZN'],
grid_resolution=grid_resolution,
feature_names=boston.feature_names,
ax=ax_format)
# with axes object
with pytest.raises(ValueError, match=msg):
disp.plot(ax=ax_format)
def test_plot_partial_dependence_with_same_axes(pyplot, clf_boston, boston):
# The first call to plot_partial_dependence will create two new axes to
# place in the space of the passed in axes, which results in a total of
# three axes in the figure.
# Currently the API does not allow for the second call to
# plot_partial_dependence to use the same axes again, because it will
# create two new axes in the space resulting in five axes. To get the
# expected behavior one needs to pass the generated axes into the second
# call:
# disp1 = plot_partial_dependence(...)
# disp2 = plot_partial_dependence(..., ax=disp1.axes_)
grid_resolution = 25
fig, ax = pyplot.subplots()
plot_partial_dependence(clf_boston, boston.data, ['CRIM', 'ZN'],
grid_resolution=grid_resolution,
feature_names=boston.feature_names, ax=ax)
msg = ("The ax was already used in another plot function, please set "
"ax=display.axes_ instead")
with pytest.raises(ValueError, match=msg):
plot_partial_dependence(clf_boston, boston.data,
['CRIM', 'ZN'],
grid_resolution=grid_resolution,
feature_names=boston.feature_names, ax=ax)
def test_plot_partial_dependence_feature_name_reuse(pyplot, clf_boston,
boston):
# second call to plot does not change the feature names from the first
# call
feature_names = boston.feature_names
disp = plot_partial_dependence(clf_boston, boston.data,
[0, 1],
grid_resolution=10,
feature_names=feature_names)
plot_partial_dependence(clf_boston, boston.data, [0, 1],
grid_resolution=10, ax=disp.axes_)
for i, ax in enumerate(disp.axes_.ravel()):
assert ax.get_xlabel() == feature_names[i]
def test_plot_partial_dependence_multiclass(pyplot):
grid_resolution = 25
clf_int = GradientBoostingClassifier(n_estimators=10, random_state=1)
iris = load_iris()
# Test partial dependence plot function on multi-class input.
clf_int.fit(iris.data, iris.target)
disp_target_0 = plot_partial_dependence(clf_int, iris.data, [0, 1],
target=0,
grid_resolution=grid_resolution)
assert disp_target_0.figure_ is pyplot.gcf()
assert disp_target_0.axes_.shape == (1, 2)
assert disp_target_0.lines_.shape == (1, 2)
assert disp_target_0.contours_.shape == (1, 2)
assert disp_target_0.deciles_vlines_.shape == (1, 2)
assert disp_target_0.deciles_hlines_.shape == (1, 2)
assert all(c is None for c in disp_target_0.contours_.flat)
assert disp_target_0.target_idx == 0
# now with symbol labels
target = iris.target_names[iris.target]
clf_symbol = GradientBoostingClassifier(n_estimators=10, random_state=1)
clf_symbol.fit(iris.data, target)
disp_symbol = plot_partial_dependence(clf_symbol, iris.data, [0, 1],
target='setosa',
grid_resolution=grid_resolution)
assert disp_symbol.figure_ is pyplot.gcf()
assert disp_symbol.axes_.shape == (1, 2)
assert disp_symbol.lines_.shape == (1, 2)
assert disp_symbol.contours_.shape == (1, 2)
assert disp_symbol.deciles_vlines_.shape == (1, 2)
assert disp_symbol.deciles_hlines_.shape == (1, 2)
assert all(c is None for c in disp_symbol.contours_.flat)
assert disp_symbol.target_idx == 0
for int_result, symbol_result in zip(disp_target_0.pd_results,
disp_symbol.pd_results):
avg_preds_int, values_int = int_result
avg_preds_symbol, values_symbol = symbol_result
assert_allclose(avg_preds_int, avg_preds_symbol)
assert_allclose(values_int, values_symbol)
# check that the pd plots are different for another target
disp_target_1 = plot_partial_dependence(clf_int, iris.data, [0, 1],
target=1,
grid_resolution=grid_resolution)
target_0_data_y = disp_target_0.lines_[0, 0].get_data()[1]
target_1_data_y = disp_target_1.lines_[0, 0].get_data()[1]
assert any(target_0_data_y != target_1_data_y)
multioutput_regression_data = make_regression(n_samples=50, n_targets=2,
random_state=0)
@pytest.mark.parametrize("target", [0, 1])
def test_plot_partial_dependence_multioutput(pyplot, target):
# Test partial dependence plot function on multi-output input.
X, y = multioutput_regression_data
clf = LinearRegression().fit(X, y)
grid_resolution = 25
disp = plot_partial_dependence(clf, X, [0, 1], target=target,
grid_resolution=grid_resolution)
fig = pyplot.gcf()
axs = fig.get_axes()
assert len(axs) == 3
assert disp.target_idx == target
assert disp.bounding_ax_ is not None
positions = [(0, 0), (0, 1)]
expected_label = ["Partial dependence", ""]
for i, pos in enumerate(positions):
ax = disp.axes_[pos]
assert ax.get_ylabel() == expected_label[i]
assert ax.get_xlabel() == "{}".format(i)
def test_plot_partial_dependence_dataframe(pyplot, clf_boston, boston):
pd = pytest.importorskip('pandas')
df = pd.DataFrame(boston.data, columns=boston.feature_names)
grid_resolution = 25
plot_partial_dependence(
clf_boston, df, ['TAX', 'AGE'], grid_resolution=grid_resolution,
feature_names=df.columns.tolist()
)
dummy_classification_data = make_classification(random_state=0)
@pytest.mark.parametrize(
"data, params, err_msg",
[(multioutput_regression_data, {"target": None, 'features': [0]},
"target must be specified for multi-output"),
(multioutput_regression_data, {"target": -1, 'features': [0]},
r'target must be in \[0, n_tasks\]'),
(multioutput_regression_data, {"target": 100, 'features': [0]},
r'target must be in \[0, n_tasks\]'),
(dummy_classification_data,
{'features': ['foobar'], 'feature_names': None},
'Feature foobar not in feature_names'),
(dummy_classification_data,
{'features': ['foobar'], 'feature_names': ['abcd', 'def']},
'Feature foobar not in feature_names'),
(dummy_classification_data, {'features': [(1, 2, 3)]},
'Each entry in features must be either an int, '),
(dummy_classification_data, {'features': [1, {}]},
'Each entry in features must be either an int, '),
(dummy_classification_data, {'features': [tuple()]},
'Each entry in features must be either an int, '),
(dummy_classification_data,
{'features': [123], 'feature_names': ['blahblah']},
'All entries of features must be less than '),
(dummy_classification_data,
{'features': [0, 1, 2], 'feature_names': ['a', 'b', 'a']},
'feature_names should not contain duplicates')]
)
def test_plot_partial_dependence_error(pyplot, data, params, err_msg):
X, y = data
estimator = LinearRegression().fit(X, y)
with pytest.raises(ValueError, match=err_msg):
plot_partial_dependence(estimator, X, **params)
@pytest.mark.parametrize("params, err_msg", [
({'target': 4, 'features': [0]},
'target not in est.classes_, got 4'),
({'target': None, 'features': [0]},
'target must be specified for multi-class'),
({'target': 1, 'features': [4.5]},
'Each entry in features must be either an int,'),
])
def test_plot_partial_dependence_multiclass_error(pyplot, params, err_msg):
iris = load_iris()
clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
clf.fit(iris.data, iris.target)
with pytest.raises(ValueError, match=err_msg):
plot_partial_dependence(clf, iris.data, **params)
def test_plot_partial_dependence_fig_deprecated(pyplot):
# Make sure fig object is correctly used if not None
X, y = make_regression(n_samples=50, random_state=0)
clf = LinearRegression()
clf.fit(X, y)
fig = pyplot.figure()
grid_resolution = 25
msg = ("The fig parameter is deprecated in version 0.22 and will be "
"removed in version 0.24")
with pytest.warns(FutureWarning, match=msg):
plot_partial_dependence(
clf, X, [0, 1], target=0, grid_resolution=grid_resolution, fig=fig)
assert pyplot.gcf() is fig

View file

@@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _partial_dependence # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.inspection.partial_dependence'
correct_import_path = 'sklearn.inspection'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_partial_dependence, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)
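# Added note: on Python >= 3.7, PEP 562 gives modules native __getattr__
# support, so the Pep562 wrapper above is only needed to backport that
# behavior to older interpreters.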

View file

@@ -0,0 +1,17 @@
from numpy.distutils.misc_util import Configuration
def configuration(parent_package="", top_path=None):
config = Configuration("inspection", parent_package, top_path)
config.add_subpackage('_plot')
config.add_subpackage('_plot.tests')
config.add_subpackage('tests')
return config
if __name__ == "__main__":
from numpy.distutils.core import setup
setup(**configuration().todict())

View file

@@ -0,0 +1,663 @@
"""
Testing for the partial dependence module.
"""
import numpy as np
import pytest
import sklearn
from sklearn.inspection import partial_dependence
from sklearn.inspection._partial_dependence import (
_grid_from_X,
_partial_dependence_brute,
_partial_dependence_recursion
)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import MultiTaskLasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import load_iris
from sklearn.datasets import make_classification, make_regression
from sklearn.cluster import KMeans
from sklearn.compose import make_column_transformer
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyClassifier
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.exceptions import NotFittedError
from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import ignore_warnings
from sklearn.utils import _IS_32BIT
from sklearn.utils.validation import check_random_state
from sklearn.tree.tests.test_tree import assert_is_subtree
# toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [-1, -1, -1, 1, 1, 1]
# (X, y), n_targets <-- as expected in the output of partial_dep()
binary_classification_data = (make_classification(n_samples=50,
random_state=0), 1)
multiclass_classification_data = (make_classification(n_samples=50,
n_classes=3,
n_clusters_per_class=1,
random_state=0), 3)
regression_data = (make_regression(n_samples=50, random_state=0), 1)
multioutput_regression_data = (make_regression(n_samples=50, n_targets=2,
random_state=0), 2)
# iris
iris = load_iris()
@pytest.mark.parametrize('Estimator, method, data', [
(GradientBoostingClassifier, 'recursion', binary_classification_data),
(GradientBoostingClassifier, 'recursion', multiclass_classification_data),
(GradientBoostingClassifier, 'brute', binary_classification_data),
(GradientBoostingClassifier, 'brute', multiclass_classification_data),
(GradientBoostingRegressor, 'recursion', regression_data),
(GradientBoostingRegressor, 'brute', regression_data),
(DecisionTreeRegressor, 'brute', regression_data),
(LinearRegression, 'brute', regression_data),
(LinearRegression, 'brute', multioutput_regression_data),
(LogisticRegression, 'brute', binary_classification_data),
(LogisticRegression, 'brute', multiclass_classification_data),
(MultiTaskLasso, 'brute', multioutput_regression_data),
])
@pytest.mark.parametrize('grid_resolution', (5, 10))
@pytest.mark.parametrize('features', ([1], [1, 2]))
def test_output_shape(Estimator, method, data, grid_resolution,
features):
# Check that partial_dependence has consistent output shape for different
# kinds of estimators:
# - classifiers with binary and multiclass settings
# - regressors
# - multi-task regressors
est = Estimator()
    # n_targets corresponds to the number of classes (1 for binary
    # classification) or the number of tasks / outputs in multi-task
    # settings. It's equal to 1 for classical regression data.
(X, y), n_targets = data
est.fit(X, y)
pdp, axes = partial_dependence(est, X=X, features=features,
method=method,
grid_resolution=grid_resolution)
expected_pdp_shape = (n_targets, *[grid_resolution
for _ in range(len(features))])
expected_axes_shape = (len(features), grid_resolution)
assert pdp.shape == expected_pdp_shape
assert axes is not None
assert np.asarray(axes).shape == expected_axes_shape
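# Shape convention illustrated (added note; the numbers below are an
# example only): for a 3-class classifier with features=[1, 2] and
# grid_resolution=5, partial_dependence returns a pdp of shape (3, 5, 5)
# plus a list of two 1d arrays of grid values, each of length 5.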
def test_grid_from_X():
# tests for _grid_from_X: sanity check for output, and for shapes.
# Make sure that the grid is a cartesian product of the input (it will use
# the unique values instead of the percentiles)
percentiles = (.05, .95)
grid_resolution = 100
X = np.asarray([[1, 2],
[3, 4]])
grid, axes = _grid_from_X(X, percentiles, grid_resolution)
assert_array_equal(grid, [[1, 2],
[1, 4],
[3, 2],
[3, 4]])
assert_array_equal(axes, X.T)
# test shapes of returned objects depending on the number of unique values
# for a feature.
rng = np.random.RandomState(0)
grid_resolution = 15
# n_unique_values > grid_resolution
X = rng.normal(size=(20, 2))
grid, axes = _grid_from_X(X, percentiles, grid_resolution=grid_resolution)
assert grid.shape == (grid_resolution * grid_resolution, X.shape[1])
assert np.asarray(axes).shape == (2, grid_resolution)
# n_unique_values < grid_resolution, will use actual values
n_unique_values = 12
X[n_unique_values - 1:, 0] = 12345
rng.shuffle(X) # just to make sure the order is irrelevant
grid, axes = _grid_from_X(X, percentiles, grid_resolution=grid_resolution)
assert grid.shape == (n_unique_values * grid_resolution, X.shape[1])
# axes is a list of arrays of different shapes
assert axes[0].shape == (n_unique_values,)
assert axes[1].shape == (grid_resolution,)
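# Illustrative sketch (added; the helper name is hypothetical and not part
# of the original suite): when every column has fewer unique values than
# grid_resolution, the grid returned by _grid_from_X is just the cartesian
# product of the per-column unique values, as np.meshgrid reproduces here.
def _demo_grid_is_cartesian_product():
    X_demo = np.asarray([[1, 2], [3, 4]])
    cols = [np.unique(X_demo[:, j]) for j in range(X_demo.shape[1])]
    mesh = np.meshgrid(*cols, indexing='ij')
    expected = np.column_stack([m.ravel() for m in mesh])
    grid, _ = _grid_from_X(X_demo, (.05, .95), grid_resolution=100)
    assert_array_equal(grid, expected)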
@pytest.mark.parametrize(
"grid_resolution, percentiles, err_msg",
[(2, (0, 0.0001), "percentiles are too close"),
(100, (1, 2, 3, 4), "'percentiles' must be a sequence of 2 elements"),
(100, 12345, "'percentiles' must be a sequence of 2 elements"),
(100, (-1, .95), r"'percentiles' values must be in \[0, 1\]"),
(100, (.05, 2), r"'percentiles' values must be in \[0, 1\]"),
(100, (.9, .1), r"percentiles\[0\] must be strictly less than"),
(1, (0.05, 0.95), "'grid_resolution' must be strictly greater than 1")]
)
def test_grid_from_X_error(grid_resolution, percentiles, err_msg):
X = np.asarray([[1, 2], [3, 4]])
with pytest.raises(ValueError, match=err_msg):
_grid_from_X(
X, grid_resolution=grid_resolution, percentiles=percentiles
)
@pytest.mark.parametrize('target_feature', range(5))
@pytest.mark.parametrize('est, method', [
(LinearRegression(), 'brute'),
(GradientBoostingRegressor(random_state=0), 'brute'),
(GradientBoostingRegressor(random_state=0), 'recursion'),
(HistGradientBoostingRegressor(random_state=0), 'brute'),
(HistGradientBoostingRegressor(random_state=0), 'recursion')]
)
def test_partial_dependence_helpers(est, method, target_feature):
# Check that what is returned by _partial_dependence_brute or
# _partial_dependence_recursion is equivalent to manually setting a target
# feature to a given value, and computing the average prediction over all
# samples.
# This also checks that the brute and recursion methods give the same
# output.
# Note that even on the trainset, the brute and the recursion methods
    # aren't always strictly equivalent, in particular when the slow (brute)
    # method generates unrealistic samples that have low mass in the joint
# distribution of the input features, and when some of the features are
# dependent. Hence the high tolerance on the checks.
X, y = make_regression(random_state=0, n_features=5, n_informative=5)
# The 'init' estimator for GBDT (here the average prediction) isn't taken
# into account with the recursion method, for technical reasons. We set
    # the mean to 0 so that this 'bug' doesn't have any effect.
y = y - y.mean()
est.fit(X, y)
# target feature will be set to .5 and then to 123
features = np.array([target_feature], dtype=np.int32)
grid = np.array([[.5],
[123]])
if method == 'brute':
pdp = _partial_dependence_brute(est, grid, features, X,
response_method='auto')
else:
pdp = _partial_dependence_recursion(est, grid, features)
mean_predictions = []
for val in (.5, 123):
X_ = X.copy()
X_[:, target_feature] = val
mean_predictions.append(est.predict(X_).mean())
pdp = pdp[0] # (shape is (1, 2) so make it (2,))
# allow for greater margin for error with recursion method
rtol = 1e-1 if method == 'recursion' else 1e-3
assert np.allclose(pdp, mean_predictions, rtol=rtol)
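# A minimal sketch (added; the helper name is hypothetical) of the
# brute-force definition checked above: the partial dependence of feature j
# at grid value v is the mean prediction over the samples after overwriting
# column j with v.
def _demo_brute_partial_dependence(est, X, feature_idx, value):
    X_mod = X.copy()
    X_mod[:, feature_idx] = value
    return est.predict(X_mod).mean()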
@pytest.mark.parametrize('seed', range(1))
def test_recursion_decision_tree_vs_forest_and_gbdt(seed):
# Make sure that the recursion method gives the same results on a
# DecisionTreeRegressor and a GradientBoostingRegressor or a
# RandomForestRegressor with 1 tree and equivalent parameters.
rng = np.random.RandomState(seed)
# Purely random dataset to avoid correlated features
n_samples = 1000
n_features = 5
X = rng.randn(n_samples, n_features)
y = rng.randn(n_samples) * 10
# The 'init' estimator for GBDT (here the average prediction) isn't taken
# into account with the recursion method, for technical reasons. We set
    # the mean to 0 so that this 'bug' doesn't have any effect.
y = y - y.mean()
# set max_depth not too high to avoid splits with same gain but different
# features
max_depth = 5
tree_seed = 0
forest = RandomForestRegressor(n_estimators=1, max_features=None,
bootstrap=False, max_depth=max_depth,
random_state=tree_seed)
# The forest will use ensemble.base._set_random_states to set the
# random_state of the tree sub-estimator. We simulate this here to have
# equivalent estimators.
equiv_random_state = check_random_state(tree_seed).randint(
np.iinfo(np.int32).max)
gbdt = GradientBoostingRegressor(n_estimators=1, learning_rate=1,
criterion='mse', max_depth=max_depth,
random_state=equiv_random_state)
tree = DecisionTreeRegressor(max_depth=max_depth,
random_state=equiv_random_state)
forest.fit(X, y)
gbdt.fit(X, y)
tree.fit(X, y)
# sanity check: if the trees aren't the same, the PD values won't be equal
try:
assert_is_subtree(tree.tree_, gbdt[0, 0].tree_)
assert_is_subtree(tree.tree_, forest[0].tree_)
except AssertionError:
# For some reason the trees aren't exactly equal on 32bits, so the PDs
# cannot be equal either. See
# https://github.com/scikit-learn/scikit-learn/issues/8853
assert _IS_32BIT, "this should only fail on 32 bit platforms"
return
grid = rng.randn(50).reshape(-1, 1)
for f in range(n_features):
features = np.array([f], dtype=np.int32)
pdp_forest = _partial_dependence_recursion(forest, grid, features)
pdp_gbdt = _partial_dependence_recursion(gbdt, grid, features)
pdp_tree = _partial_dependence_recursion(tree, grid, features)
np.testing.assert_allclose(pdp_gbdt, pdp_tree)
np.testing.assert_allclose(pdp_forest, pdp_tree)
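# Context note (added): with n_estimators=1, learning_rate=1 and centered
# targets (the constant init then predicts 0), the GBDT reduces to a single
# regression tree, and a 1-tree forest without bootstrap is likewise a
# single tree, which is why all three recursion PDs should coincide.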
@pytest.mark.parametrize('est', (
GradientBoostingClassifier(random_state=0),
HistGradientBoostingClassifier(random_state=0),
))
@pytest.mark.parametrize('target_feature', (0, 1, 2, 3, 4, 5))
def test_recursion_decision_function(est, target_feature):
    # Make sure the recursion method (which implicitly uses
    # decision_function) gives the same result as the brute method with
    # response_method='decision_function'
X, y = make_classification(n_classes=2, n_clusters_per_class=1,
random_state=1)
assert np.mean(y) == .5 # make sure the init estimator predicts 0 anyway
est.fit(X, y)
preds_1, _ = partial_dependence(est, X, [target_feature],
response_method='decision_function',
method='recursion')
preds_2, _ = partial_dependence(est, X, [target_feature],
response_method='decision_function',
method='brute')
assert_allclose(preds_1, preds_2, atol=1e-7)
@pytest.mark.parametrize('est', (
LinearRegression(),
GradientBoostingRegressor(random_state=0),
HistGradientBoostingRegressor(random_state=0, min_samples_leaf=1,
max_leaf_nodes=None, max_iter=1),
DecisionTreeRegressor(random_state=0),
))
@pytest.mark.parametrize('power', (1, 2))
def test_partial_dependence_easy_target(est, power):
# If the target y only depends on one feature in an obvious way (linear or
# quadratic) then the partial dependence for that feature should reflect
# it.
    # Here we fit a linear regression model (with polynomial features if
    # needed) and compute r_squared to check that the partial dependence
    # correctly reflects the target.
rng = np.random.RandomState(0)
n_samples = 200
target_variable = 2
X = rng.normal(size=(n_samples, 5))
y = X[:, target_variable]**power
est.fit(X, y)
averaged_predictions, values = partial_dependence(
est, features=[target_variable], X=X, grid_resolution=1000)
new_X = values[0].reshape(-1, 1)
new_y = averaged_predictions[0]
# add polynomial features if needed
new_X = PolynomialFeatures(degree=power).fit_transform(new_X)
lr = LinearRegression().fit(new_X, new_y)
r2 = r2_score(new_y, lr.predict(new_X))
assert r2 > .99
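# Reasoning note (added): with independent standard-normal features and
# y = X[:, j] ** power, the ideal partial dependence of feature j is
# E[y | x_j = v] = v ** power, so regressing the PD curve on polynomial
# features of the grid values should give an r2 close to 1.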
@pytest.mark.parametrize('Estimator',
(sklearn.tree.DecisionTreeClassifier,
sklearn.tree.ExtraTreeClassifier,
sklearn.ensemble.ExtraTreesClassifier,
sklearn.neighbors.KNeighborsClassifier,
sklearn.neighbors.RadiusNeighborsClassifier,
sklearn.ensemble.RandomForestClassifier))
def test_multiclass_multioutput(Estimator):
# Make sure error is raised for multiclass-multioutput classifiers
# make multiclass-multioutput dataset
X, y = make_classification(n_classes=3, n_clusters_per_class=1,
random_state=0)
y = np.array([y, y]).T
est = Estimator()
est.fit(X, y)
with pytest.raises(
ValueError,
match="Multiclass-multioutput estimators are not supported"):
partial_dependence(est, X, [0])
class NoPredictProbaNoDecisionFunction(ClassifierMixin, BaseEstimator):
def fit(self, X, y):
# simulate that we have some classes
self.classes_ = [0, 1]
return self
@pytest.mark.parametrize(
"estimator, params, err_msg",
[(KMeans(),
{'features': [0]},
"'estimator' must be a fitted regressor or classifier"),
(LinearRegression(),
{'features': [0], 'response_method': 'predict_proba'},
'The response_method parameter is ignored for regressors'),
(GradientBoostingClassifier(random_state=0),
{'features': [0], 'response_method': 'predict_proba',
'method': 'recursion'},
"'recursion' method, the response_method must be 'decision_function'"),
(GradientBoostingClassifier(random_state=0),
{'features': [0], 'response_method': 'predict_proba', 'method': 'auto'},
"'recursion' method, the response_method must be 'decision_function'"),
(GradientBoostingClassifier(random_state=0),
{'features': [0], 'response_method': 'blahblah'},
'response_method blahblah is invalid. Accepted response_method'),
(NoPredictProbaNoDecisionFunction(),
{'features': [0], 'response_method': 'auto'},
'The estimator has no predict_proba and no decision_function method'),
(NoPredictProbaNoDecisionFunction(),
{'features': [0], 'response_method': 'predict_proba'},
'The estimator has no predict_proba method.'),
(NoPredictProbaNoDecisionFunction(),
{'features': [0], 'response_method': 'decision_function'},
'The estimator has no decision_function method.'),
(LinearRegression(),
{'features': [0], 'method': 'blahblah'},
'blahblah is invalid. Accepted method names are brute, recursion, auto'),
(LinearRegression(),
{'features': [0], 'method': 'recursion'},
"Only the following estimators support the 'recursion' method:")]
)
def test_partial_dependence_error(estimator, params, err_msg):
X, y = make_classification(random_state=0)
estimator.fit(X, y)
with pytest.raises(ValueError, match=err_msg):
partial_dependence(estimator, X, **params)
@pytest.mark.parametrize(
"with_dataframe, err_msg",
[(True, "Only array-like or scalar are supported"),
(False, "Only array-like or scalar are supported")]
)
def test_partial_dependence_slice_error(with_dataframe, err_msg):
X, y = make_classification(random_state=0)
if with_dataframe:
pd = pytest.importorskip('pandas')
X = pd.DataFrame(X)
estimator = LogisticRegression().fit(X, y)
with pytest.raises(TypeError, match=err_msg):
partial_dependence(estimator, X, features=slice(0, 2, 1))
@pytest.mark.parametrize(
'estimator',
[LinearRegression(), GradientBoostingClassifier(random_state=0)]
)
@pytest.mark.parametrize('features', [-1, 10000])
def test_partial_dependence_unknown_feature_indices(estimator, features):
X, y = make_classification(random_state=0)
estimator.fit(X, y)
err_msg = 'all features must be in'
with pytest.raises(ValueError, match=err_msg):
partial_dependence(estimator, X, [features])
@pytest.mark.parametrize(
'estimator',
[LinearRegression(), GradientBoostingClassifier(random_state=0)]
)
def test_partial_dependence_unknown_feature_string(estimator):
pd = pytest.importorskip("pandas")
X, y = make_classification(random_state=0)
df = pd.DataFrame(X)
estimator.fit(df, y)
features = ['random']
err_msg = 'A given column is not a column of the dataframe'
with pytest.raises(ValueError, match=err_msg):
partial_dependence(estimator, df, features)
@pytest.mark.parametrize(
'estimator',
[LinearRegression(), GradientBoostingClassifier(random_state=0)]
)
def test_partial_dependence_X_list(estimator):
# check that array-like objects are accepted
X, y = make_classification(random_state=0)
estimator.fit(X, y)
partial_dependence(estimator, list(X), [0])
# TODO: Remove in 0.24 when DummyClassifier's `strategy` default updates
@ignore_warnings(category=FutureWarning)
def test_warning_recursion_non_constant_init():
    # make sure that passing a non-constant init estimator to a GBDT and
    # using the recursion method yields a warning.
gbc = GradientBoostingClassifier(init=DummyClassifier(), random_state=0)
gbc.fit(X, y)
with pytest.warns(
UserWarning,
match='Using recursion method with a non-constant init predictor'):
partial_dependence(gbc, X, [0], method='recursion')
with pytest.warns(
UserWarning,
match='Using recursion method with a non-constant init predictor'):
partial_dependence(gbc, X, [0], method='recursion')
def test_partial_dependence_sample_weight():
    # Test near-perfect correlation between partial dependence and the
    # diagonal when sample weights emphasize the y = x data points
# non-regression test for #13193
# TODO: extend to HistGradientBoosting once sample_weight is supported
N = 1000
rng = np.random.RandomState(123456)
mask = rng.randint(2, size=N, dtype=bool)
x = rng.rand(N)
# set y = x on mask and y = -x outside
y = x.copy()
y[~mask] = -y[~mask]
X = np.c_[mask, x]
# sample weights to emphasize data points where y = x
sample_weight = np.ones(N)
sample_weight[mask] = 1000.
clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
clf.fit(X, y, sample_weight=sample_weight)
pdp, values = partial_dependence(clf, X, features=[1])
assert np.corrcoef(pdp, values)[0, 1] > 0.99
def test_hist_gbdt_sw_not_supported():
# TODO: remove/fix when PDP supports HGBT with sample weights
clf = HistGradientBoostingRegressor(random_state=1)
clf.fit(X, y, sample_weight=np.ones(len(X)))
with pytest.raises(NotImplementedError,
match="does not support partial dependence"):
partial_dependence(clf, X, features=[1])
# TODO: Remove in 0.24 when DummyClassifier's `strategy` default updates
@ignore_warnings(category=FutureWarning)
def test_partial_dependence_pipeline():
    # check that partial_dependence supports pipelines
iris = load_iris()
scaler = StandardScaler()
clf = DummyClassifier(random_state=42)
pipe = make_pipeline(scaler, clf)
clf.fit(scaler.fit_transform(iris.data), iris.target)
pipe.fit(iris.data, iris.target)
features = 0
pdp_pipe, values_pipe = partial_dependence(
pipe, iris.data, features=[features], grid_resolution=10
)
pdp_clf, values_clf = partial_dependence(
clf, scaler.transform(iris.data), features=[features],
grid_resolution=10
)
assert_allclose(pdp_pipe, pdp_clf)
assert_allclose(
values_pipe[0],
values_clf[0] * scaler.scale_[features] + scaler.mean_[features]
)
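# Side note (added): the rescaling asserted above is StandardScaler's
# inverse mapping: the scaler computes z = (x - mean_) / scale_, so grid
# values in the scaled space map back via x = z * scale_[j] + mean_[j].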
@pytest.mark.parametrize(
"estimator",
[LogisticRegression(max_iter=1000, random_state=0),
GradientBoostingClassifier(random_state=0, n_estimators=5)],
ids=['estimator-brute', 'estimator-recursion']
)
@pytest.mark.parametrize(
"preprocessor",
[None,
make_column_transformer(
(StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
(RobustScaler(), [iris.feature_names[i] for i in (1, 3)])),
make_column_transformer(
(StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
remainder='passthrough')],
ids=['None', 'column-transformer', 'column-transformer-passthrough']
)
@pytest.mark.parametrize(
"features",
[[0, 2], [iris.feature_names[i] for i in (0, 2)]],
ids=['features-integer', 'features-string']
)
def test_partial_dependence_dataframe(estimator, preprocessor, features):
    # check that partial_dependence supports dataframes and pipelines
    # that include a column transformer
pd = pytest.importorskip("pandas")
df = pd.DataFrame(iris.data, columns=iris.feature_names)
pipe = make_pipeline(preprocessor, estimator)
pipe.fit(df, iris.target)
pdp_pipe, values_pipe = partial_dependence(
pipe, df, features=features, grid_resolution=10
)
    # the column transformer will reorder the columns when transforming;
    # we mix the indices to be sure that we are computing the partial
    # dependence of the right columns
if preprocessor is not None:
X_proc = clone(preprocessor).fit_transform(df)
features_clf = [0, 1]
else:
X_proc = df
features_clf = [0, 2]
clf = clone(estimator).fit(X_proc, iris.target)
pdp_clf, values_clf = partial_dependence(
clf, X_proc, features=features_clf, method='brute', grid_resolution=10
)
assert_allclose(pdp_pipe, pdp_clf)
if preprocessor is not None:
scaler = preprocessor.named_transformers_['standardscaler']
assert_allclose(
values_pipe[1],
values_clf[1] * scaler.scale_[1] + scaler.mean_[1]
)
else:
assert_allclose(values_pipe[1], values_clf[1])
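# Clarifying note (added): the StandardScaler block is listed first in the
# column transformers above, so its two inputs (raw features 0 and 2) come
# out as transformed columns 0 and 1, hence features_clf = [0, 1] when a
# preprocessor is used.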
@pytest.mark.parametrize(
"features, expected_pd_shape",
[(0, (3, 10)),
(iris.feature_names[0], (3, 10)),
([0, 2], (3, 10, 10)),
([iris.feature_names[i] for i in (0, 2)], (3, 10, 10)),
([True, False, True, False], (3, 10, 10))],
ids=['scalar-int', 'scalar-str', 'list-int', 'list-str', 'mask']
)
def test_partial_dependence_feature_type(features, expected_pd_shape):
# check all possible features type supported in PDP
pd = pytest.importorskip("pandas")
df = pd.DataFrame(iris.data, columns=iris.feature_names)
preprocessor = make_column_transformer(
(StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
(RobustScaler(), [iris.feature_names[i] for i in (1, 3)])
)
pipe = make_pipeline(
preprocessor, LogisticRegression(max_iter=1000, random_state=0)
)
pipe.fit(df, iris.target)
pdp_pipe, values_pipe = partial_dependence(
pipe, df, features=features, grid_resolution=10
)
assert pdp_pipe.shape == expected_pd_shape
assert len(values_pipe) == len(pdp_pipe.shape) - 1
@pytest.mark.parametrize(
"estimator", [LinearRegression(), LogisticRegression(),
GradientBoostingRegressor(), GradientBoostingClassifier()]
)
def test_partial_dependence_unfitted(estimator):
X = iris.data
preprocessor = make_column_transformer(
(StandardScaler(), [0, 2]), (RobustScaler(), [1, 3])
)
pipe = make_pipeline(preprocessor, estimator)
with pytest.raises(NotFittedError, match="is not fitted yet"):
partial_dependence(pipe, X, features=[0, 2], grid_resolution=10)
with pytest.raises(NotFittedError, match="is not fitted yet"):
partial_dependence(estimator, X, features=[0, 2], grid_resolution=10)

View file

@@ -0,0 +1,353 @@
import pytest
import numpy as np
from numpy.testing import assert_allclose
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_iris
from sklearn.datasets import make_classification
from sklearn.datasets import make_regression
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.utils import parallel_backend
from sklearn.utils._testing import _convert_container
@pytest.mark.parametrize("n_jobs", [1, 2])
def test_permutation_importance_correlated_feature_regression(n_jobs):
    # Make sure that a feature highly correlated with the target has a
    # higher importance
rng = np.random.RandomState(42)
n_repeats = 5
X, y = load_diabetes(return_X_y=True)
y_with_little_noise = (
y + rng.normal(scale=0.001, size=y.shape[0])).reshape(-1, 1)
X = np.hstack([X, y_with_little_noise])
clf = RandomForestRegressor(n_estimators=10, random_state=42)
clf.fit(X, y)
result = permutation_importance(clf, X, y, n_repeats=n_repeats,
random_state=rng, n_jobs=n_jobs)
assert result.importances.shape == (X.shape[1], n_repeats)
    # the feature correlated with y was added as the last column and should
# have the highest importance
assert np.all(result.importances_mean[-1] >
result.importances_mean[:-1])
@pytest.mark.parametrize("n_jobs", [1, 2])
def test_permutation_importance_correlated_feature_regression_pandas(n_jobs):
pd = pytest.importorskip("pandas")
    # Make sure that a feature highly correlated with the target has a
    # higher importance
rng = np.random.RandomState(42)
n_repeats = 5
dataset = load_iris()
X, y = dataset.data, dataset.target
y_with_little_noise = (
y + rng.normal(scale=0.001, size=y.shape[0])).reshape(-1, 1)
    # Add a feature correlated with y as the last column
X = pd.DataFrame(X, columns=dataset.feature_names)
X['correlated_feature'] = y_with_little_noise
clf = RandomForestClassifier(n_estimators=10, random_state=42)
clf.fit(X, y)
result = permutation_importance(clf, X, y, n_repeats=n_repeats,
random_state=rng, n_jobs=n_jobs)
assert result.importances.shape == (X.shape[1], n_repeats)
    # the feature correlated with y was added as the last column and should
# have the highest importance
assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])
@pytest.mark.parametrize("n_jobs", [1, 2])
def test_robustness_to_high_cardinality_noisy_feature(n_jobs, seed=42):
# Permutation variable importance should not be affected by the high
# cardinality bias of traditional feature importances, especially when
# computed on a held-out test set:
rng = np.random.RandomState(seed)
n_repeats = 5
n_samples = 1000
n_classes = 5
n_informative_features = 2
n_noise_features = 1
n_features = n_informative_features + n_noise_features
# Generate a multiclass classification dataset and a set of informative
# binary features that can be used to predict some classes of y exactly
# while leaving some classes unexplained to make the problem harder.
classes = np.arange(n_classes)
y = rng.choice(classes, size=n_samples)
X = np.hstack([(y == c).reshape(-1, 1)
for c in classes[:n_informative_features]])
X = X.astype(np.float32)
# Not all target classes are explained by the binary class indicator
# features:
assert n_informative_features < n_classes
    # Add n_noise_features other noisy features with high cardinality
    # (numerical) values that can be used to overfit the training data.
X = np.concatenate([X, rng.randn(n_samples, n_noise_features)], axis=1)
assert X.shape == (n_samples, n_features)
# Split the dataset to be able to evaluate on a held-out test set. The
    # test size should be large enough for importance measurements to be
# stable:
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.5, random_state=rng)
clf = RandomForestClassifier(n_estimators=5, random_state=rng)
clf.fit(X_train, y_train)
# Variable importances computed by impurity decrease on the tree node
    # splits often use the noisy features in splits. This can give the
    # misleading impression that high cardinality noisy variables are the
    # most important:
tree_importances = clf.feature_importances_
informative_tree_importances = tree_importances[:n_informative_features]
noisy_tree_importances = tree_importances[n_informative_features:]
assert informative_tree_importances.max() < noisy_tree_importances.min()
# Let's check that permutation-based feature importances do not have this
# problem.
r = permutation_importance(clf, X_test, y_test, n_repeats=n_repeats,
random_state=rng, n_jobs=n_jobs)
assert r.importances.shape == (X.shape[1], n_repeats)
# Split the importances between informative and noisy features
informative_importances = r.importances_mean[:n_informative_features]
noisy_importances = r.importances_mean[n_informative_features:]
    # Because we do not have a binary variable explaining each target class,
    # the RF model will have to use the random variable to make some
    # (overfitting) splits (as max_depth is not set). Therefore the noisy
    # variables will have non-zero importances, with small values oscillating
    # around zero:
assert max(np.abs(noisy_importances)) > 1e-7
assert noisy_importances.max() < 0.05
# The binary features correlated with y should have a higher importance
# than the high cardinality noisy features.
# The maximum test accuracy is 2 / 5 == 0.4, each informative feature
    # contributing a bit more than 0.2 of accuracy.
assert informative_importances.min() > 0.15
def test_permutation_importance_mixed_types():
rng = np.random.RandomState(42)
n_repeats = 4
# Last column is correlated with y
X = np.array([[1.0, 2.0, 3.0, np.nan], [2, 1, 2, 1]]).T
y = np.array([0, 1, 0, 1])
clf = make_pipeline(SimpleImputer(), LogisticRegression(solver='lbfgs'))
clf.fit(X, y)
result = permutation_importance(clf, X, y, n_repeats=n_repeats,
random_state=rng)
assert result.importances.shape == (X.shape[1], n_repeats)
    # the feature correlated with y is the last column and should
# have the highest importance
assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])
# use another random state
rng = np.random.RandomState(0)
result2 = permutation_importance(clf, X, y, n_repeats=n_repeats,
random_state=rng)
assert result2.importances.shape == (X.shape[1], n_repeats)
assert not np.allclose(result.importances, result2.importances)
    # the feature correlated with y is the last column and should
# have the highest importance
assert np.all(result2.importances_mean[-1] > result2.importances_mean[:-1])
def test_permutation_importance_mixed_types_pandas():
pd = pytest.importorskip("pandas")
rng = np.random.RandomState(42)
n_repeats = 5
# Last column is correlated with y
X = pd.DataFrame({'col1': [1.0, 2.0, 3.0, np.nan],
'col2': ['a', 'b', 'a', 'b']})
y = np.array([0, 1, 0, 1])
num_preprocess = make_pipeline(SimpleImputer(), StandardScaler())
preprocess = ColumnTransformer([
('num', num_preprocess, ['col1']),
('cat', OneHotEncoder(), ['col2'])
])
clf = make_pipeline(preprocess, LogisticRegression(solver='lbfgs'))
clf.fit(X, y)
result = permutation_importance(clf, X, y, n_repeats=n_repeats,
random_state=rng)
assert result.importances.shape == (X.shape[1], n_repeats)
    # the feature correlated with y is the last column and should
# have the highest importance
assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])
def test_permutation_importance_linear_regression():
X, y = make_regression(n_samples=500, n_features=10, random_state=0)
X = scale(X)
y = scale(y)
lr = LinearRegression().fit(X, y)
# this relationship can be computed in closed form
expected_importances = 2 * lr.coef_**2
results = permutation_importance(lr, X, y,
n_repeats=50,
scoring='neg_mean_squared_error')
assert_allclose(expected_importances, results.importances_mean,
rtol=1e-1, atol=1e-6)
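# Derivation of the closed form above (added note): with X and y
# standardized and y_hat = sum_j beta_j * x_j, permuting column j replaces
# x_j with an independent copy x'_j having the same distribution, so the
# MSE increases by E[(beta_j * (x_j - x'_j)) ** 2] = 2 * beta_j ** 2 *
# Var(x_j) = 2 * beta_j ** 2 (Var(x_j) = 1 after scaling). Permutation
# importance with scoring='neg_mean_squared_error' reports exactly this
# MSE increase, hence expected_importances = 2 * lr.coef_ ** 2.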
def test_permutation_importance_equivalence_sequential_parallel():
# regression test to make sure that sequential and parallel calls will
# output the same results.
X, y = make_regression(n_samples=500, n_features=10, random_state=0)
lr = LinearRegression().fit(X, y)
importance_sequential = permutation_importance(
lr, X, y, n_repeats=5, random_state=0, n_jobs=1
)
# First check that the problem is structured enough and that the model is
# complex enough to not yield trivial, constant importances:
imp_min = importance_sequential['importances'].min()
imp_max = importance_sequential['importances'].max()
assert imp_max - imp_min > 0.3
    # Then actually check that parallelism does not impact the results,
    # either with shared memory (threading) or with isolated memory
    # via process-based parallelism using the default backend
    # ('loky' or 'multiprocessing') depending on the joblib version:
# process-based parallelism (by default):
importance_processes = permutation_importance(
lr, X, y, n_repeats=5, random_state=0, n_jobs=2)
assert_allclose(
importance_processes['importances'],
importance_sequential['importances']
)
# thread-based parallelism:
with parallel_backend("threading"):
importance_threading = permutation_importance(
lr, X, y, n_repeats=5, random_state=0, n_jobs=2
)
assert_allclose(
importance_threading['importances'],
importance_sequential['importances']
)
@pytest.mark.parametrize("n_jobs", [None, 1, 2])
def test_permutation_importance_equivalence_array_dataframe(n_jobs):
# This test checks that the column shuffling logic has the same behavior
    # on both a dataframe and a plain numpy array.
pd = pytest.importorskip('pandas')
# regression test to make sure that sequential and parallel calls will
# output the same results.
X, y = make_regression(n_samples=100, n_features=5, random_state=0)
X_df = pd.DataFrame(X)
# Add a categorical feature that is statistically linked to y:
binner = KBinsDiscretizer(n_bins=3, encode="ordinal")
cat_column = binner.fit_transform(y.reshape(-1, 1))
# Concatenate the extra column to the numpy array: integers will be
# cast to float values
X = np.hstack([X, cat_column])
assert X.dtype.kind == "f"
# Insert extra column as a non-numpy-native dtype (while keeping backward
# compat for old pandas versions):
if hasattr(pd, "Categorical"):
cat_column = pd.Categorical(cat_column.ravel())
else:
cat_column = cat_column.ravel()
new_col_idx = len(X_df.columns)
X_df[new_col_idx] = cat_column
assert X_df[new_col_idx].dtype == cat_column.dtype
    # Stitch an arbitrary index to the dataframe:
X_df.index = np.arange(len(X_df)).astype(str)
rf = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0)
rf.fit(X, y)
n_repeats = 3
importance_array = permutation_importance(
rf, X, y, n_repeats=n_repeats, random_state=0, n_jobs=n_jobs
)
# First check that the problem is structured enough and that the model is
# complex enough to not yield trivial, constant importances:
imp_min = importance_array['importances'].min()
imp_max = importance_array['importances'].max()
assert imp_max - imp_min > 0.3
    # Now check that importances computed on the dataframe match the values
    # computed on the array with the same data.
importance_dataframe = permutation_importance(
rf, X_df, y, n_repeats=n_repeats, random_state=0, n_jobs=n_jobs
)
assert_allclose(
importance_array['importances'],
importance_dataframe['importances']
)
@pytest.mark.parametrize("input_type", ["array", "dataframe"])
def test_permutation_importance_large_memmaped_data(input_type):
# Smoke, non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/15810
n_samples, n_features = int(5e4), 4
X, y = make_classification(n_samples=n_samples, n_features=n_features,
random_state=0)
    assert X.nbytes > 1e6  # trigger joblib memmapping
X = _convert_container(X, input_type)
clf = DummyClassifier(strategy='prior').fit(X, y)
# Actual smoke test: should not raise any error:
n_repeats = 5
r = permutation_importance(clf, X, y, n_repeats=n_repeats, n_jobs=2)
# Auxiliary check: DummyClassifier is feature independent:
    # permuting features should not change the predictions
expected_importances = np.zeros((n_features, n_repeats))
assert_allclose(expected_importances, r.importances)
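# Usage note (added): permutation_importance returns a Bunch with three
# fields, importances (shape (n_features, n_repeats)), importances_mean and
# importances_std, which these tests access both attribute-style
# (r.importances) and dict-style (r['importances']).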