"""Partial dependence plots for regression and classification models.""" # Authors: Peter Prettenhofer # Trevor Stephens # Nicolas Hug # License: BSD 3 clause from collections.abc import Iterable import numpy as np from scipy import sparse from scipy.stats.mstats import mquantiles from ..base import is_classifier, is_regressor from ..pipeline import Pipeline from ..utils.extmath import cartesian from ..utils import check_array from ..utils import check_matplotlib_support # noqa from ..utils import _safe_indexing from ..utils import _determine_key_type from ..utils import _get_column_indices from ..utils.validation import check_is_fitted from ..utils.validation import _deprecate_positional_args from ..tree import DecisionTreeRegressor from ..ensemble import RandomForestRegressor from ..exceptions import NotFittedError from ..ensemble._gb import BaseGradientBoosting from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import ( BaseHistGradientBoosting) __all__ = [ 'partial_dependence', ] def _grid_from_X(X, percentiles, grid_resolution): """Generate a grid of points based on the percentiles of X. The grid is a cartesian product between the columns of ``values``. The ith column of ``values`` consists in ``grid_resolution`` equally-spaced points between the percentiles of the jth column of X. If ``grid_resolution`` is bigger than the number of unique values in the jth column of X, then those unique values will be used instead. Parameters ---------- X : ndarray, shape (n_samples, n_target_features) The data percentiles : tuple of floats The percentiles which are used to construct the extreme values of the grid. Must be in [0, 1]. grid_resolution : int The number of equally spaced points to be placed on the grid for each feature. Returns ------- grid : ndarray, shape (n_points, n_target_features) A value for each feature at each point in the grid. ``n_points`` is always ``<= grid_resolution ** X.shape[1]``. values : list of 1d ndarrays The values with which the grid has been created. The size of each array ``values[j]`` is either ``grid_resolution``, or the number of unique values in ``X[:, j]``, whichever is smaller. """ if not isinstance(percentiles, Iterable) or len(percentiles) != 2: raise ValueError("'percentiles' must be a sequence of 2 elements.") if not all(0 <= x <= 1 for x in percentiles): raise ValueError("'percentiles' values must be in [0, 1].") if percentiles[0] >= percentiles[1]: raise ValueError('percentiles[0] must be strictly less ' 'than percentiles[1].') if grid_resolution <= 1: raise ValueError("'grid_resolution' must be strictly greater than 1.") values = [] for feature in range(X.shape[1]): uniques = np.unique(_safe_indexing(X, feature, axis=1)) if uniques.shape[0] < grid_resolution: # feature has low resolution use unique vals axis = uniques else: # create axis based on percentiles and grid resolution emp_percentiles = mquantiles( _safe_indexing(X, feature, axis=1), prob=percentiles, axis=0 ) if np.allclose(emp_percentiles[0], emp_percentiles[1]): raise ValueError( 'percentiles are too close to each other, ' 'unable to build the grid. Please choose percentiles ' 'that are further apart.') axis = np.linspace(emp_percentiles[0], emp_percentiles[1], num=grid_resolution, endpoint=True) values.append(axis) return cartesian(values), values def _partial_dependence_recursion(est, grid, features): averaged_predictions = est._compute_partial_dependence_recursion(grid, features) if averaged_predictions.ndim == 1: # reshape to (1, n_points) for consistency with # _partial_dependence_brute averaged_predictions = averaged_predictions.reshape(1, -1) return averaged_predictions def _partial_dependence_brute(est, grid, features, X, response_method): averaged_predictions = [] # define the prediction_method (predict, predict_proba, decision_function). if is_regressor(est): prediction_method = est.predict else: predict_proba = getattr(est, 'predict_proba', None) decision_function = getattr(est, 'decision_function', None) if response_method == 'auto': # try predict_proba, then decision_function if it doesn't exist prediction_method = predict_proba or decision_function else: prediction_method = (predict_proba if response_method == 'predict_proba' else decision_function) if prediction_method is None: if response_method == 'auto': raise ValueError( 'The estimator has no predict_proba and no ' 'decision_function method.' ) elif response_method == 'predict_proba': raise ValueError('The estimator has no predict_proba method.') else: raise ValueError( 'The estimator has no decision_function method.') for new_values in grid: X_eval = X.copy() for i, variable in enumerate(features): if hasattr(X_eval, 'iloc'): X_eval.iloc[:, variable] = new_values[i] else: X_eval[:, variable] = new_values[i] try: predictions = prediction_method(X_eval) except NotFittedError: raise ValueError( "'estimator' parameter must be a fitted estimator") # Note: predictions is of shape # (n_points,) for non-multioutput regressors # (n_points, n_tasks) for multioutput regressors # (n_points, 1) for the regressors in cross_decomposition (I think) # (n_points, 2) for binary classification # (n_points, n_classes) for multiclass classification # average over samples averaged_predictions.append(np.mean(predictions, axis=0)) # reshape to (n_targets, n_points) where n_targets is: # - 1 for non-multioutput regression and binary classification (shape is # already correct in those cases) # - n_tasks for multi-output regression # - n_classes for multiclass classification. averaged_predictions = np.array(averaged_predictions).T if is_regressor(est) and averaged_predictions.ndim == 1: # non-multioutput regression, shape is (n_points,) averaged_predictions = averaged_predictions.reshape(1, -1) elif is_classifier(est) and averaged_predictions.shape[0] == 2: # Binary classification, shape is (2, n_points). # we output the effect of **positive** class averaged_predictions = averaged_predictions[1] averaged_predictions = averaged_predictions.reshape(1, -1) return averaged_predictions @_deprecate_positional_args def partial_dependence(estimator, X, features, *, response_method='auto', percentiles=(0.05, 0.95), grid_resolution=100, method='auto'): """Partial dependence of ``features``. Partial dependence of a feature (or a set of features) corresponds to the average response of an estimator for each possible value of the feature. Read more in the :ref:`User Guide `. .. warning:: For :class:`~sklearn.ensemble.GradientBoostingClassifier` and :class:`~sklearn.ensemble.GradientBoostingRegressor`, the 'recursion' method (used by default) will not account for the `init` predictor of the boosting process. In practice, this will produce the same values as 'brute' up to a constant offset in the target response, provided that `init` is a constant estimator (which is the default). However, if `init` is not a constant estimator, the partial dependence values are incorrect for 'recursion' because the offset will be sample-dependent. It is preferable to use the 'brute' method. Note that this only applies to :class:`~sklearn.ensemble.GradientBoostingClassifier` and :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and :class:`~sklearn.ensemble.HistGradientBoostingRegressor`. Parameters ---------- estimator : BaseEstimator A fitted estimator object implementing :term:`predict`, :term:`predict_proba`, or :term:`decision_function`. Multioutput-multiclass classifiers are not supported. X : {array-like or dataframe} of shape (n_samples, n_features) ``X`` is used to generate a grid of values for the target ``features`` (where the partial dependence will be evaluated), and also to generate values for the complement features when the `method` is 'brute'. features : array-like of {int, str} The feature (e.g. `[0]`) or pair of interacting features (e.g. `[(0, 1)]`) for which the partial dependency should be computed. response_method : 'auto', 'predict_proba' or 'decision_function', \ optional (default='auto') Specifies whether to use :term:`predict_proba` or :term:`decision_function` as the target response. For regressors this parameter is ignored and the response is always the output of :term:`predict`. By default, :term:`predict_proba` is tried first and we revert to :term:`decision_function` if it doesn't exist. If ``method`` is 'recursion', the response is always the output of :term:`decision_function`. percentiles : tuple of float, optional (default=(0.05, 0.95)) The lower and upper percentile used to create the extreme values for the grid. Must be in [0, 1]. grid_resolution : int, optional (default=100) The number of equally spaced points on the grid, for each target feature. method : str, optional (default='auto') The method used to calculate the averaged predictions: - 'recursion' is only supported for some tree-based estimators (namely :class:`~sklearn.ensemble.GradientBoostingClassifier`, :class:`~sklearn.ensemble.GradientBoostingRegressor`, :class:`~sklearn.ensemble.HistGradientBoostingClassifier`, :class:`~sklearn.ensemble.HistGradientBoostingRegressor`, :class:`~sklearn.tree.DecisionTreeRegressor`, :class:`~sklearn.ensemble.RandomForestRegressor`, ) but is more efficient in terms of speed. With this method, the target response of a classifier is always the decision function, not the predicted probabilities. - 'brute' is supported for any estimator, but is more computationally intensive. - 'auto': the 'recursion' is used for estimators that support it, and 'brute' is used otherwise. Please see :ref:`this note ` for differences between the 'brute' and 'recursion' method. Returns ------- averaged_predictions : ndarray, \ shape (n_outputs, len(values[0]), len(values[1]), ...) The predictions for all the points in the grid, averaged over all samples in X (or over the training data if ``method`` is 'recursion'). ``n_outputs`` corresponds to the number of classes in a multi-class setting, or to the number of tasks for multi-output regression. For classical regression and binary classification ``n_outputs==1``. ``n_values_feature_j`` corresponds to the size ``values[j]``. values : seq of 1d ndarrays The values with which the grid has been created. The generated grid is a cartesian product of the arrays in ``values``. ``len(values) == len(features)``. The size of each array ``values[j]`` is either ``grid_resolution``, or the number of unique values in ``X[:, j]``, whichever is smaller. Examples -------- >>> X = [[0, 0, 2], [1, 0, 0]] >>> y = [0, 1] >>> from sklearn.ensemble import GradientBoostingClassifier >>> gb = GradientBoostingClassifier(random_state=0).fit(X, y) >>> partial_dependence(gb, features=[0], X=X, percentiles=(0, 1), ... grid_resolution=2) # doctest: +SKIP (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) See also -------- sklearn.inspection.plot_partial_dependence: Plot partial dependence """ if not (is_classifier(estimator) or is_regressor(estimator)): raise ValueError( "'estimator' must be a fitted regressor or classifier." ) if isinstance(estimator, Pipeline): # TODO: to be removed if/when pipeline get a `steps_` attributes # assuming Pipeline is the only estimator that does not store a new # attribute for est in estimator: # FIXME: remove the None option when it will be deprecated if est not in (None, 'drop'): check_is_fitted(est) else: check_is_fitted(estimator) if (is_classifier(estimator) and isinstance(estimator.classes_[0], np.ndarray)): raise ValueError( 'Multiclass-multioutput estimators are not supported' ) # Use check_array only on lists and other non-array-likes / sparse. Do not # convert DataFrame into a NumPy array. if not(hasattr(X, '__array__') or sparse.issparse(X)): X = check_array(X, force_all_finite='allow-nan', dtype=np.object) accepted_responses = ('auto', 'predict_proba', 'decision_function') if response_method not in accepted_responses: raise ValueError( 'response_method {} is invalid. Accepted response_method names ' 'are {}.'.format(response_method, ', '.join(accepted_responses))) if is_regressor(estimator) and response_method != 'auto': raise ValueError( "The response_method parameter is ignored for regressors and " "must be 'auto'." ) accepted_methods = ('brute', 'recursion', 'auto') if method not in accepted_methods: raise ValueError( 'method {} is invalid. Accepted method names are {}.'.format( method, ', '.join(accepted_methods))) if method == 'auto': if (isinstance(estimator, BaseGradientBoosting) and estimator.init is None): method = 'recursion' elif isinstance(estimator, (BaseHistGradientBoosting, DecisionTreeRegressor, RandomForestRegressor)): method = 'recursion' else: method = 'brute' if method == 'recursion': if not isinstance(estimator, (BaseGradientBoosting, BaseHistGradientBoosting, DecisionTreeRegressor, RandomForestRegressor)): supported_classes_recursion = ( 'GradientBoostingClassifier', 'GradientBoostingRegressor', 'HistGradientBoostingClassifier', 'HistGradientBoostingRegressor', 'HistGradientBoostingRegressor', 'DecisionTreeRegressor', 'RandomForestRegressor', ) raise ValueError( "Only the following estimators support the 'recursion' " "method: {}. Try using method='brute'." .format(', '.join(supported_classes_recursion))) if response_method == 'auto': response_method = 'decision_function' if response_method != 'decision_function': raise ValueError( "With the 'recursion' method, the response_method must be " "'decision_function'. Got {}.".format(response_method) ) if _determine_key_type(features, accept_slice=False) == 'int': # _get_column_indices() supports negative indexing. Here, we limit # the indexing to be positive. The upper bound will be checked # by _get_column_indices() if np.any(np.less(features, 0)): raise ValueError( 'all features must be in [0, {}]'.format(X.shape[1] - 1) ) features_indices = np.asarray( _get_column_indices(X, features), dtype=np.int32, order='C' ).ravel() grid, values = _grid_from_X( _safe_indexing(X, features_indices, axis=1), percentiles, grid_resolution ) if method == 'brute': averaged_predictions = _partial_dependence_brute( estimator, grid, features_indices, X, response_method ) else: averaged_predictions = _partial_dependence_recursion( estimator, grid, features_indices ) # reshape averaged_predictions to # (n_outputs, n_values_feature_0, n_values_feature_1, ...) averaged_predictions = averaged_predictions.reshape( -1, *[val.shape[0] for val in values]) return averaged_predictions, values