Uploaded Test files
parent f584ad9d97, commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
venv/Lib/site-packages/sklearn/inspection/_partial_dependence.py (new file, 421 lines)
@@ -0,0 +1,421 @@
"""Partial dependence plots for regression and classification models."""
|
||||
|
||||
# Authors: Peter Prettenhofer
|
||||
# Trevor Stephens
|
||||
# Nicolas Hug
|
||||
# License: BSD 3 clause
|
||||
|
||||
from collections.abc import Iterable
|
||||
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
from scipy.stats.mstats import mquantiles
|
||||
|
||||
from ..base import is_classifier, is_regressor
|
||||
from ..pipeline import Pipeline
|
||||
from ..utils.extmath import cartesian
|
||||
from ..utils import check_array
|
||||
from ..utils import check_matplotlib_support # noqa
|
||||
from ..utils import _safe_indexing
|
||||
from ..utils import _determine_key_type
|
||||
from ..utils import _get_column_indices
|
||||
from ..utils.validation import check_is_fitted
|
||||
from ..utils.validation import _deprecate_positional_args
|
||||
from ..tree import DecisionTreeRegressor
|
||||
from ..ensemble import RandomForestRegressor
|
||||
from ..exceptions import NotFittedError
|
||||
from ..ensemble._gb import BaseGradientBoosting
|
||||
from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import (
|
||||
BaseHistGradientBoosting)
|
||||
|
||||
|
||||
__all__ = [
|
||||
'partial_dependence',
|
||||
]
|
||||
|
||||
|
||||
def _grid_from_X(X, percentiles, grid_resolution):
|
||||
"""Generate a grid of points based on the percentiles of X.
|
||||
|
||||
The grid is a cartesian product between the columns of ``values``. The
|
||||
ith column of ``values`` consists in ``grid_resolution`` equally-spaced
|
||||
points between the percentiles of the jth column of X.
|
||||
If ``grid_resolution`` is bigger than the number of unique values in the
|
||||
jth column of X, then those unique values will be used instead.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : ndarray, shape (n_samples, n_target_features)
|
||||
The data
|
||||
|
||||
percentiles : tuple of floats
|
||||
The percentiles which are used to construct the extreme values of
|
||||
the grid. Must be in [0, 1].
|
||||
|
||||
grid_resolution : int
|
||||
The number of equally spaced points to be placed on the grid for each
|
||||
feature.
|
||||
|
||||
Returns
|
||||
-------
|
||||
grid : ndarray, shape (n_points, n_target_features)
|
||||
A value for each feature at each point in the grid. ``n_points`` is
|
||||
always ``<= grid_resolution ** X.shape[1]``.
|
||||
|
||||
values : list of 1d ndarrays
|
||||
The values with which the grid has been created. The size of each
|
||||
array ``values[j]`` is either ``grid_resolution``, or the number of
|
||||
unique values in ``X[:, j]``, whichever is smaller.
|
||||
"""
|
||||
if not isinstance(percentiles, Iterable) or len(percentiles) != 2:
|
||||
raise ValueError("'percentiles' must be a sequence of 2 elements.")
|
||||
if not all(0 <= x <= 1 for x in percentiles):
|
||||
raise ValueError("'percentiles' values must be in [0, 1].")
|
||||
if percentiles[0] >= percentiles[1]:
|
||||
raise ValueError('percentiles[0] must be strictly less '
|
||||
'than percentiles[1].')
|
||||
|
||||
if grid_resolution <= 1:
|
||||
raise ValueError("'grid_resolution' must be strictly greater than 1.")
|
||||
|
||||
values = []
|
||||
for feature in range(X.shape[1]):
|
||||
uniques = np.unique(_safe_indexing(X, feature, axis=1))
|
||||
if uniques.shape[0] < grid_resolution:
|
||||
# feature has low resolution use unique vals
|
||||
axis = uniques
|
||||
else:
|
||||
# create axis based on percentiles and grid resolution
|
||||
emp_percentiles = mquantiles(
|
||||
_safe_indexing(X, feature, axis=1), prob=percentiles, axis=0
|
||||
)
|
||||
if np.allclose(emp_percentiles[0], emp_percentiles[1]):
|
||||
raise ValueError(
|
||||
'percentiles are too close to each other, '
|
||||
'unable to build the grid. Please choose percentiles '
|
||||
'that are further apart.')
|
||||
axis = np.linspace(emp_percentiles[0],
|
||||
emp_percentiles[1],
|
||||
num=grid_resolution, endpoint=True)
|
||||
values.append(axis)
|
||||
|
||||
return cartesian(values), values
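
# A minimal sketch of what ``_grid_from_X`` returns; the tiny ``X`` below is
# made-up, illustrative data, not part of the module:
#
#   >>> import numpy as np
#   >>> X = np.array([[0., 10.], [1., 20.], [2., 30.]])
#   >>> grid, values = _grid_from_X(X, percentiles=(0, 1), grid_resolution=2)
#   >>> values    # 3 uniques per column >= grid_resolution: percentile axes
#   [array([0., 2.]), array([10., 30.])]
#   >>> grid      # cartesian product of the per-feature axes
#   array([[ 0., 10.],
#          [ 0., 30.],
#          [ 2., 10.],
#          [ 2., 30.]])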


def _partial_dependence_recursion(est, grid, features):
    averaged_predictions = est._compute_partial_dependence_recursion(grid,
                                                                     features)
    if averaged_predictions.ndim == 1:
        # reshape to (1, n_points) for consistency with
        # _partial_dependence_brute
        averaged_predictions = averaged_predictions.reshape(1, -1)

    return averaged_predictions
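
# Hedged sketch: the 'recursion' path delegates to the estimator's private
# ``_compute_partial_dependence_recursion(grid, target_features)`` hook,
# which walks the fitted trees and weights leaf values by the fraction of
# training samples they cover, so no call to ``predict`` is needed.
# Illustrative only, on made-up toy data (the int32 indices match how this
# module builds ``features_indices`` below):
#
#   >>> import numpy as np
#   >>> from sklearn.ensemble import GradientBoostingRegressor
#   >>> est = GradientBoostingRegressor(random_state=0).fit(
#   ...     [[0.], [1.], [2.]], [0., 1., 2.])
#   >>> grid = np.asarray([[0.], [2.]])
#   >>> _partial_dependence_recursion(
#   ...     est, grid, np.asarray([0], dtype=np.int32)).shape
#   (1, 2)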


def _partial_dependence_brute(est, grid, features, X, response_method):
    averaged_predictions = []

    # define the prediction_method (predict, predict_proba,
    # decision_function).
    if is_regressor(est):
        prediction_method = est.predict
    else:
        predict_proba = getattr(est, 'predict_proba', None)
        decision_function = getattr(est, 'decision_function', None)
        if response_method == 'auto':
            # try predict_proba, then decision_function if it doesn't exist
            prediction_method = predict_proba or decision_function
        else:
            prediction_method = (predict_proba if response_method ==
                                 'predict_proba' else decision_function)
        if prediction_method is None:
            if response_method == 'auto':
                raise ValueError(
                    'The estimator has no predict_proba and no '
                    'decision_function method.'
                )
            elif response_method == 'predict_proba':
                raise ValueError('The estimator has no predict_proba method.')
            else:
                raise ValueError(
                    'The estimator has no decision_function method.')

    for new_values in grid:
        X_eval = X.copy()
        for i, variable in enumerate(features):
            if hasattr(X_eval, 'iloc'):
                X_eval.iloc[:, variable] = new_values[i]
            else:
                X_eval[:, variable] = new_values[i]

        try:
            predictions = prediction_method(X_eval)
        except NotFittedError:
            raise ValueError(
                "'estimator' parameter must be a fitted estimator")

        # Note: predictions is of shape
        # (n_points,) for non-multioutput regressors
        # (n_points, n_tasks) for multioutput regressors
        # (n_points, 1) for the regressors in cross_decomposition
        # (n_points, 2) for binary classification
        # (n_points, n_classes) for multiclass classification

        # average over samples
        averaged_predictions.append(np.mean(predictions, axis=0))

    # reshape to (n_targets, n_points) where n_targets is:
    # - 1 for non-multioutput regression and binary classification (shape
    #   is already correct in those cases)
    # - n_tasks for multi-output regression
    # - n_classes for multiclass classification.
    averaged_predictions = np.array(averaged_predictions).T
    if is_regressor(est) and averaged_predictions.ndim == 1:
        # non-multioutput regression, shape is (n_points,)
        averaged_predictions = averaged_predictions.reshape(1, -1)
    elif is_classifier(est) and averaged_predictions.shape[0] == 2:
        # Binary classification, shape is (2, n_points).
        # We output the effect of the **positive** class.
        averaged_predictions = averaged_predictions[1]
        averaged_predictions = averaged_predictions.reshape(1, -1)

    return averaged_predictions
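
# Hedged equivalence sketch: for one grid point, the brute-force loop above
# just clamps the target column(s) and averages the predictions. Illustrative
# only, with a made-up fitted regressor:
#
#   >>> import numpy as np
#   >>> from sklearn.linear_model import LinearRegression
#   >>> X = np.array([[0., 0.], [1., 2.], [2., 1.]])
#   >>> est = LinearRegression().fit(X, X[:, 0] + X[:, 1])
#   >>> X_eval = X.copy()
#   >>> X_eval[:, 0] = 1.0    # clamp feature 0 at one grid value
#   >>> np.allclose(np.mean(est.predict(X_eval)), 2.0)   # 1.0 + mean(X[:,1])
#   True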


@_deprecate_positional_args
def partial_dependence(estimator, X, features, *, response_method='auto',
                       percentiles=(0.05, 0.95), grid_resolution=100,
                       method='auto'):
    """Partial dependence of ``features``.

    Partial dependence of a feature (or a set of features) corresponds to
    the average response of an estimator for each possible value of the
    feature.

    Read more in the :ref:`User Guide <partial_dependence>`.

    .. warning::

        For :class:`~sklearn.ensemble.GradientBoostingClassifier` and
        :class:`~sklearn.ensemble.GradientBoostingRegressor`, the
        'recursion' method (used by default) will not account for the `init`
        predictor of the boosting process. In practice, this will produce
        the same values as 'brute' up to a constant offset in the target
        response, provided that `init` is a constant estimator (which is the
        default). However, if `init` is not a constant estimator, the
        partial dependence values are incorrect for 'recursion' because the
        offset will be sample-dependent. It is preferable to use the 'brute'
        method. Note that this only applies to
        :class:`~sklearn.ensemble.GradientBoostingClassifier` and
        :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to
        :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and
        :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.

    Parameters
    ----------
    estimator : BaseEstimator
        A fitted estimator object implementing :term:`predict`,
        :term:`predict_proba`, or :term:`decision_function`.
        Multioutput-multiclass classifiers are not supported.

    X : {array-like or dataframe} of shape (n_samples, n_features)
        ``X`` is used to generate a grid of values for the target
        ``features`` (where the partial dependence will be evaluated), and
        also to generate values for the complement features when the
        `method` is 'brute'.

    features : array-like of {int, str}
        The feature (e.g. `[0]`) or pair of interacting features
        (e.g. `[(0, 1)]`) for which the partial dependence should be
        computed.

    response_method : 'auto', 'predict_proba' or 'decision_function', \
            optional (default='auto')
        Specifies whether to use :term:`predict_proba` or
        :term:`decision_function` as the target response. For regressors
        this parameter is ignored and the response is always the output of
        :term:`predict`. By default, :term:`predict_proba` is tried first
        and we revert to :term:`decision_function` if it doesn't exist. If
        ``method`` is 'recursion', the response is always the output of
        :term:`decision_function`.

    percentiles : tuple of float, optional (default=(0.05, 0.95))
        The lower and upper percentile used to create the extreme values
        for the grid. Must be in [0, 1].

    grid_resolution : int, optional (default=100)
        The number of equally spaced points on the grid, for each target
        feature.

    method : str, optional (default='auto')
        The method used to calculate the averaged predictions:

        - 'recursion' is only supported for some tree-based estimators
          (namely :class:`~sklearn.ensemble.GradientBoostingClassifier`,
          :class:`~sklearn.ensemble.GradientBoostingRegressor`,
          :class:`~sklearn.ensemble.HistGradientBoostingClassifier`,
          :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,
          :class:`~sklearn.tree.DecisionTreeRegressor`,
          :class:`~sklearn.ensemble.RandomForestRegressor`)
          but is more efficient in terms of speed.
          With this method, the target response of a classifier is always
          the decision function, not the predicted probabilities.

        - 'brute' is supported for any estimator, but is more
          computationally intensive.

        - 'auto': 'recursion' is used for estimators that support it,
          and 'brute' is used otherwise.

        Please see :ref:`this note <pdp_method_differences>` for
        differences between the 'brute' and 'recursion' methods.

    Returns
    -------
    averaged_predictions : ndarray, \
            shape (n_outputs, len(values[0]), len(values[1]), ...)
        The predictions for all the points in the grid, averaged over all
        samples in X (or over the training data if ``method`` is
        'recursion'). ``n_outputs`` corresponds to the number of classes in
        a multi-class setting, or to the number of tasks for multi-output
        regression. For classical regression and binary classification
        ``n_outputs==1``. ``n_values_feature_j`` corresponds to the size of
        ``values[j]``.

    values : seq of 1d ndarrays
        The values with which the grid has been created. The generated grid
        is a cartesian product of the arrays in ``values``. ``len(values) ==
        len(features)``. The size of each array ``values[j]`` is either
        ``grid_resolution``, or the number of unique values in ``X[:, j]``,
        whichever is smaller.

    Examples
    --------
    >>> X = [[0, 0, 2], [1, 0, 0]]
    >>> y = [0, 1]
    >>> from sklearn.ensemble import GradientBoostingClassifier
    >>> gb = GradientBoostingClassifier(random_state=0).fit(X, y)
    >>> partial_dependence(gb, features=[0], X=X, percentiles=(0, 1),
    ...                    grid_resolution=2)  # doctest: +SKIP
    (array([[-4.52..., 4.52...]]), [array([ 0., 1.])])

    See also
    --------
    sklearn.inspection.plot_partial_dependence: Plot partial dependence
    """
    if not (is_classifier(estimator) or is_regressor(estimator)):
        raise ValueError(
            "'estimator' must be a fitted regressor or classifier."
        )

    if isinstance(estimator, Pipeline):
        # TODO: to be removed if/when pipeline gets a `steps_` attribute
        # assuming Pipeline is the only estimator that does not store a new
        # attribute
        for est in estimator:
            # FIXME: remove the None option when it is deprecated
            if est not in (None, 'drop'):
                check_is_fitted(est)
    else:
        check_is_fitted(estimator)

    if (is_classifier(estimator) and
            isinstance(estimator.classes_[0], np.ndarray)):
        raise ValueError(
            'Multiclass-multioutput estimators are not supported'
        )

    # Use check_array only on lists and other non-array-likes / sparse. Do
    # not convert DataFrame into a NumPy array.
    if not (hasattr(X, '__array__') or sparse.issparse(X)):
        X = check_array(X, force_all_finite='allow-nan', dtype=object)

    accepted_responses = ('auto', 'predict_proba', 'decision_function')
    if response_method not in accepted_responses:
        raise ValueError(
            'response_method {} is invalid. Accepted response_method names '
            'are {}.'.format(response_method, ', '.join(accepted_responses)))

    if is_regressor(estimator) and response_method != 'auto':
        raise ValueError(
            "The response_method parameter is ignored for regressors and "
            "must be 'auto'."
        )

    accepted_methods = ('brute', 'recursion', 'auto')
    if method not in accepted_methods:
        raise ValueError(
            'method {} is invalid. Accepted method names are {}.'.format(
                method, ', '.join(accepted_methods)))

    if method == 'auto':
        if (isinstance(estimator, BaseGradientBoosting) and
                estimator.init is None):
            method = 'recursion'
        elif isinstance(estimator, (BaseHistGradientBoosting,
                                    DecisionTreeRegressor,
                                    RandomForestRegressor)):
            method = 'recursion'
        else:
            method = 'brute'

    if method == 'recursion':
        if not isinstance(estimator,
                          (BaseGradientBoosting, BaseHistGradientBoosting,
                           DecisionTreeRegressor, RandomForestRegressor)):
            supported_classes_recursion = (
                'GradientBoostingClassifier',
                'GradientBoostingRegressor',
                'HistGradientBoostingClassifier',
                'HistGradientBoostingRegressor',
                'DecisionTreeRegressor',
                'RandomForestRegressor',
            )
            raise ValueError(
                "Only the following estimators support the 'recursion' "
                "method: {}. Try using method='brute'."
                .format(', '.join(supported_classes_recursion)))
        if response_method == 'auto':
            response_method = 'decision_function'

        if response_method != 'decision_function':
            raise ValueError(
                "With the 'recursion' method, the response_method must be "
                "'decision_function'. Got {}.".format(response_method)
            )

    if _determine_key_type(features, accept_slice=False) == 'int':
        # _get_column_indices() supports negative indexing. Here, we limit
        # the indexing to be positive. The upper bound will be checked
        # by _get_column_indices()
        if np.any(np.less(features, 0)):
            raise ValueError(
                'all features must be in [0, {}]'.format(X.shape[1] - 1)
            )

    features_indices = np.asarray(
        _get_column_indices(X, features), dtype=np.int32, order='C'
    ).ravel()

    grid, values = _grid_from_X(
        _safe_indexing(X, features_indices, axis=1), percentiles,
        grid_resolution
    )

    if method == 'brute':
        averaged_predictions = _partial_dependence_brute(
            estimator, grid, features_indices, X, response_method
        )
    else:
        averaged_predictions = _partial_dependence_recursion(
            estimator, grid, features_indices
        )

    # reshape averaged_predictions to
    # (n_outputs, n_values_feature_0, n_values_feature_1, ...)
    averaged_predictions = averaged_predictions.reshape(
        -1, *[val.shape[0] for val in values])

    return averaged_predictions, values
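
# A minimal end-to-end usage sketch (mirrors the docstring example above,
# but with a regressor and method='brute' spelled out; only deterministic
# outputs are shown):
#
#   >>> from sklearn.ensemble import GradientBoostingRegressor
#   >>> X = [[0, 0, 2], [1, 0, 0], [2, 1, 1]]
#   >>> y = [0.0, 1.0, 2.0]
#   >>> est = GradientBoostingRegressor(random_state=0).fit(X, y)
#   >>> avg_preds, values = partial_dependence(
#   ...     est, X, features=[0], percentiles=(0, 1),
#   ...     grid_resolution=2, method='brute')
#   >>> avg_preds.shape    # (n_outputs, n_values_feature_0)
#   (1, 2)
#   >>> [v.tolist() for v in values]
#   [[0.0, 2.0]]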