1531 lines
62 KiB
Python
1531 lines
62 KiB
Python
"""
|
|
The :mod:`sklearn.model_selection._search` includes utilities to fine-tune the
|
|
parameters of an estimator.
|
|
"""
|
|
|
|
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>,
|
|
# Gael Varoquaux <gael.varoquaux@normalesup.org>
|
|
# Andreas Mueller <amueller@ais.uni-bonn.de>
|
|
# Olivier Grisel <olivier.grisel@ensta.org>
|
|
# Raghav RV <rvraghav93@gmail.com>
|
|
# License: BSD 3 clause
|
|
|
|
from abc import ABCMeta, abstractmethod
|
|
from collections import defaultdict
|
|
from collections.abc import Mapping, Sequence, Iterable
|
|
from functools import partial, reduce
|
|
from itertools import product
|
|
import numbers
|
|
import operator
|
|
import time
|
|
import warnings
|
|
|
|
import numpy as np
|
|
from numpy.ma import MaskedArray
|
|
from scipy.stats import rankdata
|
|
|
|
from ..base import BaseEstimator, is_classifier, clone
|
|
from ..base import MetaEstimatorMixin
|
|
from ._split import check_cv
|
|
from ._validation import _fit_and_score
|
|
from ._validation import _aggregate_score_dicts
|
|
from ..exceptions import NotFittedError
|
|
from joblib import Parallel, delayed
|
|
from ..utils import check_random_state
|
|
from ..utils.random import sample_without_replacement
|
|
from ..utils.validation import indexable, check_is_fitted, _check_fit_params
|
|
from ..utils.validation import _deprecate_positional_args
|
|
from ..utils.metaestimators import if_delegate_has_method
|
|
from ..metrics._scorer import _check_multimetric_scoring
|
|
from ..metrics import check_scoring
|
|
from ..utils import deprecated
|
|
|
|
__all__ = ['GridSearchCV', 'ParameterGrid', 'fit_grid_point',
|
|
'ParameterSampler', 'RandomizedSearchCV']
|
|
|
|
|
|
class ParameterGrid:
    """Grid of parameters with a discrete number of values for each.

    Can be used to iterate over parameter value combinations with the
    Python built-in function iter.

    Read more in the :ref:`User Guide <grid_search>`.

    Parameters
    ----------
    param_grid : dict of str to sequence, or sequence of such
        The parameter grid to explore, as a dictionary mapping estimator
        parameters to sequences of allowed values.

        An empty dict signifies default parameters.

        A sequence of dicts signifies a sequence of grids to search, and is
        useful to avoid exploring parameter combinations that make no sense
        or have no effect. See the examples below.

    Examples
    --------
    >>> from sklearn.model_selection import ParameterGrid
    >>> param_grid = {'a': [1, 2], 'b': [True, False]}
    >>> list(ParameterGrid(param_grid)) == (
    ...    [{'a': 1, 'b': True}, {'a': 1, 'b': False},
    ...     {'a': 2, 'b': True}, {'a': 2, 'b': False}])
    True

    >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}]
    >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'},
    ...                               {'kernel': 'rbf', 'gamma': 1},
    ...                               {'kernel': 'rbf', 'gamma': 10}]
    True
    >>> ParameterGrid(grid)[1] == {'kernel': 'rbf', 'gamma': 1}
    True

    See also
    --------
    :class:`GridSearchCV`:
        Uses :class:`ParameterGrid` to perform a full parallelized parameter
        search.
    """

    def __init__(self, param_grid):
        """Validate ``param_grid`` and store it as ``self.param_grid``.

        Raises
        ------
        TypeError
            If ``param_grid`` is neither a mapping nor an iterable, if one of
            its entries is not a dict, or if a dict value is not iterable.
        """
        if not isinstance(param_grid, (Mapping, Iterable)):
            raise TypeError('Parameter grid is not a dict or '
                            'a list ({!r})'.format(param_grid))

        if isinstance(param_grid, Mapping):
            # wrap dictionary in a singleton list to support either dict
            # or list of dicts
            param_grid = [param_grid]

        # check if all entries are dictionaries of lists
        for grid in param_grid:
            if not isinstance(grid, dict):
                raise TypeError('Parameter grid is not a '
                                'dict ({!r})'.format(grid))
            for key in grid:
                if not isinstance(grid[key], Iterable):
                    raise TypeError('Parameter grid value is not iterable '
                                    '(key={!r}, value={!r})'
                                    .format(key, grid[key]))

        self.param_grid = param_grid

    def __iter__(self):
        """Iterate over the points in the grid.

        Returns
        -------
        params : iterator over dict of str to any
            Yields dictionaries mapping each estimator parameter to one of its
            allowed values.
        """
        for p in self.param_grid:
            # Always sort the keys of a dictionary, for reproducibility
            items = sorted(p.items())
            if not items:
                # An empty sub-grid contributes exactly one (empty) point.
                yield {}
            else:
                keys, values = zip(*items)
                for v in product(*values):
                    params = dict(zip(keys, v))
                    yield params

    def __len__(self):
        """Number of points on the grid."""
        # Product function that can handle iterables (np.prod can't consume
        # a generator). Note: this local name shadows itertools.product
        # inside this method only.
        product = partial(reduce, operator.mul)
        return sum(product(len(v) for v in p.values()) if p else 1
                   for p in self.param_grid)

    def __getitem__(self, ind):
        """Get the parameters that would be ``ind``th in iteration

        Parameters
        ----------
        ind : int
            The iteration index

        Returns
        -------
        params : dict of str to any
            Equal to list(self)[ind]

        Raises
        ------
        IndexError
            If ``ind`` is past the last point of the last sub-grid.
        """
        # This is used to make discrete sampling without replacement memory
        # efficient.
        for sub_grid in self.param_grid:
            # XXX: could memoize information used here
            if not sub_grid:
                # An empty sub-grid holds a single point: the empty dict.
                if ind == 0:
                    return {}
                else:
                    ind -= 1
                    continue

            # Reverse so most frequent cycling parameter comes first
            keys, values_lists = zip(*sorted(sub_grid.items())[::-1])
            sizes = [len(v_list) for v_list in values_lists]
            # np.prod is the supported spelling; the np.product alias was
            # deprecated and later removed from NumPy.
            total = np.prod(sizes)

            if ind >= total:
                # Try the next grid
                ind -= total
            else:
                out = {}
                # Mixed-radix decomposition of ``ind`` over the value lists.
                for key, v_list, n in zip(keys, values_lists, sizes):
                    ind, offset = divmod(ind, n)
                    out[key] = v_list[offset]
                return out

        raise IndexError('ParameterGrid index out of range')
|
|
|
|
|
|
class ParameterSampler:
    """Generator on parameters sampled from given distributions.

    Non-deterministic iterable over random candidate combinations for hyper-
    parameter search. If all parameters are presented as a list,
    sampling without replacement is performed. If at least one parameter
    is given as a distribution, sampling with replacement is used.
    It is highly recommended to use continuous distributions for continuous
    parameters.

    Read more in the :ref:`User Guide <grid_search>`.

    Parameters
    ----------
    param_distributions : dict
        Dictionary with parameters names (`str`) as keys and distributions
        or lists of parameters to try. Distributions must provide a ``rvs``
        method for sampling (such as those from scipy.stats.distributions).
        If a list is given, it is sampled uniformly.
        If a list of dicts is given, first a dict is sampled uniformly, and
        then a parameter is sampled using that dict as above.

    n_iter : integer
        Number of parameter settings that are produced.

    random_state : int or RandomState instance, default=None
        Pseudo random number generator state used for random uniform sampling
        from lists of possible values instead of scipy.stats distributions.
        Pass an int for reproducible output across multiple
        function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    params : dict of str to any
        **Yields** dictionaries mapping each estimator parameter to
        a sampled value.

    Examples
    --------
    >>> from sklearn.model_selection import ParameterSampler
    >>> from scipy.stats.distributions import expon
    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> param_grid = {'a':[1, 2], 'b': expon()}
    >>> param_list = list(ParameterSampler(param_grid, n_iter=4,
    ...                                    random_state=rng))
    >>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items())
    ...                 for d in param_list]
    >>> rounded_list == [{'b': 0.89856, 'a': 1},
    ...                  {'b': 0.923223, 'a': 1},
    ...                  {'b': 1.878964, 'a': 2},
    ...                  {'b': 1.038159, 'a': 2}]
    True
    """
    @_deprecate_positional_args
    def __init__(self, param_distributions, n_iter, *, random_state=None):
        """Validate ``param_distributions`` and store the constructor arguments.

        Raises
        ------
        TypeError
            If ``param_distributions`` is neither a mapping nor an iterable,
            if one of its entries is not a dict, or if a dict value is
            neither iterable nor an object exposing ``rvs``.
        """
        if not isinstance(param_distributions, (Mapping, Iterable)):
            raise TypeError('Parameter distribution is not a dict or '
                            'a list ({!r})'.format(param_distributions))

        if isinstance(param_distributions, Mapping):
            # wrap dictionary in a singleton list to support either dict
            # or list of dicts
            param_distributions = [param_distributions]

        for dist in param_distributions:
            if not isinstance(dist, dict):
                raise TypeError('Parameter distribution is not a '
                                'dict ({!r})'.format(dist))
            for key in dist:
                if (not isinstance(dist[key], Iterable)
                        and not hasattr(dist[key], 'rvs')):
                    raise TypeError('Parameter value is not iterable '
                                    'or distribution (key={!r}, value={!r})'
                                    .format(key, dist[key]))
        self.n_iter = n_iter
        self.random_state = random_state
        self.param_distributions = param_distributions

    def _is_all_lists(self):
        """Return True when no parameter value exposes an ``rvs`` method."""
        return all(
            all(not hasattr(v, "rvs") for v in dist.values())
            for dist in self.param_distributions)

    def __iter__(self):
        """Yield sampled parameter dicts.

        When every value is a list, points are drawn without replacement
        from the corresponding :class:`ParameterGrid`; otherwise each
        iteration picks one dict at random and samples every key from it.
        """
        rng = check_random_state(self.random_state)

        # if all distributions are given as lists, we want to sample without
        # replacement
        if self._is_all_lists():
            # look up sampled parameter settings in parameter grid
            param_grid = ParameterGrid(self.param_distributions)
            grid_size = len(param_grid)
            n_iter = self.n_iter

            if grid_size < n_iter:
                warnings.warn(
                    'The total space of parameters %d is smaller '
                    'than n_iter=%d. Running %d iterations. For exhaustive '
                    'searches, use GridSearchCV.'
                    % (grid_size, self.n_iter, grid_size), UserWarning)
                # Cannot draw more distinct points than the grid contains.
                n_iter = grid_size
            for i in sample_without_replacement(grid_size, n_iter,
                                                random_state=rng):
                yield param_grid[i]

        else:
            for _ in range(self.n_iter):
                dist = rng.choice(self.param_distributions)
                # Always sort the keys of a dictionary, for reproducibility
                items = sorted(dist.items())
                params = dict()
                for k, v in items:
                    if hasattr(v, "rvs"):
                        params[k] = v.rvs(random_state=rng)
                    else:
                        params[k] = v[rng.randint(len(v))]
                yield params

    def __len__(self):
        """Number of points that will be sampled."""
        if self._is_all_lists():
            # Mirror __iter__: sampling without replacement is capped at the
            # size of the grid, so fewer than n_iter points may be produced.
            grid_size = len(ParameterGrid(self.param_distributions))
            return min(self.n_iter, grid_size)
        return self.n_iter
|
|
|
|
|
|
# FIXME Remove fit_grid_point in 0.25
|
|
@deprecated(
    "fit_grid_point is deprecated in version 0.23 "
    "and will be removed in version 0.25"
)
def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
                   verbose, error_score=np.nan, **fit_params):
    """Fit and score an estimator for a single parameter setting and split.

    Parameters
    ----------
    X : array-like, sparse matrix or list
        Input data.

    y : array-like or None
        Targets for input data.

    estimator : estimator object
        An object of that type is instantiated for each grid point.
        This is assumed to implement the scikit-learn estimator interface.
        Either estimator needs to provide a ``score`` function,
        or ``scoring`` must be passed.

    parameters : dict
        Parameters to be set on estimator for this grid point.

    train : ndarray, dtype int or bool
        Boolean mask or indices for training set.

    test : ndarray, dtype int or bool
        Boolean mask or indices for test set.

    scorer : callable or None
        The scorer callable object / function must have its signature as
        ``scorer(estimator, X, y)``.

        If ``None`` the estimator's score method is used.

    verbose : int
        Verbosity level.

    error_score : 'raise' or numeric, default=np.nan
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    **fit_params : kwargs
        Additional parameter passed to the fit function of the estimator.

    Returns
    -------
    score : float
         Score of this parameter setting on given test split.

    parameters : dict
        The parameters that have been evaluated.

    n_samples_test : int
        Number of test samples in this split.
    """
    # The scorer itself is expected to have been validated by the caller;
    # check_scoring is invoked purely so that a multimetric scorer is
    # rejected, hence its return value is discarded.
    check_scoring(estimator, scorer)

    fit_result = _fit_and_score(
        estimator, X, y, scorer, train, test, verbose, parameters,
        fit_params=fit_params,
        return_n_test_samples=True,
        error_score=error_score,
    )
    scores, n_samples_test = fit_result
    return scores, parameters, n_samples_test
|
|
|
|
|
|
def _check_param_grid(param_grid):
|
|
if hasattr(param_grid, 'items'):
|
|
param_grid = [param_grid]
|
|
|
|
for p in param_grid:
|
|
for name, v in p.items():
|
|
if isinstance(v, np.ndarray) and v.ndim > 1:
|
|
raise ValueError("Parameter array should be one-dimensional.")
|
|
|
|
if (isinstance(v, str) or
|
|
not isinstance(v, (np.ndarray, Sequence))):
|
|
raise ValueError("Parameter grid for parameter ({0}) needs to"
|
|
" be a list or numpy array, but got ({1})."
|
|
" Single values need to be wrapped in a list"
|
|
" with one element.".format(name, type(v)))
|
|
|
|
if len(v) == 0:
|
|
raise ValueError("Parameter values for parameter ({0}) need "
|
|
"to be a non-empty sequence.".format(name))
|
|
|
|
|
|
class BaseSearchCV(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta):
    """Abstract base class for hyper parameter search with cross-validation.

    Sub-classes provide ``_run_search``; ``fit`` drives the cross-validated
    evaluation of the candidates it proposes and stores the results in
    ``cv_results_``.
    """

    @abstractmethod
    @_deprecate_positional_args
    def __init__(self, estimator, *, scoring=None, n_jobs=None,
                 iid='deprecated', refit=True, cv=None, verbose=0,
                 pre_dispatch='2*n_jobs', error_score=np.nan,
                 return_train_score=True):

        self.scoring = scoring
        self.estimator = estimator
        self.n_jobs = n_jobs
        self.iid = iid
        self.refit = refit
        self.cv = cv
        self.verbose = verbose
        self.pre_dispatch = pre_dispatch
        self.error_score = error_score
        self.return_train_score = return_train_score

    @property
    def _estimator_type(self):
        # Mirror the estimator type of the wrapped estimator.
        return self.estimator._estimator_type

    @property
    def _pairwise(self):
        # allows cross-validation to see 'precomputed' metrics
        return getattr(self.estimator, '_pairwise', False)

    def score(self, X, y=None):
        """Returns the score on the given data, if the estimator has been refit.

        This uses the score defined by ``scoring`` where provided, and the
        ``best_estimator_.score`` method otherwise.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like of shape (n_samples, n_output) \
            or (n_samples,), default=None
            Target relative to X for classification or regression;
            None for unsupervised learning.

        Returns
        -------
        score : float
        """
        self._check_is_fitted('score')
        if self.scorer_ is None:
            raise ValueError("No score function explicitly defined, "
                             "and the estimator doesn't provide one %s"
                             % self.best_estimator_)
        # With multiple metrics, ``refit`` names the scorer to use.
        score = self.scorer_[self.refit] if self.multimetric_ else self.scorer_
        return score(self.best_estimator_, X, y)

    def _check_is_fitted(self, method_name):
        """Raise NotFittedError if ``method_name`` cannot be used yet.

        With ``refit`` falsy the error is always raised, because there is no
        refitted estimator to delegate to; otherwise the generic
        ``check_is_fitted`` check is applied.
        """
        if not self.refit:
            raise NotFittedError('This %s instance was initialized '
                                 'with refit=False. %s is '
                                 'available only after refitting on the best '
                                 'parameters. You can refit an estimator '
                                 'manually using the ``best_params_`` '
                                 'attribute'
                                 % (type(self).__name__, method_name))
        else:
            check_is_fitted(self)

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def predict(self, X):
        """Call predict on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``predict``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('predict')
        return self.best_estimator_.predict(X)

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def predict_proba(self, X):
        """Call predict_proba on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``predict_proba``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('predict_proba')
        return self.best_estimator_.predict_proba(X)

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def predict_log_proba(self, X):
        """Call predict_log_proba on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``predict_log_proba``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('predict_log_proba')
        return self.best_estimator_.predict_log_proba(X)

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def decision_function(self, X):
        """Call decision_function on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``decision_function``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('decision_function')
        return self.best_estimator_.decision_function(X)

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def transform(self, X):
        """Call transform on the estimator with the best found parameters.

        Only available if the underlying estimator supports ``transform`` and
        ``refit=True``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('transform')
        return self.best_estimator_.transform(X)

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def inverse_transform(self, Xt):
        """Call inverse_transform on the estimator with the best found params.

        Only available if the underlying estimator implements
        ``inverse_transform`` and ``refit=True``.

        Parameters
        ----------
        Xt : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('inverse_transform')
        return self.best_estimator_.inverse_transform(Xt)

    @property
    def n_features_in_(self):
        # For consistency with other estimators we raise a AttributeError so
        # that hasattr() fails if the search estimator isn't fitted.
        try:
            check_is_fitted(self)
        except NotFittedError as nfe:
            raise AttributeError(
                "{} object has no n_features_in_ attribute."
                .format(self.__class__.__name__)
            ) from nfe

        return self.best_estimator_.n_features_in_

    @property
    def classes_(self):
        # Delegates to the refitted best estimator; requires refit.
        self._check_is_fitted("classes_")
        return self.best_estimator_.classes_

    def _run_search(self, evaluate_candidates):
        """Repeatedly calls `evaluate_candidates` to conduct a search.

        This method, implemented in sub-classes, makes it possible to
        customize the scheduling of evaluations: GridSearchCV and
        RandomizedSearchCV schedule evaluations for their whole parameter
        search space at once but other more sequential approaches are also
        possible: for instance it is possible to iteratively schedule
        evaluations for new regions of the parameter search space based on
        previously collected evaluation results. This makes it possible to
        implement Bayesian optimization or more generally sequential
        model-based optimization by deriving from the BaseSearchCV abstract
        base class.

        Parameters
        ----------
        evaluate_candidates : callable
            This callback accepts a list of candidates, where each candidate is
            a dict of parameter settings. It returns a dict of all results so
            far, formatted like ``cv_results_``.

        Examples
        --------

        ::

            def _run_search(self, evaluate_candidates):
                'Try C=0.1 only if C=1 is better than C=10'
                all_results = evaluate_candidates([{'C': 1}, {'C': 10}])
                score = all_results['mean_test_score']
                if score[0] < score[1]:
                    evaluate_candidates([{'C': 0.1}])
        """
        raise NotImplementedError("_run_search not implemented.")

    @_deprecate_positional_args
    def fit(self, X, y=None, *, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------

        X : array-like of shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like of shape (n_samples, n_output) \
            or (n_samples,), default=None
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set. Only used in conjunction with a "Group" :term:`cv`
            instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).

        **fit_params : dict of str -> object
            Parameters passed to the ``fit`` method of the estimator

        Returns
        -------
        self : object
            The fitted search instance.
        """
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, str) or
                    # This will work for both dict / list (tuple)
                    self.refit not in scorers) and not callable(self.refit):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key or a "
                                 "callable to refit an estimator with the "
                                 "best parameter setting on the whole "
                                 "data and make the best_* attributes "
                                 "available for that metric. If this is "
                                 "not needed, refit should be set to "
                                 "False explicitly. %r was passed."
                                 % self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'

        X, y, groups = indexable(X, y, groups)
        fit_params = _check_fit_params(X, fit_params)

        n_splits = cv.get_n_splits(X, y, groups)

        base_estimator = clone(self.estimator)

        parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                            pre_dispatch=self.pre_dispatch)

        fit_and_score_kwargs = dict(scorer=scorers,
                                    fit_params=fit_params,
                                    return_train_score=self.return_train_score,
                                    return_n_test_samples=True,
                                    return_times=True,
                                    return_parameters=False,
                                    error_score=self.error_score,
                                    verbose=self.verbose)
        results = {}
        with parallel:
            # Accumulated across every call to evaluate_candidates so that
            # each call returns the results of all candidates seen so far.
            all_candidate_params = []
            all_out = []

            def evaluate_candidates(candidate_params):
                candidate_params = list(candidate_params)
                n_candidates = len(candidate_params)

                if self.verbose > 0:
                    print("Fitting {0} folds for each of {1} candidates,"
                          " totalling {2} fits".format(
                              n_splits, n_candidates, n_candidates * n_splits))

                out = parallel(delayed(_fit_and_score)(clone(base_estimator),
                                                       X, y,
                                                       train=train, test=test,
                                                       parameters=parameters,
                                                       **fit_and_score_kwargs)
                               for parameters, (train, test)
                               in product(candidate_params,
                                          cv.split(X, y, groups)))

                if len(out) < 1:
                    raise ValueError('No fits were performed. '
                                     'Was the CV iterator empty? '
                                     'Were there no candidates?')
                elif len(out) != n_candidates * n_splits:
                    raise ValueError('cv.split and cv.get_n_splits returned '
                                     'inconsistent results. Expected {} '
                                     'splits, got {}'
                                     .format(n_splits,
                                             len(out) // n_candidates))

                all_candidate_params.extend(candidate_params)
                all_out.extend(out)

                nonlocal results
                results = self._format_results(
                    all_candidate_params, scorers, n_splits, all_out)
                return results

            self._run_search(evaluate_candidates)

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            # If callable, refit is expected to return the index of the best
            # parameter set.
            if callable(self.refit):
                self.best_index_ = self.refit(results)
                if not isinstance(self.best_index_, numbers.Integral):
                    raise TypeError('best_index_ returned is not an integer')
                if (self.best_index_ < 0 or
                        self.best_index_ >= len(results["params"])):
                    raise IndexError('best_index_ index out of range')
            else:
                self.best_index_ = results["rank_test_%s"
                                           % refit_metric].argmin()
                self.best_score_ = results["mean_test_%s" % refit_metric][
                                           self.best_index_]
            self.best_params_ = results["params"][self.best_index_]

        if self.refit:
            # we clone again after setting params in case some
            # of the params are estimators as well.
            self.best_estimator_ = clone(clone(base_estimator).set_params(
                **self.best_params_))
            refit_start_time = time.time()
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)
            refit_end_time = time.time()
            self.refit_time_ = refit_end_time - refit_start_time

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self

    def _format_results(self, candidate_params, scorers, n_splits, out):
        """Assemble the raw per-fit outputs into a ``cv_results_``-style dict.

        Parameters
        ----------
        candidate_params : list of dict
            The parameter settings evaluated so far.

        scorers : dict
            Scorers keyed by name; only the keys are used here.

        n_splits : int
            Number of cross-validation splits per candidate.

        out : list of tuple
            One tuple per (candidate, split) fit; each carries the test
            scores, test sample count, fit time and score time, preceded by
            the train scores when ``return_train_score`` is set.

        Returns
        -------
        results : dict
            Per-split, mean, std and rank arrays, the masked ``param_*``
            arrays and the ``params`` list.
        """
        n_candidates = len(candidate_params)

        # if one choose to see train score, "out" will contain train score info
        if self.return_train_score:
            (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
             score_time) = zip(*out)
        else:
            (test_score_dicts, test_sample_counts, fit_time,
             score_time) = zip(*out)

        # test_score_dicts and train_score dicts are lists of dictionaries and
        # we make them into dict of lists
        test_scores = _aggregate_score_dicts(test_score_dicts)
        if self.return_train_score:
            train_scores = _aggregate_score_dicts(train_score_dicts)

        results = {}

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            # When iterated first by splits, then by parameters
            # We want `array` to have `n_candidates` rows and `n_splits` cols.
            array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                              n_splits)
            if splits:
                for split_i in range(n_splits):
                    # Uses closure to alter the results
                    results["split%d_%s"
                            % (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(np.average((array -
                                             array_means[:, np.newaxis]) ** 2,
                                            axis=1, weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                # Negate so that the highest mean gets rank 1.
                results["rank_%s" % key_name] = np.asarray(
                    rankdata(-array_means, method='min'), dtype=np.int32)

        _store('fit_time', fit_time)
        _store('score_time', score_time)
        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(partial(MaskedArray,
                                            np.empty(n_candidates,),
                                            mask=True,
                                            dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurrence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)
        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        # NOTE test_sample counts (weights) remain the same for all candidates
        # The builtin ``int`` replaces the ``np.int`` alias, which was
        # deprecated and later removed from NumPy; the dtype is the same.
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=int)

        if self.iid != 'deprecated':
            warnings.warn(
                "The parameter 'iid' is deprecated in 0.22 and will be "
                "removed in 0.24.", FutureWarning
            )
            iid = self.iid
        else:
            iid = False

        for scorer_name in scorers.keys():
            # Computed the (weighted) mean and std for test scores alone
            _store('test_%s' % scorer_name, test_scores[scorer_name],
                   splits=True, rank=True,
                   weights=test_sample_counts if iid else None)
            if self.return_train_score:
                _store('train_%s' % scorer_name, train_scores[scorer_name],
                       splits=True)

        return results
|
|
|
|
|
|
class GridSearchCV(BaseSearchCV):
    """Exhaustive search over specified parameter values for an estimator.

    Important members are fit, predict.

    GridSearchCV implements a "fit" and a "score" method.
    It also implements "predict", "predict_proba", "decision_function",
    "transform" and "inverse_transform" if they are implemented in the
    estimator used.

    The parameters of the estimator used to apply these methods are optimized
    by cross-validated grid-search over a parameter grid.

    Read more in the :ref:`User Guide <grid_search>`.

    Parameters
    ----------
    estimator : estimator object.
        This is assumed to implement the scikit-learn estimator interface.
        Either estimator needs to provide a ``score`` function,
        or ``scoring`` must be passed.

    param_grid : dict or list of dictionaries
        Dictionary with parameters names (`str`) as keys and lists of
        parameter settings to try as values, or a list of such
        dictionaries, in which case the grids spanned by each dictionary
        in the list are explored. This enables searching over any sequence
        of parameter settings.

    scoring : str, callable, list/tuple or dict, default=None
        A single str (see :ref:`scoring_parameter`) or a callable
        (see :ref:`scoring`) to evaluate the predictions on the test set.

        For evaluating multiple metrics, either give a list of (unique) strings
        or a dict with names as keys and callables as values.

        NOTE that when using custom scorers, each scorer should return a single
        value. Metric functions returning a list/array of values can be wrapped
        into multiple scorers that return one value each.

        See :ref:`multimetric_grid_search` for an example.

        If None, the estimator's score method is used.

    n_jobs : int, default=None
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

        .. versionchanged:: v0.20
           `n_jobs` default changed from 1 to None

    pre_dispatch : int, or str, default='2*n_jobs'
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A str, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    iid : bool, default=False
        If True, return the average score across folds, weighted by the number
        of samples in each test set. In this case, the data is assumed to be
        identically distributed across the folds, and the loss minimized is
        the total loss per sample, and not the mean loss across the folds.

        .. deprecated:: 0.22
            Parameter ``iid`` is deprecated in 0.22 and will be removed in 0.24

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross validation,
        - integer, to specify the number of folds in a `(Stratified)KFold`,
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

        .. versionchanged:: 0.22
            ``cv`` default value if None changed from 3-fold to 5-fold.

    refit : bool, str, or callable, default=True
        Refit an estimator using the best found parameters on the whole
        dataset.

        For multiple metric evaluation, this needs to be a `str` denoting the
        scorer that would be used to find the best parameters for refitting
        the estimator at the end.

        Where there are considerations other than maximum score in
        choosing a best estimator, ``refit`` can be set to a function which
        returns the selected ``best_index_`` given ``cv_results_``. In that
        case, the ``best_estimator_`` and ``best_params_`` will be set
        according to the returned ``best_index_`` while the ``best_score_``
        attribute will not be available.

        The refitted estimator is made available at the ``best_estimator_``
        attribute and permits using ``predict`` directly on this
        ``GridSearchCV`` instance.

        Also for multiple metric evaluation, the attributes ``best_index_``,
        ``best_score_`` and ``best_params_`` will only be available if
        ``refit`` is set and all of them will be determined w.r.t this specific
        scorer.

        See ``scoring`` parameter to know more about multiple metric
        evaluation.

        .. versionchanged:: 0.20
            Support for callable added.

    verbose : int, default=0
        Controls the verbosity: the higher, the more messages.

    error_score : 'raise' or numeric, default=np.nan
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    return_train_score : bool, default=False
        If ``False``, the ``cv_results_`` attribute will not include training
        scores.
        Computing training scores is used to get insights on how different
        parameter settings impact the overfitting/underfitting trade-off.
        However computing the scores on the training set can be computationally
        expensive and is not strictly required to select the parameters that
        yield the best generalization performance.

        .. versionadded:: 0.19

        .. versionchanged:: 0.21
            Default value was changed from ``True`` to ``False``


    Examples
    --------
    >>> from sklearn import svm, datasets
    >>> from sklearn.model_selection import GridSearchCV
    >>> iris = datasets.load_iris()
    >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
    >>> svc = svm.SVC()
    >>> clf = GridSearchCV(svc, parameters)
    >>> clf.fit(iris.data, iris.target)
    GridSearchCV(estimator=SVC(),
                 param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')})
    >>> sorted(clf.cv_results_.keys())
    ['mean_fit_time', 'mean_score_time', 'mean_test_score',...
     'param_C', 'param_kernel', 'params',...
     'rank_test_score', 'split0_test_score',...
     'split2_test_score', ...
     'std_fit_time', 'std_score_time', 'std_test_score']

    Attributes
    ----------
    cv_results_ : dict of numpy (masked) ndarrays
        A dict with keys as column headers and values as columns, that can be
        imported into a pandas ``DataFrame``.

        For instance the below given table

        +------------+-----------+------------+-----------------+---+---------+
        |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_t...|
        +============+===========+============+=================+===+=========+
        |  'poly'    |     --    |      2     |       0.80      |...|    2    |
        +------------+-----------+------------+-----------------+---+---------+
        |  'poly'    |     --    |      3     |       0.70      |...|    4    |
        +------------+-----------+------------+-----------------+---+---------+
        |  'rbf'     |     0.1   |     --     |       0.80      |...|    3    |
        +------------+-----------+------------+-----------------+---+---------+
        |  'rbf'     |     0.2   |     --     |       0.93      |...|    1    |
        +------------+-----------+------------+-----------------+---+---------+

        will be represented by a ``cv_results_`` dict of::

            {
            'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
                                         mask = [False False False False]...),
            'param_gamma': masked_array(data = [-- -- 0.1 0.2],
                                        mask = [ True  True False False]...),
            'param_degree': masked_array(data = [2.0 3.0 -- --],
                                         mask = [False False  True  True]...),
            'split0_test_score'  : [0.80, 0.70, 0.80, 0.93],
            'split1_test_score'  : [0.82, 0.50, 0.70, 0.78],
            'mean_test_score'    : [0.81, 0.60, 0.75, 0.85],
            'std_test_score'     : [0.01, 0.10, 0.05, 0.08],
            'rank_test_score'    : [2, 4, 3, 1],
            'split0_train_score' : [0.80, 0.92, 0.70, 0.93],
            'split1_train_score' : [0.82, 0.55, 0.70, 0.87],
            'mean_train_score'   : [0.81, 0.74, 0.70, 0.90],
            'std_train_score'    : [0.01, 0.19, 0.00, 0.03],
            'mean_fit_time'      : [0.73, 0.63, 0.43, 0.49],
            'std_fit_time'       : [0.01, 0.02, 0.01, 0.01],
            'mean_score_time'    : [0.01, 0.06, 0.04, 0.04],
            'std_score_time'     : [0.00, 0.00, 0.00, 0.01],
            'params'             : [{'kernel': 'poly', 'degree': 2}, ...],
            }

        NOTE

        The key ``'params'`` is used to store a list of parameter
        settings dicts for all the parameter candidates.

        The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and
        ``std_score_time`` are all in seconds.

        For multi-metric evaluation, the scores for all the scorers are
        available in the ``cv_results_`` dict at the keys ending with that
        scorer's name (``'_<scorer_name>'``) instead of ``'_score'`` shown
        above. ('split0_test_precision', 'mean_train_precision' etc.)

    best_estimator_ : estimator
        Estimator that was chosen by the search, i.e. estimator
        which gave highest score (or smallest loss if specified)
        on the left out data. Not available if ``refit=False``.

        See ``refit`` parameter for more information on allowed values.

    best_score_ : float
        Mean cross-validated score of the best_estimator.

        For multi-metric evaluation, this is present only if ``refit`` is
        specified.

        This attribute is not available if ``refit`` is a function.

    best_params_ : dict
        Parameter setting that gave the best results on the hold out data.

        For multi-metric evaluation, this is present only if ``refit`` is
        specified.

    best_index_ : int
        The index (of the ``cv_results_`` arrays) which corresponds to the best
        candidate parameter setting.

        The dict at ``search.cv_results_['params'][search.best_index_]`` gives
        the parameter setting for the best model, that gives the highest
        mean score (``search.best_score_``).

        For multi-metric evaluation, this is present only if ``refit`` is
        specified.

    scorer_ : function or a dict
        Scorer function used on the held out data to choose the best
        parameters for the model.

        For multi-metric evaluation, this attribute holds the validated
        ``scoring`` dict which maps the scorer key to the scorer callable.

    n_splits_ : int
        The number of cross-validation splits (folds/iterations).

    refit_time_ : float
        Seconds used for refitting the best model on the whole dataset.

        This is present only if ``refit`` is not False.

        .. versionadded:: 0.20

    Notes
    -----
    The parameters selected are those that maximize the score of the left out
    data, unless an explicit score is passed in which case it is used instead.

    If `n_jobs` was set to a value higher than one, the data is copied for each
    point in the grid (and not `n_jobs` times). This is done for efficiency
    reasons if individual jobs take very little time, but may raise errors if
    the dataset is large and not enough memory is available.  A workaround in
    this case is to set `pre_dispatch`. Then, the memory is copied only
    `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
    n_jobs`.

    See Also
    --------
    :class:`ParameterGrid`:
        generates all the combinations of a hyperparameter grid.

    :func:`sklearn.model_selection.train_test_split`:
        utility function to split the data into a development set usable
        for fitting a GridSearchCV instance and an evaluation set for
        its final evaluation.

    :func:`sklearn.metrics.make_scorer`:
        Make a scorer from a performance metric or loss function.

    """
    _required_parameters = ["estimator", "param_grid"]

    @_deprecate_positional_args
    def __init__(self, estimator, param_grid, *, scoring=None,
                 n_jobs=None, iid='deprecated', refit=True, cv=None,
                 verbose=0, pre_dispatch='2*n_jobs',
                 error_score=np.nan, return_train_score=False):
        # All generic search options are forwarded unchanged to the base
        # class; only ``param_grid`` is specific to this subclass.
        super().__init__(
            estimator=estimator, scoring=scoring,
            n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
            pre_dispatch=pre_dispatch, error_score=error_score,
            return_train_score=return_train_score)
        self.param_grid = param_grid
        # NOTE(review): ``_check_param_grid`` is defined elsewhere in this
        # module; presumably it rejects malformed grids at construction
        # time -- confirm against its definition.
        _check_param_grid(param_grid)

    def _run_search(self, evaluate_candidates):
        """Search all candidates in param_grid.

        Parameters
        ----------
        evaluate_candidates : callable
            Called once with a :class:`ParameterGrid` built from
            ``self.param_grid``, i.e. with every parameter combination
            of the grid.
        """
        evaluate_candidates(ParameterGrid(self.param_grid))
|
|
|
|
|
|
class RandomizedSearchCV(BaseSearchCV):
    """Randomized search on hyper parameters.

    RandomizedSearchCV implements a "fit" and a "score" method.
    It also implements "predict", "predict_proba", "decision_function",
    "transform" and "inverse_transform" if they are implemented in the
    estimator used.

    The parameters of the estimator used to apply these methods are optimized
    by cross-validated search over parameter settings.

    In contrast to GridSearchCV, not all parameter values are tried out, but
    rather a fixed number of parameter settings is sampled from the specified
    distributions. The number of parameter settings that are tried is
    given by n_iter.

    If all parameters are presented as a list,
    sampling without replacement is performed. If at least one parameter
    is given as a distribution, sampling with replacement is used.
    It is highly recommended to use continuous distributions for continuous
    parameters.

    Read more in the :ref:`User Guide <randomized_parameter_search>`.

    .. versionadded:: 0.14

    Parameters
    ----------
    estimator : estimator object.
        An object of that type is instantiated for each grid point.
        This is assumed to implement the scikit-learn estimator interface.
        Either estimator needs to provide a ``score`` function,
        or ``scoring`` must be passed.

    param_distributions : dict or list of dicts
        Dictionary with parameters names (`str`) as keys and distributions
        or lists of parameters to try. Distributions must provide a ``rvs``
        method for sampling (such as those from scipy.stats.distributions).
        If a list is given, it is sampled uniformly.
        If a list of dicts is given, first a dict is sampled uniformly, and
        then a parameter is sampled using that dict as above.

    n_iter : int, default=10
        Number of parameter settings that are sampled. n_iter trades
        off runtime vs quality of the solution.

    scoring : str, callable, list/tuple or dict, default=None
        A single str (see :ref:`scoring_parameter`) or a callable
        (see :ref:`scoring`) to evaluate the predictions on the test set.

        For evaluating multiple metrics, either give a list of (unique) strings
        or a dict with names as keys and callables as values.

        NOTE that when using custom scorers, each scorer should return a single
        value. Metric functions returning a list/array of values can be wrapped
        into multiple scorers that return one value each.

        See :ref:`multimetric_grid_search` for an example.

        If None, the estimator's score method is used.

    n_jobs : int, default=None
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

        .. versionchanged:: v0.20
           `n_jobs` default changed from 1 to None

    pre_dispatch : int, or str, default='2*n_jobs'
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A str, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    iid : bool, default=False
        If True, return the average score across folds, weighted by the number
        of samples in each test set. In this case, the data is assumed to be
        identically distributed across the folds, and the loss minimized is
        the total loss per sample, and not the mean loss across the folds.

        .. deprecated:: 0.22
            Parameter ``iid`` is deprecated in 0.22 and will be removed in 0.24

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross validation,
        - integer, to specify the number of folds in a `(Stratified)KFold`,
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

        .. versionchanged:: 0.22
            ``cv`` default value if None changed from 3-fold to 5-fold.

    refit : bool, str, or callable, default=True
        Refit an estimator using the best found parameters on the whole
        dataset.

        For multiple metric evaluation, this needs to be a `str` denoting the
        scorer that would be used to find the best parameters for refitting
        the estimator at the end.

        Where there are considerations other than maximum score in
        choosing a best estimator, ``refit`` can be set to a function which
        returns the selected ``best_index_`` given the ``cv_results_``. In that
        case, the ``best_estimator_`` and ``best_params_`` will be set
        according to the returned ``best_index_`` while the ``best_score_``
        attribute will not be available.

        The refitted estimator is made available at the ``best_estimator_``
        attribute and permits using ``predict`` directly on this
        ``RandomizedSearchCV`` instance.

        Also for multiple metric evaluation, the attributes ``best_index_``,
        ``best_score_`` and ``best_params_`` will only be available if
        ``refit`` is set and all of them will be determined w.r.t this specific
        scorer.

        See ``scoring`` parameter to know more about multiple metric
        evaluation.

        .. versionchanged:: 0.20
            Support for callable added.

    verbose : int, default=0
        Controls the verbosity: the higher, the more messages.

    random_state : int or RandomState instance, default=None
        Pseudo random number generator state used for random uniform sampling
        from lists of possible values instead of scipy.stats distributions.
        Pass an int for reproducible output across multiple
        function calls.
        See :term:`Glossary <random_state>`.

    error_score : 'raise' or numeric, default=np.nan
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    return_train_score : bool, default=False
        If ``False``, the ``cv_results_`` attribute will not include training
        scores.
        Computing training scores is used to get insights on how different
        parameter settings impact the overfitting/underfitting trade-off.
        However computing the scores on the training set can be computationally
        expensive and is not strictly required to select the parameters that
        yield the best generalization performance.

        .. versionadded:: 0.19

        .. versionchanged:: 0.21
            Default value was changed from ``True`` to ``False``

    Attributes
    ----------
    cv_results_ : dict of numpy (masked) ndarrays
        A dict with keys as column headers and values as columns, that can be
        imported into a pandas ``DataFrame``.

        For instance the below given table

        +--------------+-------------+-------------------+---+---------------+
        | param_kernel | param_gamma | split0_test_score |...|rank_test_score|
        +==============+=============+===================+===+===============+
        |    'rbf'     |     0.1     |       0.80        |...|       1       |
        +--------------+-------------+-------------------+---+---------------+
        |    'rbf'     |     0.2     |       0.90        |...|       2       |
        +--------------+-------------+-------------------+---+---------------+
        |    'rbf'     |     0.3     |       0.70        |...|       2       |
        +--------------+-------------+-------------------+---+---------------+

        will be represented by a ``cv_results_`` dict of::

            {
            'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'],
                                          mask = False),
            'param_gamma'  : masked_array(data = [0.1 0.2 0.3], mask = False),
            'split0_test_score'  : [0.80, 0.90, 0.70],
            'split1_test_score'  : [0.82, 0.50, 0.70],
            'mean_test_score'    : [0.81, 0.70, 0.70],
            'std_test_score'     : [0.01, 0.20, 0.00],
            'rank_test_score'    : [1, 2, 2],
            'split0_train_score' : [0.80, 0.92, 0.70],
            'split1_train_score' : [0.82, 0.55, 0.70],
            'mean_train_score'   : [0.81, 0.74, 0.70],
            'std_train_score'    : [0.01, 0.19, 0.00],
            'mean_fit_time'      : [0.73, 0.63, 0.43],
            'std_fit_time'       : [0.01, 0.02, 0.01],
            'mean_score_time'    : [0.01, 0.06, 0.04],
            'std_score_time'     : [0.00, 0.00, 0.00],
            'params'             : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...],
            }

        NOTE

        The key ``'params'`` is used to store a list of parameter
        settings dicts for all the parameter candidates.

        The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and
        ``std_score_time`` are all in seconds.

        For multi-metric evaluation, the scores for all the scorers are
        available in the ``cv_results_`` dict at the keys ending with that
        scorer's name (``'_<scorer_name>'``) instead of ``'_score'`` shown
        above. ('split0_test_precision', 'mean_train_precision' etc.)

    best_estimator_ : estimator
        Estimator that was chosen by the search, i.e. estimator
        which gave highest score (or smallest loss if specified)
        on the left out data. Not available if ``refit=False``.

        For multi-metric evaluation, this attribute is present only if
        ``refit`` is specified.

        See ``refit`` parameter for more information on allowed values.

    best_score_ : float
        Mean cross-validated score of the best_estimator.

        For multi-metric evaluation, this is not available if ``refit`` is
        ``False``. See ``refit`` parameter for more information.

        This attribute is not available if ``refit`` is a function.

    best_params_ : dict
        Parameter setting that gave the best results on the hold out data.

        For multi-metric evaluation, this is not available if ``refit`` is
        ``False``. See ``refit`` parameter for more information.

    best_index_ : int
        The index (of the ``cv_results_`` arrays) which corresponds to the best
        candidate parameter setting.

        The dict at ``search.cv_results_['params'][search.best_index_]`` gives
        the parameter setting for the best model, that gives the highest
        mean score (``search.best_score_``).

        For multi-metric evaluation, this is not available if ``refit`` is
        ``False``. See ``refit`` parameter for more information.

    scorer_ : function or a dict
        Scorer function used on the held out data to choose the best
        parameters for the model.

        For multi-metric evaluation, this attribute holds the validated
        ``scoring`` dict which maps the scorer key to the scorer callable.

    n_splits_ : int
        The number of cross-validation splits (folds/iterations).

    refit_time_ : float
        Seconds used for refitting the best model on the whole dataset.

        This is present only if ``refit`` is not False.

        .. versionadded:: 0.20

    Notes
    -----
    The parameters selected are those that maximize the score of the held-out
    data, according to the scoring parameter.

    If `n_jobs` was set to a value higher than one, the data is copied for each
    parameter setting (and not `n_jobs` times). This is done for efficiency
    reasons if individual jobs take very little time, but may raise errors if
    the dataset is large and not enough memory is available.  A workaround in
    this case is to set `pre_dispatch`. Then, the memory is copied only
    `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
    n_jobs`.

    See Also
    --------
    :class:`GridSearchCV`:
        Does exhaustive search over a grid of parameters.

    :class:`ParameterSampler`:
        A generator over parameter settings, constructed from
        param_distributions.


    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.model_selection import RandomizedSearchCV
    >>> from scipy.stats import uniform
    >>> iris = load_iris()
    >>> logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200,
    ...                               random_state=0)
    >>> distributions = dict(C=uniform(loc=0, scale=4),
    ...                      penalty=['l2', 'l1'])
    >>> clf = RandomizedSearchCV(logistic, distributions, random_state=0)
    >>> search = clf.fit(iris.data, iris.target)
    >>> search.best_params_
    {'C': 2..., 'penalty': 'l1'}
    """
    _required_parameters = ["estimator", "param_distributions"]

    @_deprecate_positional_args
    def __init__(self, estimator, param_distributions, *, n_iter=10,
                 scoring=None, n_jobs=None, iid='deprecated', refit=True,
                 cv=None, verbose=0, pre_dispatch='2*n_jobs',
                 random_state=None, error_score=np.nan,
                 return_train_score=False):
        # Sampling-specific settings are stored verbatim; nothing is drawn
        # here -- candidates are only sampled inside ``_run_search``.
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        self.random_state = random_state
        # The remaining, generic search options are forwarded unchanged to
        # the base class.
        super().__init__(
            estimator=estimator, scoring=scoring,
            n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
            pre_dispatch=pre_dispatch, error_score=error_score,
            return_train_score=return_train_score)

    def _run_search(self, evaluate_candidates):
        """Search n_iter candidates from param_distributions.

        Parameters
        ----------
        evaluate_candidates : callable
            Called once with a :class:`ParameterSampler` built from
            ``self.param_distributions``, ``self.n_iter`` and
            ``self.random_state``.
        """
        evaluate_candidates(ParameterSampler(
            self.param_distributions, self.n_iter,
            random_state=self.random_state))
|