"""
|
|
The :mod:`sklearn.utils` module includes various utilities.
|
|
"""
|
|
import pkgutil
|
|
import inspect
|
|
from importlib import import_module
|
|
from operator import itemgetter
|
|
from collections.abc import Sequence
|
|
from contextlib import contextmanager
|
|
from itertools import compress
|
|
from itertools import islice
|
|
import numbers
|
|
import platform
|
|
import struct
|
|
import timeit
|
|
from pathlib import Path
|
|
|
|
import warnings
|
|
import numpy as np
|
|
from scipy.sparse import issparse
|
|
|
|
from .murmurhash import murmurhash3_32
|
|
from .class_weight import compute_class_weight, compute_sample_weight
|
|
from . import _joblib
|
|
from ..exceptions import DataConversionWarning
|
|
from .deprecation import deprecated
|
|
from .fixes import np_version, parse_version
|
|
from ._estimator_html_repr import estimator_html_repr
|
|
from .validation import (as_float_array,
|
|
assert_all_finite,
|
|
check_random_state, column_or_1d, check_array,
|
|
check_consistent_length, check_X_y, indexable,
|
|
check_symmetric, check_scalar,
|
|
_deprecate_positional_args)
|
|
from .. import get_config
|
|
|
|
|
|
# Do not deprecate parallel_backend and register_parallel_backend as they are
|
|
# needed to tune `scikit-learn` behavior and have different effect if called
|
|
# from the vendored version or or the site-package version. The other are
|
|
# utilities that are independent of scikit-learn so they are not part of
|
|
# scikit-learn public API.
|
|
parallel_backend = _joblib.parallel_backend
|
|
register_parallel_backend = _joblib.register_parallel_backend
|
|
|
|
|
|
__all__ = ["murmurhash3_32", "as_float_array",
|
|
"assert_all_finite", "check_array",
|
|
"check_random_state",
|
|
"compute_class_weight", "compute_sample_weight",
|
|
"column_or_1d", "safe_indexing",
|
|
"check_consistent_length", "check_X_y", "check_scalar", 'indexable',
|
|
"check_symmetric", "indices_to_mask", "deprecated",
|
|
"parallel_backend", "register_parallel_backend",
|
|
"resample", "shuffle", "check_matplotlib_support", "all_estimators",
|
|
"DataConversionWarning", "estimator_html_repr"
|
|
]
|
|
|
|
IS_PYPY = platform.python_implementation() == 'PyPy'
|
|
_IS_32BIT = 8 * struct.calcsize("P") == 32
|
|
|
|
|
|
class Bunch(dict):
    """Container object exposing keys as attributes.

    Bunch objects are sometimes used as an output for functions and methods.
    They extend dictionaries by enabling values to be accessed by key,
    `bunch["value_key"]`, or by an attribute, `bunch.value_key`.

    Examples
    --------
    >>> b = Bunch(a=1, b=2)
    >>> b['b']
    2
    >>> b.b
    2
    >>> b.a = 3
    >>> b['a']
    3
    >>> b.c = 6
    >>> b['c']
    6
    """

    def __init__(self, **kwargs):
        super().__init__(kwargs)

    def __setattr__(self, key, value):
        self[key] = value

    def __dir__(self):
        return self.keys()

    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key)

    def __setstate__(self, state):
        # Bunch pickles generated with scikit-learn 0.16.* have a non-empty
        # __dict__. This causes a surprising behaviour when loading these
        # pickles with scikit-learn 0.17: reading bunch.key uses __dict__,
        # but assigning to bunch.key uses __setattr__ and only changes
        # bunch['key']. More details can be found at:
        # https://github.com/scikit-learn/scikit-learn/issues/6196.
        # Overriding __setstate__ to be a no-op has the effect of ignoring
        # the pickled __dict__.
        pass


def safe_mask(X, mask):
    """Return a mask which is safe to use on X.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Data on which to apply mask.

    mask : array
        Mask to be used on X.

    Returns
    -------
    mask
    """
    mask = np.asarray(mask)
    if np.issubdtype(mask.dtype, np.signedinteger):
        return mask

    if hasattr(X, "toarray"):
        ind = np.arange(mask.shape[0])
        mask = ind[mask]
    return mask


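# Editor's usage sketch (illustrative only, not part of the original module):
# for a sparse matrix, safe_mask converts a boolean mask into integer indices
# so it can be used for row slicing. `X_sp` is an example variable; NumPy and
# SciPy are assumed as imported above.
#
#   >>> from scipy.sparse import csr_matrix
#   >>> X_sp = csr_matrix(np.eye(3))
#   >>> safe_mask(X_sp, np.array([True, False, True]))
#   array([0, 2])
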
def axis0_safe_slice(X, mask, len_mask):
    """
    This mask is safer than safe_mask since it returns an
    empty array when a sparse matrix is sliced with a boolean mask
    with all False, instead of raising an unhelpful error in older
    versions of SciPy.

    See: https://github.com/scipy/scipy/issues/5361

    Also note that we could avoid the dot product by checking that
    len_mask is not zero in _huber_loss_and_gradient, but this is not
    going to be the bottleneck, since the number of outliers and
    non-outliers is typically non-zero and it would make the code
    harder to follow.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Data on which to apply mask.

    mask : array
        Mask to be used on X.

    len_mask : int
        The length of the mask.

    Returns
    -------
    mask
    """
    if len_mask != 0:
        return X[safe_mask(X, mask), :]
    return np.zeros(shape=(0, X.shape[1]))


def _array_indexing(array, key, key_dtype, axis):
    """Index an array or scipy.sparse consistently across NumPy versions."""
    if np_version < parse_version('1.12') or issparse(array):
        # FIXME: Remove the check for NumPy when using >= 1.12
        # check if we have a boolean array-like to make the proper indexing
        if key_dtype == 'bool':
            key = np.asarray(key)
    if isinstance(key, tuple):
        key = list(key)
    return array[key] if axis == 0 else array[:, key]


def _pandas_indexing(X, key, key_dtype, axis):
    """Index a pandas dataframe or a series."""
    if hasattr(key, 'shape'):
        # Work-around for indexing with read-only key in pandas
        # FIXME: solved in pandas 0.25
        key = np.asarray(key)
        key = key if key.flags.writeable else key.copy()
    elif isinstance(key, tuple):
        key = list(key)
    # check whether we should index with loc or iloc
    indexer = X.iloc if key_dtype == 'int' else X.loc
    return indexer[:, key] if axis else indexer[key]


def _list_indexing(X, key, key_dtype):
    """Index a Python list."""
    if np.isscalar(key) or isinstance(key, slice):
        # key is a slice or a scalar
        return X[key]
    if key_dtype == 'bool':
        # key is a boolean array-like
        return list(compress(X, key))
    # key is an integer array-like
    return [X[idx] for idx in key]


def _determine_key_type(key, accept_slice=True):
    """Determine the data type of key.

    Parameters
    ----------
    key : scalar, slice or array-like
        The key from which we want to infer the data type.

    accept_slice : bool, default=True
        Whether to accept a slice as a valid key. If False, a TypeError is
        raised when ``key`` is a slice.

    Returns
    -------
    dtype : {'int', 'str', 'bool', None}
        Returns the data type of key.
    """
    err_msg = ("No valid specification of the columns. Only a scalar, list or "
               "slice of all integers or all strings, or boolean mask is "
               "allowed")

    dtype_to_str = {int: 'int', str: 'str', bool: 'bool', np.bool_: 'bool'}
    array_dtype_to_str = {'i': 'int', 'u': 'int', 'b': 'bool', 'O': 'str',
                          'U': 'str', 'S': 'str'}

    if key is None:
        return None
    if isinstance(key, tuple(dtype_to_str.keys())):
        try:
            return dtype_to_str[type(key)]
        except KeyError:
            raise ValueError(err_msg)
    if isinstance(key, slice):
        if not accept_slice:
            raise TypeError(
                'Only array-like or scalar are supported. '
                'A Python slice was given.'
            )
        if key.start is None and key.stop is None:
            return None
        key_start_type = _determine_key_type(key.start)
        key_stop_type = _determine_key_type(key.stop)
        if key_start_type is not None and key_stop_type is not None:
            if key_start_type != key_stop_type:
                raise ValueError(err_msg)
        if key_start_type is not None:
            return key_start_type
        return key_stop_type
    if isinstance(key, (list, tuple)):
        unique_key = set(key)
        key_type = {_determine_key_type(elt) for elt in unique_key}
        if not key_type:
            return None
        if len(key_type) != 1:
            raise ValueError(err_msg)
        return key_type.pop()
    if hasattr(key, 'dtype'):
        try:
            return array_dtype_to_str[key.dtype.kind]
        except KeyError:
            raise ValueError(err_msg)
    raise ValueError(err_msg)


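# Editor's usage sketch (illustrative only, not part of the original module):
# the returned strings for a few common key types.
#
#   >>> _determine_key_type(0)
#   'int'
#   >>> _determine_key_type('a')
#   'str'
#   >>> _determine_key_type([True, False])
#   'bool'
#   >>> _determine_key_type(slice(0, 2))
#   'int'
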
# TODO: remove in 0.24
@deprecated("safe_indexing is deprecated in version "
            "0.22 and will be removed in version 0.24.")
def safe_indexing(X, indices, *, axis=0):
    """Return rows, items or columns of X using indices.

    .. deprecated:: 0.22
        This function was deprecated in version 0.22 and will be removed in
        version 0.24.

    Parameters
    ----------
    X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series
        Data from which to sample rows, items or columns. `list` is only
        supported when `axis=0`.

    indices : bool, int, str, slice, array-like

        - If `axis=0`, boolean and integer array-like, integer slice,
          and scalar integer are supported.
        - If `axis=1`:

            - to select a single column, `indices` can be of `int` type for
              all `X` types and `str` only for dataframe. The selected subset
              will be 1D, unless `X` is a sparse matrix in which case it will
              be 2D.
            - to select multiple columns, `indices` can be one of the
              following: `list`, `array`, `slice`. The type used in
              these containers can be one of the following: `int`, `bool` and
              `str`. However, `str` is only supported when `X` is a dataframe.
              The selected subset will be 2D.

    axis : int, default=0
        The axis along which `X` will be subsampled. `axis=0` will select
        rows while `axis=1` will select columns.

    Returns
    -------
    subset
        Subset of X on axis 0 or 1.

    Notes
    -----
    CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are
    not supported.
    """
    return _safe_indexing(X, indices, axis=axis)


def _safe_indexing(X, indices, *, axis=0):
    """Return rows, items or columns of X using indices.

    .. warning::

        This utility is documented, but **private**. This means that
        backward compatibility might be broken without any deprecation
        cycle.

    Parameters
    ----------
    X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series
        Data from which to sample rows, items or columns. `list` is only
        supported when `axis=0`.
    indices : bool, int, str, slice, array-like
        - If `axis=0`, boolean and integer array-like, integer slice,
          and scalar integer are supported.
        - If `axis=1`:
            - to select a single column, `indices` can be of `int` type for
              all `X` types and `str` only for dataframe. The selected subset
              will be 1D, unless `X` is a sparse matrix in which case it will
              be 2D.
            - to select multiple columns, `indices` can be one of the
              following: `list`, `array`, `slice`. The type used in
              these containers can be one of the following: `int`, `bool` and
              `str`. However, `str` is only supported when `X` is a dataframe.
              The selected subset will be 2D.
    axis : int, default=0
        The axis along which `X` will be subsampled. `axis=0` will select
        rows while `axis=1` will select columns.

    Returns
    -------
    subset
        Subset of X on axis 0 or 1.

    Notes
    -----
    CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are
    not supported.
    """
    if indices is None:
        return X

    if axis not in (0, 1):
        raise ValueError(
            "'axis' should be either 0 (to index rows) or 1 (to index "
            "columns). Got {} instead.".format(axis)
        )

    indices_dtype = _determine_key_type(indices)

    if axis == 0 and indices_dtype == 'str':
        raise ValueError(
            "String indexing is not supported with 'axis=0'"
        )

    if axis == 1 and X.ndim != 2:
        raise ValueError(
            "'X' should be a 2D NumPy array, 2D sparse matrix or pandas "
            "dataframe when indexing the columns (i.e. 'axis=1'). "
            "Got {} instead with {} dimension(s).".format(type(X), X.ndim)
        )

    if axis == 1 and indices_dtype == 'str' and not hasattr(X, 'loc'):
        raise ValueError(
            "Specifying the columns using strings is only supported for "
            "pandas DataFrames"
        )

    if hasattr(X, "iloc"):
        return _pandas_indexing(X, indices, indices_dtype, axis=axis)
    elif hasattr(X, "shape"):
        return _array_indexing(X, indices, indices_dtype, axis=axis)
    else:
        return _list_indexing(X, indices, indices_dtype)


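# Editor's usage sketch (illustrative only, not part of the original module):
# row and column selection with _safe_indexing on a plain NumPy array.
#
#   >>> X = np.arange(12).reshape(4, 3)
#   >>> _safe_indexing(X, [0, 2], axis=0)   # rows 0 and 2
#   array([[0, 1, 2],
#          [6, 7, 8]])
#   >>> _safe_indexing(X, 1, axis=1)        # second column, returned as 1D
#   array([ 1,  4,  7, 10])
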
def _get_column_indices(X, key):
    """Get feature column indices for input data X and key.

    For accepted values of `key`, see the docstring of
    :func:`_safe_indexing`.
    """
    n_columns = X.shape[1]

    key_dtype = _determine_key_type(key)

    if isinstance(key, (list, tuple)) and not key:
        # we get an empty list
        return []
    elif key_dtype in ('bool', 'int'):
        # Convert key into positive indices
        try:
            idx = _safe_indexing(np.arange(n_columns), key)
        except IndexError as e:
            raise ValueError(
                'all features must be in [0, {}] or [-{}, 0]'
                .format(n_columns - 1, n_columns)
            ) from e
        return np.atleast_1d(idx).tolist()
    elif key_dtype == 'str':
        try:
            all_columns = X.columns
        except AttributeError:
            raise ValueError("Specifying the columns using strings is only "
                             "supported for pandas DataFrames")
        if isinstance(key, str):
            columns = [key]
        elif isinstance(key, slice):
            start, stop = key.start, key.stop
            if start is not None:
                start = all_columns.get_loc(start)
            if stop is not None:
                # pandas indexing with strings is endpoint included
                stop = all_columns.get_loc(stop) + 1
            else:
                stop = n_columns + 1
            return list(range(n_columns)[slice(start, stop)])
        else:
            columns = list(key)

        try:
            column_indices = []
            for col in columns:
                col_idx = all_columns.get_loc(col)
                if not isinstance(col_idx, numbers.Integral):
                    raise ValueError(f"Selected columns, {columns}, are not "
                                     "unique in dataframe")
                column_indices.append(col_idx)

        except KeyError as e:
            raise ValueError(
                "A given column is not a column of the dataframe"
            ) from e

        return column_indices
    else:
        raise ValueError("No valid specification of the columns. Only a "
                         "scalar, list or slice of all integers or all "
                         "strings, or boolean mask is allowed")


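# Editor's usage sketch (illustrative only, not part of the original module):
# resolving column names to positional indices, assuming pandas is installed.
# `df` is an example variable.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
#   >>> _get_column_indices(df, ['a', 'c'])
#   [0, 2]
#   >>> _get_column_indices(df, slice('a', 'b'))
#   [0, 1]
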
def resample(*arrays, **options):
    """Resample arrays or sparse matrices in a consistent way

    The default strategy implements one step of the bootstrapping
    procedure.

    Parameters
    ----------
    *arrays : sequence of indexable data-structures
        Indexable data-structures can be arrays, lists, dataframes or scipy
        sparse matrices with consistent first dimension.

    Other Parameters
    ----------------
    replace : boolean, True by default
        Implements resampling with replacement. If False, this will implement
        (sliced) random permutations.

    n_samples : int, None by default
        Number of samples to generate. If left to None this is
        automatically set to the first dimension of the arrays.
        If replace is False it should not be larger than the length of
        arrays.

    random_state : int, RandomState instance or None, optional (default=None)
        Determines random number generation for shuffling
        the data.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    stratify : array-like or None (default=None)
        If not None, data is split in a stratified fashion, using this as
        the class labels.

    Returns
    -------
    resampled_arrays : sequence of indexable data-structures
        Sequence of resampled copies of the collections. The original arrays
        are not impacted.

    Examples
    --------
    It is possible to mix sparse and dense arrays in the same run::

      >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
      >>> y = np.array([0, 1, 2])

      >>> from scipy.sparse import coo_matrix
      >>> X_sparse = coo_matrix(X)

      >>> from sklearn.utils import resample
      >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0)
      >>> X
      array([[1., 0.],
             [2., 1.],
             [1., 0.]])

      >>> X_sparse
      <3x2 sparse matrix of type '<... 'numpy.float64'>'
          with 4 stored elements in Compressed Sparse Row format>

      >>> X_sparse.toarray()
      array([[1., 0.],
             [2., 1.],
             [1., 0.]])

      >>> y
      array([0, 1, 0])

      >>> resample(y, n_samples=2, random_state=0)
      array([0, 1])

    Example using stratification::

      >>> y = [0, 0, 1, 1, 1, 1, 1, 1, 1]
      >>> resample(y, n_samples=5, replace=False, stratify=y,
      ...          random_state=0)
      [1, 1, 1, 0, 1]

    See also
    --------
    :func:`sklearn.utils.shuffle`
    """

    random_state = check_random_state(options.pop('random_state', None))
    replace = options.pop('replace', True)
    max_n_samples = options.pop('n_samples', None)
    stratify = options.pop('stratify', None)
    if options:
        raise ValueError("Unexpected kw arguments: %r" % options.keys())

    if len(arrays) == 0:
        return None

    first = arrays[0]
    n_samples = first.shape[0] if hasattr(first, 'shape') else len(first)

    if max_n_samples is None:
        max_n_samples = n_samples
    elif (max_n_samples > n_samples) and (not replace):
        raise ValueError("Cannot sample %d out of arrays with dim %d "
                         "when replace is False" % (max_n_samples,
                                                    n_samples))

    check_consistent_length(*arrays)

    if stratify is None:
        if replace:
            indices = random_state.randint(0, n_samples,
                                           size=(max_n_samples,))
        else:
            indices = np.arange(n_samples)
            random_state.shuffle(indices)
            indices = indices[:max_n_samples]
    else:
        # Code adapted from StratifiedShuffleSplit()
        y = check_array(stratify, ensure_2d=False, dtype=None)
        if y.ndim == 2:
            # for multi-label y, map each distinct row to a string repr
            # using join because str(row) uses an ellipsis if len(row) > 1000
            y = np.array([' '.join(row.astype('str')) for row in y])

        classes, y_indices = np.unique(y, return_inverse=True)
        n_classes = classes.shape[0]

        class_counts = np.bincount(y_indices)

        # Find the sorted list of instances for each class:
        # (np.unique above performs a sort, so code is O(n logn) already)
        class_indices = np.split(np.argsort(y_indices, kind='mergesort'),
                                 np.cumsum(class_counts)[:-1])

        n_i = _approximate_mode(class_counts, max_n_samples, random_state)

        indices = []

        for i in range(n_classes):
            indices_i = random_state.choice(class_indices[i], n_i[i],
                                            replace=replace)
            indices.extend(indices_i)

        indices = random_state.permutation(indices)

    # convert sparse matrices to CSR for row-based indexing
    arrays = [a.tocsr() if issparse(a) else a for a in arrays]
    resampled_arrays = [_safe_indexing(a, indices) for a in arrays]
    if len(resampled_arrays) == 1:
        # syntactic sugar for the unit argument case
        return resampled_arrays[0]
    else:
        return resampled_arrays


def shuffle(*arrays, **options):
    """Shuffle arrays or sparse matrices in a consistent way

    This is a convenience alias to ``resample(*arrays, replace=False)`` to do
    random permutations of the collections.

    Parameters
    ----------
    *arrays : sequence of indexable data-structures
        Indexable data-structures can be arrays, lists, dataframes or scipy
        sparse matrices with consistent first dimension.

    Other Parameters
    ----------------
    random_state : int, RandomState instance or None, optional (default=None)
        Determines random number generation for shuffling
        the data.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    n_samples : int, None by default
        Number of samples to generate. If left to None this is
        automatically set to the first dimension of the arrays.

    Returns
    -------
    shuffled_arrays : sequence of indexable data-structures
        Sequence of shuffled copies of the collections. The original arrays
        are not impacted.

    Examples
    --------
    It is possible to mix sparse and dense arrays in the same run::

      >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
      >>> y = np.array([0, 1, 2])

      >>> from scipy.sparse import coo_matrix
      >>> X_sparse = coo_matrix(X)

      >>> from sklearn.utils import shuffle
      >>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)
      >>> X
      array([[0., 0.],
             [2., 1.],
             [1., 0.]])

      >>> X_sparse
      <3x2 sparse matrix of type '<... 'numpy.float64'>'
          with 3 stored elements in Compressed Sparse Row format>

      >>> X_sparse.toarray()
      array([[0., 0.],
             [2., 1.],
             [1., 0.]])

      >>> y
      array([2, 1, 0])

      >>> shuffle(y, n_samples=2, random_state=0)
      array([0, 1])

    See also
    --------
    :func:`sklearn.utils.resample`
    """
    options['replace'] = False
    return resample(*arrays, **options)


@_deprecate_positional_args
def safe_sqr(X, *, copy=True):
    """Element-wise squaring of array-likes and sparse matrices.

    Parameters
    ----------
    X : array-like, matrix, sparse matrix

    copy : boolean, optional, default True
        Whether to create a copy of X and operate on it or to perform
        inplace computation (default behaviour).

    Returns
    -------
    X ** 2 : element-wise square
    """
    X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], ensure_2d=False)
    if issparse(X):
        if copy:
            X = X.copy()
        X.data **= 2
    else:
        if copy:
            X = X ** 2
        else:
            X **= 2
    return X


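# Editor's usage sketch (illustrative only, not part of the original module):
# safe_sqr squares dense and sparse inputs alike.
#
#   >>> safe_sqr(np.array([-2, 3]))
#   array([4, 9])
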
def _chunk_generator(gen, chunksize):
    """Chunk generator ``gen`` into lists of length ``chunksize``. The last
    chunk may have a length less than ``chunksize``."""
    while True:
        chunk = list(islice(gen, chunksize))
        if chunk:
            yield chunk
        else:
            return


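# Editor's usage sketch (illustrative only, not part of the original module):
# chunking an iterator into fixed-size lists; the last chunk may be shorter.
#
#   >>> list(_chunk_generator(iter(range(7)), 3))
#   [[0, 1, 2], [3, 4, 5], [6]]
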
@_deprecate_positional_args
def gen_batches(n, batch_size, *, min_batch_size=0):
    """Generator to create slices containing batch_size elements, from 0 to n.

    The last slice may contain less than batch_size elements, when batch_size
    does not divide n.

    Parameters
    ----------
    n : int
    batch_size : int
        Number of elements in each batch.
    min_batch_size : int, default=0
        Minimum batch size to produce.

    Yields
    ------
    slice of batch_size elements

    Examples
    --------
    >>> from sklearn.utils import gen_batches
    >>> list(gen_batches(7, 3))
    [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
    >>> list(gen_batches(6, 3))
    [slice(0, 3, None), slice(3, 6, None)]
    >>> list(gen_batches(2, 3))
    [slice(0, 2, None)]
    >>> list(gen_batches(7, 3, min_batch_size=0))
    [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
    >>> list(gen_batches(7, 3, min_batch_size=2))
    [slice(0, 3, None), slice(3, 7, None)]
    """
    if not isinstance(batch_size, numbers.Integral):
        raise TypeError("gen_batches got batch_size=%s, must be an"
                        " integer" % batch_size)
    if batch_size <= 0:
        raise ValueError("gen_batches got batch_size=%s, must be"
                         " positive" % batch_size)
    start = 0
    for _ in range(int(n // batch_size)):
        end = start + batch_size
        if end + min_batch_size > n:
            continue
        yield slice(start, end)
        start = end
    if start < n:
        yield slice(start, n)


@_deprecate_positional_args
def gen_even_slices(n, n_packs, *, n_samples=None):
    """Generator to create n_packs slices going up to n.

    Parameters
    ----------
    n : int
    n_packs : int
        Number of slices to generate.
    n_samples : int or None (default = None)
        Number of samples. Pass n_samples when the slices are to be used for
        sparse matrix indexing; slicing off-the-end raises an exception, while
        it works for NumPy arrays.

    Yields
    ------
    slice

    Examples
    --------
    >>> from sklearn.utils import gen_even_slices
    >>> list(gen_even_slices(10, 1))
    [slice(0, 10, None)]
    >>> list(gen_even_slices(10, 10))
    [slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)]
    >>> list(gen_even_slices(10, 5))
    [slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)]
    >>> list(gen_even_slices(10, 3))
    [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]
    """
    start = 0
    if n_packs < 1:
        raise ValueError("gen_even_slices got n_packs=%s, must be >=1"
                         % n_packs)
    for pack_num in range(n_packs):
        this_n = n // n_packs
        if pack_num < n % n_packs:
            this_n += 1
        if this_n > 0:
            end = start + this_n
            if n_samples is not None:
                end = min(n_samples, end)
            yield slice(start, end, None)
            start = end


def tosequence(x):
    """Cast iterable x to a Sequence, avoiding a copy if possible.

    Parameters
    ----------
    x : iterable
    """
    if isinstance(x, np.ndarray):
        return np.asarray(x)
    elif isinstance(x, Sequence):
        return x
    else:
        return list(x)


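# Editor's usage sketch (illustrative only, not part of the original module):
# lists and arrays pass through, other iterables are materialized.
#
#   >>> tosequence([1, 2, 3])
#   [1, 2, 3]
#   >>> tosequence(x * 2 for x in range(3))
#   [0, 2, 4]
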
def _to_object_array(sequence):
    """Convert sequence to a 1-D NumPy array of object dtype.

    numpy.array constructor has a similar use but its output
    is ambiguous. It can be a 1-D NumPy array of object dtype if
    the input is a ragged array, but if the input is a list of
    equal length arrays, then the output is a 2D numpy.array.
    _to_object_array solves this ambiguity by guaranteeing that
    the output is a 1-D NumPy array of objects for any input.

    Parameters
    ----------
    sequence : array-like of shape (n_elements,)
        The sequence to be converted.

    Returns
    -------
    out : ndarray of shape (n_elements,), dtype=object
        The converted sequence into a 1-D NumPy array of object dtype.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils import _to_object_array
    >>> _to_object_array([np.array([0]), np.array([1])])
    array([array([0]), array([1])], dtype=object)
    >>> _to_object_array([np.array([0]), np.array([1, 2])])
    array([array([0]), array([1, 2])], dtype=object)
    >>> np.array([np.array([0]), np.array([1])])
    array([[0],
           [1]])
    >>> np.array([np.array([0]), np.array([1, 2])])
    array([array([0]), array([1, 2])], dtype=object)
    """
    out = np.empty(len(sequence), dtype=object)
    out[:] = sequence
    return out


def indices_to_mask(indices, mask_length):
    """Convert list of indices to boolean mask.

    Parameters
    ----------
    indices : list-like
        List of integers treated as indices.
    mask_length : int
        Length of boolean mask to be generated.
        This parameter must be greater than max(indices).

    Returns
    -------
    mask : 1d boolean nd-array
        Boolean array that is True where indices are present, else False.

    Examples
    --------
    >>> from sklearn.utils import indices_to_mask
    >>> indices = [1, 2, 3, 4]
    >>> indices_to_mask(indices, 5)
    array([False,  True,  True,  True,  True])
    """
    if mask_length <= np.max(indices):
        raise ValueError("mask_length must be greater than max(indices)")

    mask = np.zeros(mask_length, dtype=bool)
    mask[indices] = True

    return mask


def _message_with_time(source, message, time):
    """Create one line message for logging purposes

    Parameters
    ----------
    source : str
        String indicating the source or the reference of the message

    message : str
        Short message

    time : int
        Time in seconds
    """
    start_message = "[%s] " % source

    # adapted from joblib.logger.short_format_time without the Windows -.1s
    # adjustment
    if time > 60:
        time_str = "%4.1fmin" % (time / 60)
    else:
        time_str = " %5.1fs" % time
    end_message = " %s, total=%s" % (message, time_str)
    dots_len = (70 - len(start_message) - len(end_message))
    return "%s%s%s" % (start_message, dots_len * '.', end_message)


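# Editor's usage sketch (illustrative only, not part of the original module):
# the message is padded with dots to a fixed width of 70 characters. `msg` is
# an example variable.
#
#   >>> msg = _message_with_time("Pipeline", "fitting", 2.5)
#   >>> len(msg)
#   70
#   >>> msg.startswith("[Pipeline] ") and msg.endswith("fitting, total=   2.5s")
#   True
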
@contextmanager
def _print_elapsed_time(source, message=None):
    """Log elapsed time to stdout when the context is exited

    Parameters
    ----------
    source : str
        String indicating the source or the reference of the message

    message : str or None
        Short message. If None, nothing will be printed

    Returns
    -------
    context_manager
        Prints elapsed time upon exit if verbose
    """
    if message is None:
        yield
    else:
        start = timeit.default_timer()
        yield
        print(
            _message_with_time(source, message,
                               timeit.default_timer() - start))


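# Editor's usage sketch (illustrative only, not part of the original module):
# timing a block of work; on exit a line such as
# "[Demo] .......... sleeping, total=   0.1s" is printed (the dot padding and
# timing depend on the run). Assumes the standard-library `time` module.
#
#   >>> import time
#   >>> with _print_elapsed_time("Demo", "sleeping"):
#   ...     time.sleep(0.1)
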
@_deprecate_positional_args
def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None):
    """Calculates how many rows can be processed within working_memory.

    Parameters
    ----------
    row_bytes : int
        The expected number of bytes of memory that will be consumed
        during the processing of each row.
    max_n_rows : int, optional
        The maximum return value.
    working_memory : int or float, optional
        The number of rows to fit inside this number of MiB will be returned.
        When None (default), the value of
        ``sklearn.get_config()['working_memory']`` is used.

    Returns
    -------
    int or the value of max_n_rows

    Warns
    -----
    Issues a UserWarning if ``row_bytes`` exceeds ``working_memory`` MiB.
    """

    if working_memory is None:
        working_memory = get_config()['working_memory']

    chunk_n_rows = int(working_memory * (2 ** 20) // row_bytes)
    if max_n_rows is not None:
        chunk_n_rows = min(chunk_n_rows, max_n_rows)
    if chunk_n_rows < 1:
        warnings.warn('Could not adhere to working_memory config. '
                      'Currently %.0fMiB, %.0fMiB required.' %
                      (working_memory, np.ceil(row_bytes * 2 ** -20)))
        chunk_n_rows = 1
    return chunk_n_rows


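# Editor's usage sketch (illustrative only, not part of the original module):
# with 8-byte rows and an explicit 1 MiB budget, 2**20 // 8 == 131072 rows
# fit in a chunk; max_n_rows caps the result.
#
#   >>> get_chunk_n_rows(8, working_memory=1)
#   131072
#   >>> get_chunk_n_rows(8, max_n_rows=1000, working_memory=1)
#   1000
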
def is_scalar_nan(x):
    """Tests if x is NaN.

    This function is meant to overcome the issue that np.isnan does not allow
    non-numerical types as input, and that np.nan is not float('nan').

    Parameters
    ----------
    x : any type

    Returns
    -------
    boolean

    Examples
    --------
    >>> is_scalar_nan(np.nan)
    True
    >>> is_scalar_nan(float("nan"))
    True
    >>> is_scalar_nan(None)
    False
    >>> is_scalar_nan("")
    False
    >>> is_scalar_nan([np.nan])
    False
    """
    # convert from numpy.bool_ to python bool to ensure that testing
    # is_scalar_nan(x) is True does not fail.
    return bool(isinstance(x, numbers.Real) and np.isnan(x))


def _approximate_mode(class_counts, n_draws, rng):
    """Computes approximate mode of multivariate hypergeometric.

    This is an approximation to the mode of the multivariate
    hypergeometric given by class_counts and n_draws.
    It shouldn't be off by more than one.

    It is the most likely outcome of drawing n_draws many
    samples from the population given by class_counts.

    Parameters
    ----------
    class_counts : ndarray of int
        Population per class.
    n_draws : int
        Number of draws (samples to draw) from the overall population.
    rng : random state
        Used to break ties.

    Returns
    -------
    sampled_classes : ndarray of int
        Number of samples drawn from each class.
        np.sum(sampled_classes) == n_draws

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils import _approximate_mode
    >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0)
    array([2, 1])
    >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0)
    array([3, 1])
    >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
    ...                   n_draws=2, rng=0)
    array([0, 1, 1, 0])
    >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
    ...                   n_draws=2, rng=42)
    array([1, 1, 0, 0])
    """
    rng = check_random_state(rng)
    # this computes a bad approximation to the mode of the
    # multivariate hypergeometric given by class_counts and n_draws
    continuous = n_draws * class_counts / class_counts.sum()
    # floored means we don't overshoot n_samples, but probably undershoot
    floored = np.floor(continuous)
    # we add samples according to how much "left over" probability
    # they had, until we arrive at n_samples
    need_to_add = int(n_draws - floored.sum())
    if need_to_add > 0:
        remainder = continuous - floored
        values = np.sort(np.unique(remainder))[::-1]
        # add according to remainder, but break ties
        # randomly to avoid biases
        for value in values:
            inds, = np.where(remainder == value)
            # if we need_to_add less than what's in inds
            # we draw randomly from them.
            # if we need to add more, we add them all and
            # go to the next value
            add_now = min(len(inds), need_to_add)
            inds = rng.choice(inds, size=add_now, replace=False)
            floored[inds] += 1
            need_to_add -= add_now
            if need_to_add == 0:
                break
    return floored.astype(int)


def check_matplotlib_support(caller_name):
    """Raise ImportError with detailed error message if mpl is not installed.

    Plot utilities like :func:`plot_partial_dependence` should lazily import
    matplotlib and call this helper before any computation.

    Parameters
    ----------
    caller_name : str
        The name of the caller that requires matplotlib.
    """
    try:
        import matplotlib  # noqa
    except ImportError as e:
        raise ImportError(
            "{} requires matplotlib. You can install matplotlib with "
            "`pip install matplotlib`".format(caller_name)
        ) from e


def check_pandas_support(caller_name):
    """Raise ImportError with detailed error message if pandas is not
    installed.

    Utilities like :func:`fetch_openml` should lazily import
    pandas and call this helper before any computation.

    Parameters
    ----------
    caller_name : str
        The name of the caller that requires pandas.
    """
    try:
        import pandas  # noqa
        return pandas
    except ImportError as e:
        raise ImportError(
            "{} requires pandas.".format(caller_name)
        ) from e


def all_estimators(type_filter=None):
    """Get a list of all estimators from sklearn.

    This function crawls the module and gets all classes that inherit
    from BaseEstimator. Classes that are defined in test-modules are not
    included.
    By default meta-estimators such as GridSearchCV are also not included.

    Parameters
    ----------
    type_filter : string, list of string, or None, default=None
        Which kind of estimators should be returned. If None, no filter is
        applied and all estimators are returned. Possible values are
        'classifier', 'regressor', 'cluster' and 'transformer' to get
        estimators only of these specific types, or a list of these to
        get the estimators that fit at least one of the types.

    Returns
    -------
    estimators : list of tuples
        List of (name, class), where ``name`` is the class name as string
        and ``class`` is the actual type of the class.
    """
    # lazy import to avoid circular imports from sklearn.base
    from ._testing import ignore_warnings
    from ..base import (BaseEstimator, ClassifierMixin, RegressorMixin,
                        TransformerMixin, ClusterMixin)

    def is_abstract(c):
        if not(hasattr(c, '__abstractmethods__')):
            return False
        if not len(c.__abstractmethods__):
            return False
        return True

    all_classes = []
    modules_to_ignore = {"tests", "externals", "setup", "conftest"}
    root = str(Path(__file__).parent.parent)  # sklearn package
    # Ignore deprecation warnings triggered at import time and from walking
    # packages
    with ignore_warnings(category=FutureWarning):
        for importer, modname, ispkg in pkgutil.walk_packages(
                path=[root], prefix='sklearn.'):
            mod_parts = modname.split(".")
            if (any(part in modules_to_ignore for part in mod_parts)
                    or '._' in modname):
                continue
            module = import_module(modname)
            classes = inspect.getmembers(module, inspect.isclass)
            classes = [(name, est_cls) for name, est_cls in classes
                       if not name.startswith("_")]

            # TODO: Remove when FeatureHasher is implemented in PYPY
            # Skips FeatureHasher for PYPY
            if IS_PYPY and 'feature_extraction' in modname:
                classes = [(name, est_cls) for name, est_cls in classes
                           if name == "FeatureHasher"]

            all_classes.extend(classes)

    all_classes = set(all_classes)

    estimators = [c for c in all_classes
                  if (issubclass(c[1], BaseEstimator) and
                      c[0] != 'BaseEstimator')]
    # get rid of abstract base classes
    estimators = [c for c in estimators if not is_abstract(c[1])]

    if type_filter is not None:
        if not isinstance(type_filter, list):
            type_filter = [type_filter]
        else:
            type_filter = list(type_filter)  # copy
        filtered_estimators = []
        filters = {'classifier': ClassifierMixin,
                   'regressor': RegressorMixin,
                   'transformer': TransformerMixin,
                   'cluster': ClusterMixin}
        for name, mixin in filters.items():
            if name in type_filter:
                type_filter.remove(name)
                filtered_estimators.extend([est for est in estimators
                                            if issubclass(est[1], mixin)])
        estimators = filtered_estimators
        if type_filter:
            raise ValueError("Parameter type_filter must be 'classifier', "
                             "'regressor', 'transformer', 'cluster' or "
                             "None, got"
                             " %s." % repr(type_filter))

    # drop duplicates, sort for reproducibility
    # itemgetter is used to ensure the sort does not extend to the 2nd item of
    # the tuple
    return sorted(set(estimators), key=itemgetter(0))

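
# Editor's usage sketch (illustrative only, not part of the original module):
# listing the classifier classes shipped with scikit-learn; the exact names
# returned depend on the installed version.
#
#   >>> from sklearn.utils import all_estimators
#   >>> classifiers = all_estimators(type_filter='classifier')
#   >>> [name for name, cls in classifiers][:2]  # doctest: +SKIP
#   ['AdaBoostClassifier', 'BaggingClassifier']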