Uploaded Test files
This commit is contained in:
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
67
venv/Lib/site-packages/sklearn/preprocessing/__init__.py
Normal file
@@ -0,0 +1,67 @@
"""
The :mod:`sklearn.preprocessing` module includes scaling, centering,
normalization, binarization methods.
"""

from ._function_transformer import FunctionTransformer

from ._data import Binarizer
from ._data import KernelCenterer
from ._data import MinMaxScaler
from ._data import MaxAbsScaler
from ._data import Normalizer
from ._data import RobustScaler
from ._data import StandardScaler
from ._data import QuantileTransformer
from ._data import add_dummy_feature
from ._data import binarize
from ._data import normalize
from ._data import scale
from ._data import robust_scale
from ._data import maxabs_scale
from ._data import minmax_scale
from ._data import quantile_transform
from ._data import power_transform
from ._data import PowerTransformer
from ._data import PolynomialFeatures

from ._encoders import OneHotEncoder
from ._encoders import OrdinalEncoder

from ._label import label_binarize
from ._label import LabelBinarizer
from ._label import LabelEncoder
from ._label import MultiLabelBinarizer

from ._discretization import KBinsDiscretizer


__all__ = [
    'Binarizer',
    'FunctionTransformer',
    'KBinsDiscretizer',
    'KernelCenterer',
    'LabelBinarizer',
    'LabelEncoder',
    'MultiLabelBinarizer',
    'MinMaxScaler',
    'MaxAbsScaler',
    'QuantileTransformer',
    'Normalizer',
    'OneHotEncoder',
    'OrdinalEncoder',
    'PowerTransformer',
    'RobustScaler',
    'StandardScaler',
    'add_dummy_feature',
    'PolynomialFeatures',
    'binarize',
    'normalize',
    'scale',
    'robust_scale',
    'maxabs_scale',
    'minmax_scale',
    'label_binarize',
    'quantile_transform',
    'power_transform',
]
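A quick usage sketch (editorial illustration, not part of the commit) of the public API this __init__.py re-exports; it assumes a scikit-learn install of roughly this vintage:

import numpy as np
from sklearn.preprocessing import StandardScaler, scale

X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
# The class-based API keeps fitted state (mean_, scale_) for reuse on new data...
scaler = StandardScaler().fit(X)
Xt = scaler.transform(X)
# ...while the function API is the stateless one-shot equivalent.
assert np.allclose(Xt, scale(X))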
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
3138
venv/Lib/site-packages/sklearn/preprocessing/_data.py
Normal file
File diff suppressed because it is too large
324
venv/Lib/site-packages/sklearn/preprocessing/_discretization.py
Normal file
@@ -0,0 +1,324 @@
# -*- coding: utf-8 -*-

# Author: Henry Lin <hlin117@gmail.com>
#         Tom Dupré la Tour

# License: BSD


import numbers
import numpy as np
import warnings

from . import OneHotEncoder

from ..base import BaseEstimator, TransformerMixin
from ..utils.validation import check_array
from ..utils.validation import check_is_fitted
from ..utils.validation import FLOAT_DTYPES
from ..utils.validation import _deprecate_positional_args


class KBinsDiscretizer(TransformerMixin, BaseEstimator):
    """
    Bin continuous data into intervals.

    Read more in the :ref:`User Guide <preprocessing_discretization>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    n_bins : int or array-like, shape (n_features,) (default=5)
        The number of bins to produce. Raises ValueError if ``n_bins < 2``.

    encode : {'onehot', 'onehot-dense', 'ordinal'}, (default='onehot')
        Method used to encode the transformed result.

        onehot
            Encode the transformed result with one-hot encoding
            and return a sparse matrix. Ignored features are always
            stacked to the right.
        onehot-dense
            Encode the transformed result with one-hot encoding
            and return a dense array. Ignored features are always
            stacked to the right.
        ordinal
            Return the bin identifier encoded as an integer value.

    strategy : {'uniform', 'quantile', 'kmeans'}, (default='quantile')
        Strategy used to define the widths of the bins.

        uniform
            All bins in each feature have identical widths.
        quantile
            All bins in each feature have the same number of points.
        kmeans
            Values in each bin have the same nearest center of a 1D k-means
            cluster.

    Attributes
    ----------
    n_bins_ : int array, shape (n_features,)
        Number of bins per feature. Bins whose width is too small
        (i.e., <= 1e-8) are removed with a warning.

    bin_edges_ : array of arrays, shape (n_features, )
        The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )``
        Ignored features will have empty arrays.

    See Also
    --------
    sklearn.preprocessing.Binarizer : Class used to bin values as ``0`` or
        ``1`` based on a parameter ``threshold``.

    Notes
    -----
    In bin edges for feature ``i``, the first and last values are used only for
    ``inverse_transform``. During transform, bin edges are extended to::

      np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])

    You can combine ``KBinsDiscretizer`` with
    :class:`sklearn.compose.ColumnTransformer` if you only want to preprocess
    part of the features.

    ``KBinsDiscretizer`` might produce constant features (e.g., when
    ``encode = 'onehot'`` and certain bins do not contain any data).
    These features can be removed with feature selection algorithms
    (e.g., :class:`sklearn.feature_selection.VarianceThreshold`).

    Examples
    --------
    >>> X = [[-2, 1, -4,   -1],
    ...      [-1, 2, -3, -0.5],
    ...      [ 0, 3, -2,  0.5],
    ...      [ 1, 4, -1,    2]]
    >>> est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
    >>> est.fit(X)
    KBinsDiscretizer(...)
    >>> Xt = est.transform(X)
    >>> Xt  # doctest: +SKIP
    array([[ 0., 0., 0., 0.],
           [ 1., 1., 1., 0.],
           [ 2., 2., 2., 1.],
           [ 2., 2., 2., 2.]])

    Sometimes it may be useful to convert the data back into the original
    feature space. The ``inverse_transform`` function converts the binned
    data into the original feature space. Each value will be equal to the mean
    of the two bin edges.

    >>> est.bin_edges_[0]
    array([-2., -1.,  0.,  1.])
    >>> est.inverse_transform(Xt)
    array([[-1.5,  1.5, -3.5, -0.5],
           [-0.5,  2.5, -2.5, -0.5],
           [ 0.5,  3.5, -1.5,  0.5],
           [ 0.5,  3.5, -1.5,  1.5]])

    """

    @_deprecate_positional_args
    def __init__(self, n_bins=5, *, encode='onehot', strategy='quantile'):
        self.n_bins = n_bins
        self.encode = encode
        self.strategy = strategy

    def fit(self, X, y=None):
        """
        Fit the estimator.

        Parameters
        ----------
        X : numeric array-like, shape (n_samples, n_features)
            Data to be discretized.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`sklearn.pipeline.Pipeline`.

        Returns
        -------
        self
        """
        X = self._validate_data(X, dtype='numeric')

        valid_encode = ('onehot', 'onehot-dense', 'ordinal')
        if self.encode not in valid_encode:
            raise ValueError("Valid options for 'encode' are {}. "
                             "Got encode={!r} instead."
                             .format(valid_encode, self.encode))
        valid_strategy = ('uniform', 'quantile', 'kmeans')
        if self.strategy not in valid_strategy:
            raise ValueError("Valid options for 'strategy' are {}. "
                             "Got strategy={!r} instead."
                             .format(valid_strategy, self.strategy))

        n_features = X.shape[1]
        n_bins = self._validate_n_bins(n_features)

        bin_edges = np.zeros(n_features, dtype=object)
        for jj in range(n_features):
            column = X[:, jj]
            col_min, col_max = column.min(), column.max()

            if col_min == col_max:
                warnings.warn("Feature %d is constant and will be "
                              "replaced with 0." % jj)
                n_bins[jj] = 1
                bin_edges[jj] = np.array([-np.inf, np.inf])
                continue

            if self.strategy == 'uniform':
                bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)

            elif self.strategy == 'quantile':
                quantiles = np.linspace(0, 100, n_bins[jj] + 1)
                bin_edges[jj] = np.asarray(np.percentile(column, quantiles))

            elif self.strategy == 'kmeans':
                from ..cluster import KMeans  # fixes import loops

                # Deterministic initialization with uniform spacing
                uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)
                init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5

                # 1D k-means procedure
                km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
                centers = km.fit(column[:, None]).cluster_centers_[:, 0]
                # Must sort, centers may be unsorted even with sorted init
                centers.sort()
                bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
                bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]

            # Remove bins whose width is too small (i.e., <= 1e-8)
            if self.strategy in ('quantile', 'kmeans'):
                mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8
                bin_edges[jj] = bin_edges[jj][mask]
                if len(bin_edges[jj]) - 1 != n_bins[jj]:
                    warnings.warn('Bins whose width are too small (i.e., <= '
                                  '1e-8) in feature %d are removed. Consider '
                                  'decreasing the number of bins.' % jj)
                    n_bins[jj] = len(bin_edges[jj]) - 1

        self.bin_edges_ = bin_edges
        self.n_bins_ = n_bins

        if 'onehot' in self.encode:
            self._encoder = OneHotEncoder(
                categories=[np.arange(i) for i in self.n_bins_],
                sparse=self.encode == 'onehot')
            # Fit the OneHotEncoder with toy datasets
            # so that it's ready for use after the KBinsDiscretizer is fitted
            self._encoder.fit(np.zeros((1, len(self.n_bins_)), dtype=int))

        return self

    def _validate_n_bins(self, n_features):
        """Returns n_bins_, the number of bins per feature.
        """
        orig_bins = self.n_bins
        if isinstance(orig_bins, numbers.Number):
            if not isinstance(orig_bins, numbers.Integral):
                raise ValueError("{} received an invalid n_bins type. "
                                 "Received {}, expected int."
                                 .format(KBinsDiscretizer.__name__,
                                         type(orig_bins).__name__))
            if orig_bins < 2:
                raise ValueError("{} received an invalid number "
                                 "of bins. Received {}, expected at least 2."
                                 .format(KBinsDiscretizer.__name__, orig_bins))
            return np.full(n_features, orig_bins, dtype=np.int)

        n_bins = check_array(orig_bins, dtype=np.int, copy=True,
                             ensure_2d=False)

        if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
            raise ValueError("n_bins must be a scalar or array "
                             "of shape (n_features,).")

        bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)

        violating_indices = np.where(bad_nbins_value)[0]
        if violating_indices.shape[0] > 0:
            indices = ", ".join(str(i) for i in violating_indices)
            raise ValueError("{} received an invalid number "
                             "of bins at indices {}. Number of bins "
                             "must be at least 2, and must be an int."
                             .format(KBinsDiscretizer.__name__, indices))
        return n_bins

    def transform(self, X):
        """
        Discretize the data.

        Parameters
        ----------
        X : numeric array-like, shape (n_samples, n_features)
            Data to be discretized.

        Returns
        -------
        Xt : numeric array-like or sparse matrix
            Data in the binned space.
        """
        check_is_fitted(self)

        Xt = check_array(X, copy=True, dtype=FLOAT_DTYPES)
        n_features = self.n_bins_.shape[0]
        if Xt.shape[1] != n_features:
            raise ValueError("Incorrect number of features. Expecting {}, "
                             "received {}.".format(n_features, Xt.shape[1]))

        bin_edges = self.bin_edges_
        for jj in range(Xt.shape[1]):
            # Values which are close to a bin edge are susceptible to numeric
            # instability. Add eps to X so these values are binned correctly
            # with respect to their decimal truncation. See documentation of
            # numpy.isclose for an explanation of ``rtol`` and ``atol``.
            rtol = 1.e-5
            atol = 1.e-8
            eps = atol + rtol * np.abs(Xt[:, jj])
            Xt[:, jj] = np.digitize(Xt[:, jj] + eps, bin_edges[jj][1:])
        np.clip(Xt, 0, self.n_bins_ - 1, out=Xt)

        if self.encode == 'ordinal':
            return Xt

        return self._encoder.transform(Xt)

    def inverse_transform(self, Xt):
        """
        Transform discretized data back to original feature space.

        Note that this function does not regenerate the original data
        due to discretization rounding.

        Parameters
        ----------
        Xt : numeric array-like, shape (n_sample, n_features)
            Transformed data in the binned space.

        Returns
        -------
        Xinv : numeric array-like
            Data in the original feature space.
        """
        check_is_fitted(self)

        if 'onehot' in self.encode:
            Xt = self._encoder.inverse_transform(Xt)

        Xinv = check_array(Xt, copy=True, dtype=FLOAT_DTYPES)
        n_features = self.n_bins_.shape[0]
        if Xinv.shape[1] != n_features:
            raise ValueError("Incorrect number of features. Expecting {}, "
                             "received {}.".format(n_features, Xinv.shape[1]))

        for jj in range(n_features):
            bin_edges = self.bin_edges_[jj]
            bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5
            Xinv[:, jj] = bin_centers[np.int_(Xinv[:, jj])]

        return Xinv
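An illustrative sketch (not part of the commit) contrasting the three binning strategies on one skewed feature; exact edges depend on the data:

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X = (np.arange(10, dtype=float) ** 2).reshape(-1, 1)  # skewed 1-D feature
for strategy in ('uniform', 'quantile', 'kmeans'):
    est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy=strategy)
    est.fit(X)
    # 'uniform' splits the value range evenly, 'quantile' equalizes counts
    # per bin, 'kmeans' places edges midway between 1-D k-means centers.
    print(strategy, est.bin_edges_[0])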
737
venv/Lib/site-packages/sklearn/preprocessing/_encoders.py
Normal file
@@ -0,0 +1,737 @@
# Authors: Andreas Mueller <amueller@ais.uni-bonn.de>
#          Joris Van den Bossche <jorisvandenbossche@gmail.com>
# License: BSD 3 clause

import numpy as np
from scipy import sparse

from ..base import BaseEstimator, TransformerMixin
from ..utils import check_array
from ..utils.validation import check_is_fitted
from ..utils.validation import _deprecate_positional_args

from ._label import _encode, _encode_check_unknown


__all__ = [
    'OneHotEncoder',
    'OrdinalEncoder'
]


class _BaseEncoder(TransformerMixin, BaseEstimator):
    """
    Base class for encoders that includes the code to categorize and
    transform the input features.

    """

    def _check_X(self, X):
        """
        Perform custom check_array:
        - convert list of strings to object dtype
        - check for missing values for object dtype data (check_array does
          not do that)
        - return list of features (arrays): this list of features is
          constructed feature by feature to preserve the data types
          of pandas DataFrame columns, as otherwise information is lost
          and cannot be used, e.g. for the `categories_` attribute.

        """
        if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):
            # if not a dataframe, do normal check_array validation
            X_temp = check_array(X, dtype=None)
            if (not hasattr(X, 'dtype')
                    and np.issubdtype(X_temp.dtype, np.str_)):
                X = check_array(X, dtype=np.object)
            else:
                X = X_temp
            needs_validation = False
        else:
            # pandas dataframe, do validation later column by column, in order
            # to keep the dtype information to be used in the encoder.
            needs_validation = True

        n_samples, n_features = X.shape
        X_columns = []

        for i in range(n_features):
            Xi = self._get_feature(X, feature_idx=i)
            Xi = check_array(Xi, ensure_2d=False, dtype=None,
                             force_all_finite=needs_validation)
            X_columns.append(Xi)

        return X_columns, n_samples, n_features

    def _get_feature(self, X, feature_idx):
        if hasattr(X, 'iloc'):
            # pandas dataframes
            return X.iloc[:, feature_idx]
        # numpy arrays, sparse arrays
        return X[:, feature_idx]

    def _fit(self, X, handle_unknown='error'):
        X_list, n_samples, n_features = self._check_X(X)

        if self.categories != 'auto':
            if len(self.categories) != n_features:
                raise ValueError("Shape mismatch: if categories is an array,"
                                 " it has to be of shape (n_features,).")

        self.categories_ = []

        for i in range(n_features):
            Xi = X_list[i]
            if self.categories == 'auto':
                cats = _encode(Xi)
            else:
                cats = np.array(self.categories[i], dtype=Xi.dtype)
                if Xi.dtype != object:
                    if not np.all(np.sort(cats) == cats):
                        raise ValueError("Unsorted categories are not "
                                         "supported for numerical categories")
                if handle_unknown == 'error':
                    diff = _encode_check_unknown(Xi, cats)
                    if diff:
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
            self.categories_.append(cats)

    def _transform(self, X, handle_unknown='error'):
        X_list, n_samples, n_features = self._check_X(X)

        X_int = np.zeros((n_samples, n_features), dtype=np.int)
        X_mask = np.ones((n_samples, n_features), dtype=np.bool)

        if n_features != len(self.categories_):
            raise ValueError(
                "The number of features in X is different to the number of "
                "features of the fitted data. The fitted data had {} features "
                "and the X has {} features."
                .format(len(self.categories_,), n_features)
            )

        for i in range(n_features):
            Xi = X_list[i]
            diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
                                                     return_mask=True)

            if not np.all(valid_mask):
                if handle_unknown == 'error':
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    # cast Xi into the largest string type necessary
                    # to handle different lengths of numpy strings
                    if (self.categories_[i].dtype.kind in ('U', 'S')
                            and self.categories_[i].itemsize > Xi.itemsize):
                        Xi = Xi.astype(self.categories_[i].dtype)
                    else:
                        Xi = Xi.copy()

                    Xi[~valid_mask] = self.categories_[i][0]
            # We use check_unknown=False, since _encode_check_unknown was
            # already called above.
            _, encoded = _encode(Xi, self.categories_[i], encode=True,
                                 check_unknown=False)
            X_int[:, i] = encoded

        return X_int, X_mask

    def _more_tags(self):
        return {'X_types': ['categorical']}


class OneHotEncoder(_BaseEncoder):
    """
    Encode categorical features as a one-hot numeric array.

    The input to this transformer should be an array-like of integers or
    strings, denoting the values taken on by categorical (discrete) features.
    The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
    encoding scheme. This creates a binary column for each category and
    returns a sparse matrix or dense array (depending on the ``sparse``
    parameter).

    By default, the encoder derives the categories based on the unique values
    in each feature. Alternatively, you can also specify the `categories`
    manually.

    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.

    Note: a one-hot encoding of y labels should use a LabelBinarizer
    instead.

    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.

    .. versionchanged:: 0.20

    Parameters
    ----------
    categories : 'auto' or a list of array-like, default='auto'
        Categories (unique values) per feature:

        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories should not mix strings and numeric
          values within a single feature, and should be sorted in case of
          numeric values.

        The used categories can be found in the ``categories_`` attribute.

        .. versionadded:: 0.20

    drop : {'first', 'if_binary'} or an array-like of shape (n_features,), \
            default=None
        Specifies a methodology to use to drop one of the categories per
        feature. This is useful in situations where perfectly collinear
        features cause problems, such as when feeding the resulting data
        into a neural network or an unregularized regression.

        However, dropping one category breaks the symmetry of the original
        representation and can therefore induce a bias in downstream models,
        for instance for penalized linear classification or regression models.

        - None : retain all features (the default).
        - 'first' : drop the first category in each feature. If only one
          category is present, the feature will be dropped entirely.
        - 'if_binary' : drop the first category in each feature with two
          categories. Features with 1 or more than 2 categories are
          left intact.
        - array : ``drop[i]`` is the category in feature ``X[:, i]`` that
          should be dropped.

    sparse : bool, default=True
        Will return sparse matrix if set True else will return an array.

    dtype : number type, default=np.float
        Desired dtype of output.

    handle_unknown : {'error', 'ignore'}, default='error'
        Whether to raise an error or ignore if an unknown categorical feature
        is present during transform (default is to raise). When this parameter
        is set to 'ignore' and an unknown category is encountered during
        transform, the resulting one-hot encoded columns for this feature
        will be all zeros. In the inverse transform, an unknown category
        will be denoted as None.

    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting
        (in order of the features in X and corresponding with the output
        of ``transform``). This includes the category specified in ``drop``
        (if any).

    drop_idx_ : array of shape (n_features,)
        - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category
          to be dropped for each feature.
        - ``drop_idx_[i] = None`` if no category is to be dropped from the
          feature with index ``i``, e.g. when `drop='if_binary'` and the
          feature isn't binary.
        - ``drop_idx_ = None`` if all the transformed features will be
          retained.

    See Also
    --------
    sklearn.preprocessing.OrdinalEncoder : Performs an ordinal (integer)
      encoding of the categorical features.
    sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of
      dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot
      encoding of dictionary items or strings.
    sklearn.preprocessing.LabelBinarizer : Binarizes labels in a one-vs-all
      fashion.
    sklearn.preprocessing.MultiLabelBinarizer : Transforms between iterable of
      iterables and a multilabel format, e.g. a (samples x classes) binary
      matrix indicating the presence of a class label.

    Examples
    --------
    Given a dataset with two features, we let the encoder find the unique
    values per feature and transform the data to a binary one-hot encoding.

    >>> from sklearn.preprocessing import OneHotEncoder

    One can discard categories not seen during `fit`:

    >>> enc = OneHotEncoder(handle_unknown='ignore')
    >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
    >>> enc.fit(X)
    OneHotEncoder(handle_unknown='ignore')
    >>> enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> enc.transform([['Female', 1], ['Male', 4]]).toarray()
    array([[1., 0., 1., 0., 0.],
           [0., 1., 0., 0., 0.]])
    >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
    array([['Male', 1],
           [None, 2]], dtype=object)
    >>> enc.get_feature_names(['gender', 'group'])
    array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'],
      dtype=object)

    One can always drop the first column for each feature:

    >>> drop_enc = OneHotEncoder(drop='first').fit(X)
    >>> drop_enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray()
    array([[0., 0., 0.],
           [1., 1., 0.]])

    Or drop a column for features having only 2 categories:

    >>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X)
    >>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray()
    array([[0., 1., 0., 0.],
           [1., 0., 1., 0.]])
    """

    @_deprecate_positional_args
    def __init__(self, *, categories='auto', drop=None, sparse=True,
                 dtype=np.float64, handle_unknown='error'):
        self.categories = categories
        self.sparse = sparse
        self.dtype = dtype
        self.handle_unknown = handle_unknown
        self.drop = drop

    def _validate_keywords(self):
        if self.handle_unknown not in ('error', 'ignore'):
            msg = ("handle_unknown should be either 'error' or 'ignore', "
                   "got {0}.".format(self.handle_unknown))
            raise ValueError(msg)
        # If we have both dropped columns and ignored unknown
        # values, there will be ambiguous cells. This creates difficulties
        # in interpreting the model.
        if self.drop is not None and self.handle_unknown != 'error':
            raise ValueError(
                "`handle_unknown` must be 'error' when the drop parameter is "
                "specified, as both would create categories that are all "
                "zero.")

    def _compute_drop_idx(self):
        if self.drop is None:
            return None
        elif isinstance(self.drop, str):
            if self.drop == 'first':
                return np.zeros(len(self.categories_), dtype=np.object)
            elif self.drop == 'if_binary':
                return np.array([0 if len(cats) == 2 else None
                                 for cats in self.categories_],
                                dtype=np.object)
            else:
                msg = (
                    "Wrong input for parameter `drop`. Expected "
                    "'first', 'if_binary', None or array of objects, got {}"
                )
                raise ValueError(msg.format(type(self.drop)))

        else:
            try:
                self.drop = np.asarray(self.drop, dtype=object)
                droplen = len(self.drop)
            except (ValueError, TypeError):
                msg = (
                    "Wrong input for parameter `drop`. Expected "
                    "'first', 'if_binary', None or array of objects, got {}"
                )
                raise ValueError(msg.format(type(self.drop)))
            if droplen != len(self.categories_):
                msg = ("`drop` should have length equal to the number "
                       "of features ({}), got {}")
                raise ValueError(msg.format(len(self.categories_),
                                            len(self.drop)))
            missing_drops = [(i, val) for i, val in enumerate(self.drop)
                             if val not in self.categories_[i]]
            if any(missing_drops):
                msg = ("The following categories were supposed to be "
                       "dropped, but were not found in the training "
                       "data.\n{}".format(
                           "\n".join(
                               ["Category: {}, Feature: {}".format(c, v)
                                for c, v in missing_drops])))
                raise ValueError(msg)
            return np.array([np.where(cat_list == val)[0][0]
                             for (val, cat_list) in
                             zip(self.drop, self.categories_)],
                            dtype=np.object)

    def fit(self, X, y=None):
        """
        Fit OneHotEncoder to X.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to determine the categories of each feature.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`sklearn.pipeline.Pipeline`.

        Returns
        -------
        self
        """
        self._validate_keywords()
        self._fit(X, handle_unknown=self.handle_unknown)
        self.drop_idx_ = self._compute_drop_idx()
        return self

    def fit_transform(self, X, y=None):
        """
        Fit OneHotEncoder to X, then transform X.

        Equivalent to fit(X).transform(X) but more convenient.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`sklearn.pipeline.Pipeline`.

        Returns
        -------
        X_out : sparse matrix if sparse=True else a 2-d array
            Transformed input.
        """
        self._validate_keywords()
        return super().fit_transform(X, y)

    def transform(self, X):
        """
        Transform X using one-hot encoding.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.

        Returns
        -------
        X_out : sparse matrix if sparse=True else a 2-d array
            Transformed input.
        """
        check_is_fitted(self)
        # validation of X happens in _check_X called by _transform
        X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)

        n_samples, n_features = X_int.shape

        if self.drop_idx_ is not None:
            to_drop = self.drop_idx_.copy()
            # We remove all the dropped categories from mask, and decrement all
            # categories that occur after them to avoid an empty column.
            keep_cells = X_int != to_drop
            n_values = []
            for i, cats in enumerate(self.categories_):
                n_cats = len(cats)

                # drop='if_binary' but feature isn't binary
                if to_drop[i] is None:
                    # set to cardinality to not drop from X_int
                    to_drop[i] = n_cats
                    n_values.append(n_cats)
                else:  # dropped
                    n_values.append(n_cats - 1)

            to_drop = to_drop.reshape(1, -1)
            X_int[X_int > to_drop] -= 1
            X_mask &= keep_cells
        else:
            n_values = [len(cats) for cats in self.categories_]

        mask = X_mask.ravel()
        feature_indices = np.cumsum([0] + n_values)
        indices = (X_int + feature_indices[:-1]).ravel()[mask]

        indptr = np.empty(n_samples + 1, dtype=np.int)
        indptr[0] = 0
        np.sum(X_mask, axis=1, out=indptr[1:])
        np.cumsum(indptr[1:], out=indptr[1:])
        data = np.ones(indptr[-1])

        out = sparse.csr_matrix((data, indices, indptr),
                                shape=(n_samples, feature_indices[-1]),
                                dtype=self.dtype)
        if not self.sparse:
            return out.toarray()
        else:
            return out

    def inverse_transform(self, X):
        """
        Convert the data back to the original representation.

        In case unknown categories are encountered (all zeros in the
        one-hot encoding), ``None`` is used to represent this category.

        Parameters
        ----------
        X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
            The transformed data.

        Returns
        -------
        X_tr : array-like, shape [n_samples, n_features]
            Inverse transformed array.
        """
        check_is_fitted(self)
        X = check_array(X, accept_sparse='csr')

        n_samples, _ = X.shape
        n_features = len(self.categories_)
        if self.drop_idx_ is None:
            n_transformed_features = sum(len(cats)
                                         for cats in self.categories_)
        else:
            n_transformed_features = sum(
                len(cats) - 1 if to_drop is not None else len(cats)
                for cats, to_drop in zip(self.categories_, self.drop_idx_)
            )

        # validate shape of passed X
        msg = ("Shape of the passed X data is not correct. Expected {0} "
               "columns, got {1}.")
        if X.shape[1] != n_transformed_features:
            raise ValueError(msg.format(n_transformed_features, X.shape[1]))

        # create resulting array of appropriate dtype
        dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
        X_tr = np.empty((n_samples, n_features), dtype=dt)

        j = 0
        found_unknown = {}

        for i in range(n_features):
            if self.drop_idx_ is None or self.drop_idx_[i] is None:
                cats = self.categories_[i]
            else:
                cats = np.delete(self.categories_[i], self.drop_idx_[i])
            n_categories = len(cats)

            # Only happens if there was a column with a unique
            # category. In this case we just fill the column with this
            # unique category value.
            if n_categories == 0:
                X_tr[:, i] = self.categories_[i][self.drop_idx_[i]]
                j += n_categories
                continue
            sub = X[:, j:j + n_categories]
            # for sparse X argmax returns 2D matrix, ensure 1D array
            labels = np.asarray(sub.argmax(axis=1)).flatten()
            X_tr[:, i] = cats[labels]
            if self.handle_unknown == 'ignore':
                unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
                # ignored unknown categories: we have a row of all zero
                if unknown.any():
                    found_unknown[i] = unknown
            # drop will either be None or handle_unknown will be error. If
            # self.drop_idx_ is not None, then we can safely assume that all of
            # the nulls in each column are the dropped value
            elif self.drop_idx_ is not None:
                dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
                if dropped.any():
                    X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]]

            j += n_categories

        # if ignored are found: potentially need to upcast result to
        # insert None values
        if found_unknown:
            if X_tr.dtype != object:
                X_tr = X_tr.astype(object)

            for idx, mask in found_unknown.items():
                X_tr[mask, idx] = None

        return X_tr

    def get_feature_names(self, input_features=None):
        """
        Return feature names for output features.

        Parameters
        ----------
        input_features : list of str of shape (n_features,)
            String names for input features if available. By default,
            "x0", "x1", ... "xn_features" is used.

        Returns
        -------
        output_feature_names : ndarray of shape (n_output_features,)
            Array of feature names.
        """
        check_is_fitted(self)
        cats = self.categories_
        if input_features is None:
            input_features = ['x%d' % i for i in range(len(cats))]
        elif len(input_features) != len(self.categories_):
            raise ValueError(
                "input_features should have length equal to number of "
                "features ({}), got {}".format(len(self.categories_),
                                               len(input_features)))

        feature_names = []
        for i in range(len(cats)):
            names = [
                input_features[i] + '_' + str(t) for t in cats[i]]
            if self.drop_idx_ is not None and self.drop_idx_[i] is not None:
                names.pop(self.drop_idx_[i])
            feature_names.extend(names)

        return np.array(feature_names, dtype=object)


class OrdinalEncoder(_BaseEncoder):
    """
    Encode categorical features as an integer array.

    The input to this transformer should be an array-like of integers or
    strings, denoting the values taken on by categorical (discrete) features.
    The features are converted to ordinal integers. This results in
    a single column of integers (0 to n_categories - 1) per feature.

    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    categories : 'auto' or a list of array-like, default='auto'
        Categories (unique values) per feature:

        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories should not mix strings and numeric
          values, and should be sorted in case of numeric values.

        The used categories can be found in the ``categories_`` attribute.

    dtype : number type, default np.float64
        Desired dtype of output.

    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting
        (in order of the features in X and corresponding with the output
        of ``transform``).

    See Also
    --------
    sklearn.preprocessing.OneHotEncoder : Performs a one-hot encoding of
      categorical features.
    sklearn.preprocessing.LabelEncoder : Encodes target labels with values
      between 0 and n_classes-1.

    Examples
    --------
    Given a dataset with two features, we let the encoder find the unique
    values per feature and transform the data to an ordinal encoding.

    >>> from sklearn.preprocessing import OrdinalEncoder
    >>> enc = OrdinalEncoder()
    >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
    >>> enc.fit(X)
    OrdinalEncoder()
    >>> enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> enc.transform([['Female', 3], ['Male', 1]])
    array([[0., 2.],
           [1., 0.]])

    >>> enc.inverse_transform([[1, 0], [0, 1]])
    array([['Male', 1],
           ['Female', 2]], dtype=object)
    """

    @_deprecate_positional_args
    def __init__(self, *, categories='auto', dtype=np.float64):
        self.categories = categories
        self.dtype = dtype

    def fit(self, X, y=None):
        """
        Fit the OrdinalEncoder to X.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to determine the categories of each feature.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`sklearn.pipeline.Pipeline`.

        Returns
        -------
        self
        """
        self._fit(X)

        return self

    def transform(self, X):
        """
        Transform X to ordinal codes.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.

        Returns
        -------
        X_out : sparse matrix or a 2-d array
            Transformed input.
        """
        X_int, _ = self._transform(X)
        return X_int.astype(self.dtype, copy=False)

    def inverse_transform(self, X):
        """
        Convert the data back to the original representation.

        Parameters
        ----------
        X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
            The transformed data.

        Returns
        -------
        X_tr : array-like, shape [n_samples, n_features]
            Inverse transformed array.
        """
        check_is_fitted(self)
        X = check_array(X, accept_sparse='csr')

        n_samples, _ = X.shape
        n_features = len(self.categories_)

        # validate shape of passed X
        msg = ("Shape of the passed X data is not correct. Expected {0} "
               "columns, got {1}.")
        if X.shape[1] != n_features:
            raise ValueError(msg.format(n_features, X.shape[1]))

        # create resulting array of appropriate dtype
        dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
        X_tr = np.empty((n_samples, n_features), dtype=dt)

        for i in range(n_features):
            labels = X[:, i].astype('int64', copy=False)
            X_tr[:, i] = self.categories_[i][labels]

        return X_tr
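A small round-trip sketch (editorial illustration, not part of the commit): with handle_unknown='ignore', unseen categories encode to all-zero blocks and decode back to None:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')
enc.fit([['cat', 0], ['dog', 1]])
Xt = enc.transform([['cat', 1], ['fish', 0]])   # 'fish' was never seen
print(Xt.toarray())                # the 'fish' block is all zeros
print(enc.inverse_transform(Xt))   # [['cat', 1], [None, 0]]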
175
venv/Lib/site-packages/sklearn/preprocessing/_function_transformer.py
Normal file
@@ -0,0 +1,175 @@
import warnings

from ..base import BaseEstimator, TransformerMixin
from ..utils.validation import _allclose_dense_sparse
from ..utils.validation import _deprecate_positional_args


def _identity(X):
    """The identity function.
    """
    return X


class FunctionTransformer(TransformerMixin, BaseEstimator):
    """Constructs a transformer from an arbitrary callable.

    A FunctionTransformer forwards its X (and optionally y) arguments to a
    user-defined function or function object and returns the result of this
    function. This is useful for stateless transformations such as taking the
    log of frequencies, doing custom scaling, etc.

    Note: If a lambda is used as the function, then the resulting
    transformer will not be pickleable.

    .. versionadded:: 0.17

    Read more in the :ref:`User Guide <function_transformer>`.

    Parameters
    ----------
    func : callable, optional, default=None
        The callable to use for the transformation. This will be passed
        the same arguments as transform, with args and kwargs forwarded.
        If func is None, then func will be the identity function.

    inverse_func : callable, optional, default=None
        The callable to use for the inverse transformation. This will be
        passed the same arguments as inverse transform, with args and
        kwargs forwarded. If inverse_func is None, then inverse_func
        will be the identity function.

    validate : bool, optional, default=False
        Indicate that the input X array should be checked before calling
        ``func``. The possibilities are:

        - If False, there is no input validation.
        - If True, then X will be converted to a 2-dimensional NumPy array or
          sparse matrix. If the conversion is not possible an exception is
          raised.

        .. versionchanged:: 0.22
           The default of ``validate`` changed from True to False.

    accept_sparse : boolean, optional
        Indicate that func accepts a sparse matrix as input. If validate is
        False, this has no effect. Otherwise, if accept_sparse is false,
        sparse matrix inputs will cause an exception to be raised.

    check_inverse : bool, default=True
        Whether to check that ``func`` followed by ``inverse_func`` leads to
        the original inputs. It can be used for a sanity check, raising a
        warning when the condition is not fulfilled.

        .. versionadded:: 0.20

    kw_args : dict, optional
        Dictionary of additional keyword arguments to pass to func.

        .. versionadded:: 0.18

    inv_kw_args : dict, optional
        Dictionary of additional keyword arguments to pass to inverse_func.

        .. versionadded:: 0.18

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import FunctionTransformer
    >>> transformer = FunctionTransformer(np.log1p)
    >>> X = np.array([[0, 1], [2, 3]])
    >>> transformer.transform(X)
    array([[0.       , 0.6931...],
           [1.0986..., 1.3862...]])
    """

    @_deprecate_positional_args
    def __init__(self, func=None, inverse_func=None, *, validate=False,
                 accept_sparse=False, check_inverse=True, kw_args=None,
                 inv_kw_args=None):
        self.func = func
        self.inverse_func = inverse_func
        self.validate = validate
        self.accept_sparse = accept_sparse
        self.check_inverse = check_inverse
        self.kw_args = kw_args
        self.inv_kw_args = inv_kw_args

    def _check_input(self, X):
        if self.validate:
            return self._validate_data(X, accept_sparse=self.accept_sparse)
        return X

    def _check_inverse_transform(self, X):
        """Check that func and inverse_func are inverses of each other."""
        idx_selected = slice(None, None, max(1, X.shape[0] // 100))
        X_round_trip = self.inverse_transform(self.transform(X[idx_selected]))
        if not _allclose_dense_sparse(X[idx_selected], X_round_trip):
            warnings.warn("The provided functions are not strictly"
                          " inverse of each other. If you are sure you"
                          " want to proceed regardless, set"
                          " 'check_inverse=False'.", UserWarning)

    def fit(self, X, y=None):
        """Fit transformer by checking X.

        If ``validate`` is ``True``, ``X`` will be checked.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Input array.

        Returns
        -------
        self
        """
        X = self._check_input(X)
        if (self.check_inverse and not (self.func is None or
                                        self.inverse_func is None)):
            self._check_inverse_transform(X)
        return self

    def transform(self, X):
        """Transform X using the forward function.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Input array.

        Returns
        -------
        X_out : array-like, shape (n_samples, n_features)
            Transformed input.
        """
        return self._transform(X, func=self.func, kw_args=self.kw_args)

    def inverse_transform(self, X):
        """Transform X using the inverse function.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Input array.

        Returns
        -------
        X_out : array-like, shape (n_samples, n_features)
            Transformed input.
        """
        return self._transform(X, func=self.inverse_func,
                               kw_args=self.inv_kw_args)

    def _transform(self, X, func=None, kw_args=None):
        X = self._check_input(X)

        if func is None:
            func = _identity

        return func(X, **(kw_args if kw_args else {}))

    def _more_tags(self):
        return {'no_validation': not self.validate,
                'stateless': True}
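A short sketch (editorial illustration, not part of the commit) of the stateless-transform pattern, with check_inverse verifying the function pair at fit time:

import numpy as np
from sklearn.preprocessing import FunctionTransformer

ft = FunctionTransformer(func=np.log1p, inverse_func=np.expm1,
                         validate=True, check_inverse=True)
X = np.array([[0.0, 1.0], [2.0, 3.0]])
Xt = ft.fit_transform(X)                          # log1p applied element-wise
assert np.allclose(ft.inverse_transform(Xt), X)   # expm1 undoes it exactly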
1036
venv/Lib/site-packages/sklearn/preprocessing/_label.py
Normal file
File diff suppressed because it is too large
18
venv/Lib/site-packages/sklearn/preprocessing/data.py
Normal file
@@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _data  # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest

deprecated_path = 'sklearn.preprocessing.data'
correct_import_path = 'sklearn.preprocessing'

_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)


def __getattr__(name):
    return getattr(_data, name)


if not sys.version_info >= (3, 7):
    Pep562(__name__)
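The shim above relies on module-level __getattr__ (PEP 562, Python 3.7+) to forward old import paths to the renamed module. A minimal standalone sketch of the same delegation pattern, using hypothetical module names (mypkg, _new_name) rather than anything from this commit:

# mypkg/old_name.py -- hypothetical deprecated alias for mypkg._new_name
import warnings
from . import _new_name

warnings.warn("mypkg.old_name is deprecated; import from mypkg instead",
              FutureWarning)

def __getattr__(name):
    # Called only for attributes not found by the normal lookup (PEP 562),
    # so every public name of _new_name stays reachable here.
    return getattr(_new_name, name)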
18
venv/Lib/site-packages/sklearn/preprocessing/label.py
Normal file
@@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _label  # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest

deprecated_path = 'sklearn.preprocessing.label'
correct_import_path = 'sklearn.preprocessing'

_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)


def __getattr__(name):
    return getattr(_label, name)


if not sys.version_info >= (3, 7):
    Pep562(__name__)
20
venv/Lib/site-packages/sklearn/preprocessing/setup.py
Normal file
@@ -0,0 +1,20 @@
import os


def configuration(parent_package='', top_path=None):
    import numpy
    from numpy.distutils.misc_util import Configuration

    config = Configuration('preprocessing', parent_package, top_path)
    libraries = []
    if os.name == 'posix':
        libraries.append('m')

    config.add_extension('_csr_polynomial_expansion',
                         sources=['_csr_polynomial_expansion.pyx'],
                         include_dirs=[numpy.get_include()],
                         libraries=libraries)

    config.add_subpackage('tests')

    return config
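For context (editorial note, not part of the commit): a configuration() function like this is typically consumed by the legacy numpy.distutils machinery, either by a parent package's setup via add_subpackage or standalone with the conventional entry point:

if __name__ == '__main__':
    from numpy.distutils.core import setup
    # todict() flattens the Configuration into setup() keyword arguments.
    setup(**configuration(top_path='').todict())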
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,158 @@
|
|||
import warnings
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
||||
from scipy import sparse
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
from sklearn.base import clone
|
||||
|
||||
from sklearn.preprocessing import maxabs_scale
|
||||
from sklearn.preprocessing import minmax_scale
|
||||
from sklearn.preprocessing import scale
|
||||
from sklearn.preprocessing import power_transform
|
||||
from sklearn.preprocessing import quantile_transform
|
||||
from sklearn.preprocessing import robust_scale
|
||||
|
||||
from sklearn.preprocessing import MaxAbsScaler
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.preprocessing import PowerTransformer
|
||||
from sklearn.preprocessing import QuantileTransformer
|
||||
from sklearn.preprocessing import RobustScaler
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
|
||||
iris = load_iris()
|
||||
|
||||
|
||||
def _get_valid_samples_by_column(X, col):
|
||||
"""Get non NaN samples in column of X"""
|
||||
return X[:, [col]][~np.isnan(X[:, col])]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"est, func, support_sparse, strictly_positive",
|
||||
[(MaxAbsScaler(), maxabs_scale, True, False),
|
||||
(MinMaxScaler(), minmax_scale, False, False),
|
||||
(StandardScaler(), scale, False, False),
|
||||
(StandardScaler(with_mean=False), scale, True, False),
|
||||
(PowerTransformer('yeo-johnson'), power_transform, False, False),
|
||||
(PowerTransformer('box-cox'), power_transform, False, True),
|
||||
(QuantileTransformer(n_quantiles=10), quantile_transform, True, False),
|
||||
(RobustScaler(), robust_scale, False, False),
|
||||
(RobustScaler(with_centering=False), robust_scale, True, False)]
|
||||
)
|
||||
def test_missing_value_handling(est, func, support_sparse, strictly_positive):
|
||||
# check that the preprocessing method let pass nan
|
||||
rng = np.random.RandomState(42)
|
||||
X = iris.data.copy()
|
||||
n_missing = 50
|
||||
X[rng.randint(X.shape[0], size=n_missing),
|
||||
rng.randint(X.shape[1], size=n_missing)] = np.nan
|
||||
if strictly_positive:
|
||||
X += np.nanmin(X) + 0.1
|
||||
X_train, X_test = train_test_split(X, random_state=1)
|
||||
# sanity check
|
||||
assert not np.all(np.isnan(X_train), axis=0).any()
|
||||
assert np.any(np.isnan(X_train), axis=0).all()
|
||||
assert np.any(np.isnan(X_test), axis=0).all()
|
||||
X_test[:, 0] = np.nan # make sure this boundary case is tested
|
||||
|
||||
with pytest.warns(None) as records:
|
||||
Xt = est.fit(X_train).transform(X_test)
|
||||
# ensure no warnings are raised
|
||||
assert len(records) == 0
|
||||
# missing values should still be missing, and only them
|
||||
    assert_array_equal(np.isnan(Xt), np.isnan(X_test))

    # check that the function leads to the same results as the class
    with pytest.warns(None) as records:
        Xt_class = est.transform(X_train)
    assert len(records) == 0
    Xt_func = func(X_train, **est.get_params())
    assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
    assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)])

    # check that the inverse transform keeps NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can introduce equal_nan=True in recent version of numpy.
    # For the moment we just check that non-NaN values are almost equal.
    assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])

    for i in range(X.shape[1]):
        # train only on non-NaN
        est.fit(_get_valid_samples_by_column(X_train, i))
        # check transforming with NaN works even when training without NaN
        with pytest.warns(None) as records:
            Xt_col = est.transform(X_test[:, [i]])
        assert len(records) == 0
        assert_allclose(Xt_col, Xt[:, [i]])
        # check non-NaN is handled as before - the 1st column is all nan
        if not np.isnan(X_test[:, i]).all():
            Xt_col_nonan = est.transform(
                _get_valid_samples_by_column(X_test, i))
            assert_array_equal(Xt_col_nonan,
                               Xt_col[~np.isnan(Xt_col.squeeze())])

    if support_sparse:
        est_dense = clone(est)
        est_sparse = clone(est)

        with pytest.warns(None) as records:
            Xt_dense = est_dense.fit(X_train).transform(X_test)
            Xt_inv_dense = est_dense.inverse_transform(Xt_dense)
        assert len(records) == 0
        for sparse_constructor in (sparse.csr_matrix, sparse.csc_matrix,
                                   sparse.bsr_matrix, sparse.coo_matrix,
                                   sparse.dia_matrix, sparse.dok_matrix,
                                   sparse.lil_matrix):
            # check that the dense and sparse inputs lead to the same results
            # precompute the matrix to avoid catching side warnings
            X_train_sp = sparse_constructor(X_train)
            X_test_sp = sparse_constructor(X_test)
            with pytest.warns(None) as records:
                warnings.simplefilter('ignore', PendingDeprecationWarning)
                Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp)
            assert len(records) == 0
            assert_allclose(Xt_sp.A, Xt_dense)
            with pytest.warns(None) as records:
                warnings.simplefilter('ignore', PendingDeprecationWarning)
                Xt_inv_sp = est_sparse.inverse_transform(Xt_sp)
            assert len(records) == 0
            assert_allclose(Xt_inv_sp.A, Xt_inv_dense)


@pytest.mark.parametrize(
    "est, func",
    [(MaxAbsScaler(), maxabs_scale),
     (MinMaxScaler(), minmax_scale),
     (StandardScaler(), scale),
     (StandardScaler(with_mean=False), scale),
     (PowerTransformer('yeo-johnson'), power_transform),
     (PowerTransformer('box-cox'), power_transform),
     (QuantileTransformer(n_quantiles=3), quantile_transform),
     (RobustScaler(), robust_scale),
     (RobustScaler(with_centering=False), robust_scale)]
)
def test_missing_value_pandas_na_support(est, func):
    # Test pandas IntegerArray with pd.NA
    pd = pytest.importorskip('pandas', minversion="1.0")

    X = np.array([[1, 2, 3, np.nan, np.nan, 4, 5, 1],
                  [np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8],
                  [1, 2, 3, 4, 5, 6, 7, 8]]).T

    # Creates dataframe with IntegerArrays with pd.NA
    X_df = pd.DataFrame(X, dtype="Int16", columns=['a', 'b', 'c'])
    X_df['c'] = X_df['c'].astype('int')

    X_trans = est.fit_transform(X)
    X_df_trans = est.fit_transform(X_df)

    assert_allclose(X_trans, X_df_trans)
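As an aside for readers of this diff: the behaviour these tests pin down — NaN entries ignored during fit and propagated through transform — can be seen directly. A minimal sketch (not part of the uploaded files; the data and names are illustrative):

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0, np.nan],
              [2.0, 4.0],
              [3.0, 6.0]])
scaler = StandardScaler().fit(X)  # NaN values are skipped when computing statistics
Xt = scaler.transform(X)          # and propagated to the output, not imputed
assert np.isnan(Xt[0, 1])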
2508
venv/Lib/site-packages/sklearn/preprocessing/tests/test_data.py
Normal file
File diff suppressed because it is too large
Load diff
283
venv/Lib/site-packages/sklearn/preprocessing/tests/test_discretization.py
Normal file
@@ -0,0 +1,283 @@
import pytest
import numpy as np
import scipy.sparse as sp
import warnings

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils._testing import (
    assert_array_almost_equal,
    assert_array_equal,
    assert_warns_message
)

X = [[-2, 1.5, -4, -1],
     [-1, 2.5, -3, -0.5],
     [0, 3.5, -2, 0.5],
     [1, 4.5, -1, 2]]


@pytest.mark.parametrize(
    'strategy, expected',
    [('uniform', [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]]),
     ('kmeans', [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]),
     ('quantile', [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]])])
def test_fit_transform(strategy, expected):
    est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy=strategy)
    est.fit(X)
    assert_array_equal(expected, est.transform(X))


def test_valid_n_bins():
    KBinsDiscretizer(n_bins=2).fit_transform(X)
    KBinsDiscretizer(n_bins=np.array([2])[0]).fit_transform(X)
    assert KBinsDiscretizer(n_bins=2).fit(X).n_bins_.dtype == np.dtype(np.int)


def test_invalid_n_bins():
    est = KBinsDiscretizer(n_bins=1)
    err_msg = ("KBinsDiscretizer received an invalid "
               "number of bins. Received 1, expected at least 2.")
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

    est = KBinsDiscretizer(n_bins=1.1)
    err_msg = ("KBinsDiscretizer received an invalid "
               "n_bins type. Received float, expected int.")
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)


def test_invalid_n_bins_array():
    # Bad shape
    n_bins = np.full((2, 4), 2.)
    est = KBinsDiscretizer(n_bins=n_bins)
    err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

    # Incorrect number of features
    n_bins = [1, 2, 2]
    est = KBinsDiscretizer(n_bins=n_bins)
    err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

    # Bad bin values
    n_bins = [1, 2, 2, 1]
    est = KBinsDiscretizer(n_bins=n_bins)
    err_msg = ("KBinsDiscretizer received an invalid number of bins "
               "at indices 0, 3. Number of bins must be at least 2, "
               "and must be an int.")
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

    # Float bin values
    n_bins = [2.1, 2, 2.1, 2]
    est = KBinsDiscretizer(n_bins=n_bins)
    err_msg = ("KBinsDiscretizer received an invalid number of bins "
               "at indices 0, 2. Number of bins must be at least 2, "
               "and must be an int.")
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)


@pytest.mark.parametrize(
    'strategy, expected',
    [('uniform', [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]]),
     ('kmeans', [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]]),
     ('quantile', [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]])])
def test_fit_transform_n_bins_array(strategy, expected):
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode='ordinal',
                           strategy=strategy).fit(X)
    assert_array_equal(expected, est.transform(X))

    # test the shape of bin_edges_
    n_features = np.array(X).shape[1]
    assert est.bin_edges_.shape == (n_features, )
    for bin_edges, n_bins in zip(est.bin_edges_, est.n_bins_):
        assert bin_edges.shape == (n_bins + 1, )


def test_invalid_n_features():
    est = KBinsDiscretizer(n_bins=3).fit(X)
    bad_X = np.arange(25).reshape(5, -1)
    err_msg = "Incorrect number of features. Expecting 4, received 5"
    with pytest.raises(ValueError, match=err_msg):
        est.transform(bad_X)


@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
def test_same_min_max(strategy):
    warnings.simplefilter("always")
    X = np.array([[1, -2],
                  [1, -1],
                  [1, 0],
                  [1, 1]])
    est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode='ordinal')
    assert_warns_message(UserWarning,
                         "Feature 0 is constant and will be replaced "
                         "with 0.", est.fit, X)
    assert est.n_bins_[0] == 1
    # replace the feature with zeros
    Xt = est.transform(X)
    assert_array_equal(Xt[:, 0], np.zeros(X.shape[0]))


def test_transform_1d_behavior():
    X = np.arange(4)
    est = KBinsDiscretizer(n_bins=2)
    with pytest.raises(ValueError):
        est.fit(X)

    est = KBinsDiscretizer(n_bins=2)
    est.fit(X.reshape(-1, 1))
    with pytest.raises(ValueError):
        est.transform(X)


@pytest.mark.parametrize('i', range(1, 9))
def test_numeric_stability(i):
    X_init = np.array([2., 4., 6., 8., 10.]).reshape(-1, 1)
    Xt_expected = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)

    # Test up to discretizing nano units
    X = X_init / 10**i
    Xt = KBinsDiscretizer(n_bins=2, encode='ordinal').fit_transform(X)
    assert_array_equal(Xt_expected, Xt)


def test_invalid_encode_option():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode='invalid-encode')
    err_msg = (r"Valid options for 'encode' are "
               r"\('onehot', 'onehot-dense', 'ordinal'\). "
               r"Got encode='invalid-encode' instead.")
    with pytest.raises(ValueError, match=err_msg):
        est.fit(X)


def test_encode_options():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='ordinal').fit(X)
    Xt_1 = est.transform(X)
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot-dense').fit(X)
    Xt_2 = est.transform(X)
    assert not sp.issparse(Xt_2)
    assert_array_equal(OneHotEncoder(
                           categories=[np.arange(i) for i in [2, 3, 3, 3]],
                           sparse=False)
                       .fit_transform(Xt_1), Xt_2)
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot').fit(X)
    Xt_3 = est.transform(X)
    assert sp.issparse(Xt_3)
    assert_array_equal(OneHotEncoder(
                           categories=[np.arange(i) for i in [2, 3, 3, 3]],
                           sparse=True)
                       .fit_transform(Xt_1).toarray(),
                       Xt_3.toarray())


def test_invalid_strategy_option():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], strategy='invalid-strategy')
    err_msg = (r"Valid options for 'strategy' are "
               r"\('uniform', 'quantile', 'kmeans'\). "
               r"Got strategy='invalid-strategy' instead.")
    with pytest.raises(ValueError, match=err_msg):
        est.fit(X)


@pytest.mark.parametrize(
    'strategy, expected_2bins, expected_3bins, expected_5bins',
    [('uniform', [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]),
     ('kmeans', [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]),
     ('quantile', [0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2], [0, 1, 2, 3, 4, 4])])
def test_nonuniform_strategies(
        strategy, expected_2bins, expected_3bins, expected_5bins):
    X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)

    # with 2 bins
    est = KBinsDiscretizer(n_bins=2, strategy=strategy, encode='ordinal')
    Xt = est.fit_transform(X)
    assert_array_equal(expected_2bins, Xt.ravel())

    # with 3 bins
    est = KBinsDiscretizer(n_bins=3, strategy=strategy, encode='ordinal')
    Xt = est.fit_transform(X)
    assert_array_equal(expected_3bins, Xt.ravel())

    # with 5 bins
    est = KBinsDiscretizer(n_bins=5, strategy=strategy, encode='ordinal')
    Xt = est.fit_transform(X)
    assert_array_equal(expected_5bins, Xt.ravel())


@pytest.mark.parametrize(
    'strategy, expected_inv',
    [('uniform', [[-1.5, 2., -3.5, -0.5], [-0.5, 3., -2.5, -0.5],
                  [0.5, 4., -1.5, 0.5], [0.5, 4., -1.5, 1.5]]),
     ('kmeans', [[-1.375, 2.125, -3.375, -0.5625],
                 [-1.375, 2.125, -3.375, -0.5625],
                 [-0.125, 3.375, -2.125, 0.5625],
                 [0.75, 4.25, -1.25, 1.625]]),
     ('quantile', [[-1.5, 2., -3.5, -0.75], [-0.5, 3., -2.5, 0.],
                   [0.5, 4., -1.5, 1.25], [0.5, 4., -1.5, 1.25]])])
@pytest.mark.parametrize('encode', ['ordinal', 'onehot', 'onehot-dense'])
def test_inverse_transform(strategy, encode, expected_inv):
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    Xinv = kbd.inverse_transform(Xt)
    assert_array_almost_equal(expected_inv, Xinv)


@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
def test_transform_outside_fit_range(strategy):
    X = np.array([0, 1, 2, 3])[:, None]
    kbd = KBinsDiscretizer(n_bins=4, strategy=strategy, encode='ordinal')
    kbd.fit(X)

    X2 = np.array([-2, 5])[:, None]
    X2t = kbd.transform(X2)
    assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(X2t.min(axis=0), [0])


def test_overwrite():
    X = np.array([0, 1, 2, 3])[:, None]
    X_before = X.copy()

    est = KBinsDiscretizer(n_bins=3, encode="ordinal")
    Xt = est.fit_transform(X)
    assert_array_equal(X, X_before)

    Xt_before = Xt.copy()
    Xinv = est.inverse_transform(Xt)
    assert_array_equal(Xt, Xt_before)
    assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))


@pytest.mark.parametrize(
    'strategy, expected_bin_edges',
    [('quantile', [0, 1, 3]), ('kmeans', [0, 1.5, 3])])
def test_redundant_bins(strategy, expected_bin_edges):
    X = [[0], [0], [0], [0], [3], [3]]
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy)
    msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 "
           "are removed. Consider decreasing the number of bins.")
    assert_warns_message(UserWarning, msg, kbd.fit, X)
    assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges)


def test_percentile_numeric_stability():
    X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
    bin_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
    Xt = np.array([0, 0, 4]).reshape(-1, 1)
    kbd = KBinsDiscretizer(n_bins=10, encode='ordinal',
                           strategy='quantile')
    msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 "
           "are removed. Consider decreasing the number of bins.")
    assert_warns_message(UserWarning, msg, kbd.fit, X)
    assert_array_almost_equal(kbd.bin_edges_[0], bin_edges)
    assert_array_almost_equal(kbd.transform(X), Xt)
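A minimal sketch of the KBinsDiscretizer API exercised above (not part of the uploaded file; the data and names are illustrative):

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

data = np.array([[-3.0], [0.5], [4.0], [10.0]])  # one continuous feature
kbd = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
codes = kbd.fit_transform(data)        # bin indices in [0, n_bins), shape (4, 1)
edges = kbd.bin_edges_[0]              # learned edges, length n_bins + 1
approx = kbd.inverse_transform(codes)  # returns bin centers, not the original values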
698
venv/Lib/site-packages/sklearn/preprocessing/tests/test_encoders.py
Normal file
@@ -0,0 +1,698 @@
# -*- coding: utf-8 -*-

import re

import numpy as np
from scipy import sparse
import pytest

from sklearn.exceptions import NotFittedError
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder


def test_one_hot_encoder_sparse_dense():
    # check that sparse and dense will give the same results

    X = np.array([[3, 2, 1], [0, 1, 1]])
    enc_sparse = OneHotEncoder()
    enc_dense = OneHotEncoder(sparse=False)

    X_trans_sparse = enc_sparse.fit_transform(X)
    X_trans_dense = enc_dense.fit_transform(X)

    assert X_trans_sparse.shape == (2, 5)
    assert X_trans_dense.shape == (2, 5)

    assert sparse.issparse(X_trans_sparse)
    assert not sparse.issparse(X_trans_dense)

    # check outcome
    assert_array_equal(X_trans_sparse.toarray(), [[0., 1., 0., 1., 1.],
                                                  [1., 0., 1., 0., 1.]])
    assert_array_equal(X_trans_sparse.toarray(), X_trans_dense)


def test_one_hot_encoder_diff_n_features():
    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
    X2 = np.array([[1, 0]])
    enc = OneHotEncoder()
    enc.fit(X)
    err_msg = ("The number of features in X is different to the number of "
               "features of the fitted data.")
    with pytest.raises(ValueError, match=err_msg):
        enc.transform(X2)


def test_one_hot_encoder_handle_unknown():
    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
    X2 = np.array([[4, 1, 1]])

    # Test that one hot encoder raises error for unknown features
    # present during transform.
    oh = OneHotEncoder(handle_unknown='error')
    oh.fit(X)
    with pytest.raises(ValueError, match='Found unknown categories'):
        oh.transform(X2)

    # Test the ignore option, ignores unknown features (giving all 0's)
    oh = OneHotEncoder(handle_unknown='ignore')
    oh.fit(X)
    X2_passed = X2.copy()
    assert_array_equal(
        oh.transform(X2_passed).toarray(),
        np.array([[0., 0., 0., 0., 1., 0., 0.]]))
    # ensure transformed data was not modified in place
    assert_allclose(X2, X2_passed)

    # Raise error if handle_unknown is neither ignore nor error.
    oh = OneHotEncoder(handle_unknown='42')
    with pytest.raises(ValueError, match='handle_unknown should be either'):
        oh.fit(X)


def test_one_hot_encoder_not_fitted():
    X = np.array([['a'], ['b']])
    enc = OneHotEncoder(categories=['a', 'b'])
    msg = ("This OneHotEncoder instance is not fitted yet. "
           "Call 'fit' with appropriate arguments before using this "
           "estimator.")
    with pytest.raises(NotFittedError, match=msg):
        enc.transform(X)


def test_one_hot_encoder_handle_unknown_strings():
    X = np.array(['11111111', '22', '333', '4444']).reshape((-1, 1))
    X2 = np.array(['55555', '22']).reshape((-1, 1))
    # Non-regression test for issue #12470
    # Test the ignore option, when categories are numpy string dtype
    # particularly when the known category strings are larger
    # than the unknown category strings
    oh = OneHotEncoder(handle_unknown='ignore')
    oh.fit(X)
    X2_passed = X2.copy()
    assert_array_equal(
        oh.transform(X2_passed).toarray(),
        np.array([[0., 0., 0., 0.], [0., 1., 0., 0.]]))
    # ensure transformed data was not modified in place
    assert_array_equal(X2, X2_passed)


@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
@pytest.mark.parametrize("input_dtype", [np.int32, np.float32, np.float64])
def test_one_hot_encoder_dtype(input_dtype, output_dtype):
    X = np.asarray([[0, 1]], dtype=input_dtype).T
    X_expected = np.asarray([[1, 0], [0, 1]], dtype=output_dtype)

    oh = OneHotEncoder(categories='auto', dtype=output_dtype)
    assert_array_equal(oh.fit_transform(X).toarray(), X_expected)
    assert_array_equal(oh.fit(X).transform(X).toarray(), X_expected)

    oh = OneHotEncoder(categories='auto', dtype=output_dtype, sparse=False)
    assert_array_equal(oh.fit_transform(X), X_expected)
    assert_array_equal(oh.fit(X).transform(X), X_expected)


@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
def test_one_hot_encoder_dtype_pandas(output_dtype):
    pd = pytest.importorskip('pandas')

    X_df = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})
    X_expected = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=output_dtype)

    oh = OneHotEncoder(dtype=output_dtype)
    assert_array_equal(oh.fit_transform(X_df).toarray(), X_expected)
    assert_array_equal(oh.fit(X_df).transform(X_df).toarray(), X_expected)

    oh = OneHotEncoder(dtype=output_dtype, sparse=False)
    assert_array_equal(oh.fit_transform(X_df), X_expected)
    assert_array_equal(oh.fit(X_df).transform(X_df), X_expected)


def test_one_hot_encoder_feature_names():
    enc = OneHotEncoder()
    X = [['Male', 1, 'girl', 2, 3],
         ['Female', 41, 'girl', 1, 10],
         ['Male', 51, 'boy', 12, 3],
         ['Male', 91, 'girl', 21, 30]]

    enc.fit(X)
    feature_names = enc.get_feature_names()
    assert isinstance(feature_names, np.ndarray)

    assert_array_equal(['x0_Female', 'x0_Male',
                        'x1_1', 'x1_41', 'x1_51', 'x1_91',
                        'x2_boy', 'x2_girl',
                        'x3_1', 'x3_2', 'x3_12', 'x3_21',
                        'x4_3',
                        'x4_10', 'x4_30'], feature_names)

    feature_names2 = enc.get_feature_names(['one', 'two',
                                            'three', 'four', 'five'])

    assert_array_equal(['one_Female', 'one_Male',
                        'two_1', 'two_41', 'two_51', 'two_91',
                        'three_boy', 'three_girl',
                        'four_1', 'four_2', 'four_12', 'four_21',
                        'five_3', 'five_10', 'five_30'], feature_names2)

    with pytest.raises(ValueError, match="input_features should have length"):
        enc.get_feature_names(['one', 'two'])


def test_one_hot_encoder_feature_names_unicode():
    enc = OneHotEncoder()
    X = np.array([['c❤t1', 'dat2']], dtype=object).T
    enc.fit(X)
    feature_names = enc.get_feature_names()
    assert_array_equal(['x0_c❤t1', 'x0_dat2'], feature_names)
    feature_names = enc.get_feature_names(input_features=['n👍me'])
    assert_array_equal(['n👍me_c❤t1', 'n👍me_dat2'], feature_names)


def test_one_hot_encoder_set_params():
    X = np.array([[1, 2]]).T
    oh = OneHotEncoder()
    # set params on not yet fitted object
    oh.set_params(categories=[[0, 1, 2, 3]])
    assert oh.get_params()['categories'] == [[0, 1, 2, 3]]
    assert oh.fit_transform(X).toarray().shape == (2, 4)
    # set params on already fitted object
    oh.set_params(categories=[[0, 1, 2, 3, 4]])
    assert oh.fit_transform(X).toarray().shape == (2, 5)


def check_categorical_onehot(X):
    enc = OneHotEncoder(categories='auto')
    Xtr1 = enc.fit_transform(X)

    enc = OneHotEncoder(categories='auto', sparse=False)
    Xtr2 = enc.fit_transform(X)

    assert_allclose(Xtr1.toarray(), Xtr2)

    assert sparse.isspmatrix_csr(Xtr1)
    return Xtr1.toarray()


@pytest.mark.parametrize("X", [
    [['def', 1, 55], ['abc', 2, 55]],
    np.array([[10, 1, 55], [5, 2, 55]]),
    np.array([['b', 'A', 'cat'], ['a', 'B', 'cat']], dtype=object)
    ], ids=['mixed', 'numeric', 'object'])
def test_one_hot_encoder(X):
    Xtr = check_categorical_onehot(np.array(X)[:, [0]])
    assert_allclose(Xtr, [[0, 1], [1, 0]])

    Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]])
    assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]])

    Xtr = OneHotEncoder(categories='auto').fit_transform(X)
    assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]])


@pytest.mark.parametrize('sparse_', [False, True])
@pytest.mark.parametrize('drop', [None, 'first'])
def test_one_hot_encoder_inverse(sparse_, drop):
    X = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]]
    enc = OneHotEncoder(sparse=sparse_, drop=drop)
    X_tr = enc.fit_transform(X)
    exp = np.array(X, dtype=object)
    assert_array_equal(enc.inverse_transform(X_tr), exp)

    X = [[2, 55], [1, 55], [3, 55]]
    enc = OneHotEncoder(sparse=sparse_, categories='auto',
                        drop=drop)
    X_tr = enc.fit_transform(X)
    exp = np.array(X)
    assert_array_equal(enc.inverse_transform(X_tr), exp)

    if drop is None:
        # with unknown categories
        # drop is incompatible with handle_unknown=ignore
        X = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]]
        enc = OneHotEncoder(sparse=sparse_, handle_unknown='ignore',
                            categories=[['abc', 'def'], [1, 2],
                                        [54, 55, 56]])
        X_tr = enc.fit_transform(X)
        exp = np.array(X, dtype=object)
        exp[2, 1] = None
        assert_array_equal(enc.inverse_transform(X_tr), exp)

        # with an otherwise numerical output, still object if unknown
        X = [[2, 55], [1, 55], [3, 55]]
        enc = OneHotEncoder(sparse=sparse_, categories=[[1, 2], [54, 56]],
                            handle_unknown='ignore')
        X_tr = enc.fit_transform(X)
        exp = np.array(X, dtype=object)
        exp[2, 0] = None
        exp[:, 1] = None
        assert_array_equal(enc.inverse_transform(X_tr), exp)

    # incorrect shape raises
    X_tr = np.array([[0, 1, 1], [1, 0, 1]])
    msg = re.escape('Shape of the passed X data is not correct')
    with pytest.raises(ValueError, match=msg):
        enc.inverse_transform(X_tr)


def test_one_hot_encoder_inverse_if_binary():
    X = np.array([['Male', 1],
                  ['Female', 3],
                  ['Female', 2]], dtype=object)
    ohe = OneHotEncoder(drop='if_binary', sparse=False)
    X_tr = ohe.fit_transform(X)
    assert_array_equal(ohe.inverse_transform(X_tr), X)


# check that resetting drop option without refitting does not throw an error
@pytest.mark.parametrize('drop', ['if_binary', 'first', None])
@pytest.mark.parametrize('reset_drop', ['if_binary', 'first', None])
def test_one_hot_encoder_drop_reset(drop, reset_drop):
    X = np.array([['Male', 1],
                  ['Female', 3],
                  ['Female', 2]], dtype=object)
    ohe = OneHotEncoder(drop=drop, sparse=False)
    ohe.fit(X)
    X_tr = ohe.transform(X)
    feature_names = ohe.get_feature_names()
    ohe.set_params(drop=reset_drop)
    assert_array_equal(ohe.inverse_transform(X_tr), X)
    assert_allclose(ohe.transform(X), X_tr)
    assert_array_equal(ohe.get_feature_names(), feature_names)


@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
@pytest.mark.parametrize("X", [
    [1, 2],
    np.array([3., 4.])
    ])
def test_X_is_not_1D(X, method):
    oh = OneHotEncoder()

    msg = ("Expected 2D array, got 1D array instead")
    with pytest.raises(ValueError, match=msg):
        getattr(oh, method)(X)


@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
def test_X_is_not_1D_pandas(method):
    pd = pytest.importorskip('pandas')
    X = pd.Series([6, 3, 4, 6])
    oh = OneHotEncoder()

    msg = ("Expected 2D array, got 1D array instead")
    with pytest.raises(ValueError, match=msg):
        getattr(oh, method)(X)


@pytest.mark.parametrize("X, cat_exp, cat_dtype", [
    ([['abc', 55], ['def', 55]], [['abc', 'def'], [55]], np.object_),
    (np.array([[1, 2], [3, 2]]), [[1, 3], [2]], np.integer),
    (np.array([['A', 'cat'], ['B', 'cat']], dtype=object),
     [['A', 'B'], ['cat']], np.object_),
    (np.array([['A', 'cat'], ['B', 'cat']]),
     [['A', 'B'], ['cat']], np.str_)
    ], ids=['mixed', 'numeric', 'object', 'string'])
def test_one_hot_encoder_categories(X, cat_exp, cat_dtype):
    # order of categories should not depend on order of samples
    for Xi in [X, X[::-1]]:
        enc = OneHotEncoder(categories='auto')
        enc.fit(Xi)
        # assert enc.categories == 'auto'
        assert isinstance(enc.categories_, list)
        for res, exp in zip(enc.categories_, cat_exp):
            assert res.tolist() == exp
            assert np.issubdtype(res.dtype, cat_dtype)


@pytest.mark.parametrize("X, X2, cats, cat_dtype", [
    (np.array([['a', 'b']], dtype=object).T,
     np.array([['a', 'd']], dtype=object).T,
     [['a', 'b', 'c']], np.object_),
    (np.array([[1, 2]], dtype='int64').T,
     np.array([[1, 4]], dtype='int64').T,
     [[1, 2, 3]], np.int64),
    (np.array([['a', 'b']], dtype=object).T,
     np.array([['a', 'd']], dtype=object).T,
     [np.array(['a', 'b', 'c'])], np.object_),
    ], ids=['object', 'numeric', 'object-string-cat'])
def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
    enc = OneHotEncoder(categories=cats)
    exp = np.array([[1., 0., 0.],
                    [0., 1., 0.]])
    assert_array_equal(enc.fit_transform(X).toarray(), exp)
    assert list(enc.categories[0]) == list(cats[0])
    assert enc.categories_[0].tolist() == list(cats[0])
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert enc.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    enc = OneHotEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        enc.fit(X2)
    enc = OneHotEncoder(categories=cats, handle_unknown='ignore')
    exp = np.array([[1., 0., 0.], [0., 0., 0.]])
    assert_array_equal(enc.fit(X2).transform(X2).toarray(), exp)


def test_one_hot_encoder_unsorted_categories():
    X = np.array([['a', 'b']], dtype=object).T

    enc = OneHotEncoder(categories=[['b', 'a', 'c']])
    exp = np.array([[0., 1., 0.],
                    [1., 0., 0.]])
    assert_array_equal(enc.fit(X).transform(X).toarray(), exp)
    assert_array_equal(enc.fit_transform(X).toarray(), exp)
    assert enc.categories_[0].tolist() == ['b', 'a', 'c']
    assert np.issubdtype(enc.categories_[0].dtype, np.object_)

    # unsorted passed categories still raise for numerical values
    X = np.array([[1, 2]]).T
    enc = OneHotEncoder(categories=[[2, 1, 3]])
    msg = 'Unsorted categories are not supported'
    with pytest.raises(ValueError, match=msg):
        enc.fit_transform(X)


def test_one_hot_encoder_specified_categories_mixed_columns():
    # multiple columns
    X = np.array([['a', 'b'], [0, 2]], dtype=object).T
    enc = OneHotEncoder(categories=[['a', 'b', 'c'], [0, 1, 2]])
    exp = np.array([[1., 0., 0., 1., 0., 0.],
                    [0., 1., 0., 0., 0., 1.]])
    assert_array_equal(enc.fit_transform(X).toarray(), exp)
    assert enc.categories_[0].tolist() == ['a', 'b', 'c']
    assert np.issubdtype(enc.categories_[0].dtype, np.object_)
    assert enc.categories_[1].tolist() == [0, 1, 2]
    # integer categories but from object dtype data
    assert np.issubdtype(enc.categories_[1].dtype, np.object_)


def test_one_hot_encoder_pandas():
    pd = pytest.importorskip('pandas')

    X_df = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})

    Xtr = check_categorical_onehot(X_df)
    assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])


@pytest.mark.parametrize("drop, expected_names",
                         [('first', ['x0_c', 'x2_b']),
                          ('if_binary', ['x0_c', 'x1_2', 'x2_b']),
                          (['c', 2, 'b'], ['x0_b', 'x2_a'])],
                         ids=['first', 'binary', 'manual'])
def test_one_hot_encoder_feature_names_drop(drop, expected_names):
    X = [['c', 2, 'a'],
         ['b', 2, 'b']]

    ohe = OneHotEncoder(drop=drop)
    ohe.fit(X)
    feature_names = ohe.get_feature_names()
    assert isinstance(feature_names, np.ndarray)
    assert_array_equal(expected_names, feature_names)


def test_one_hot_encoder_drop_equals_if_binary():
    # Canonical case
    X = [[10, 'yes'],
         [20, 'no'],
         [30, 'yes']]
    expected = np.array([[1., 0., 0., 1.],
                         [0., 1., 0., 0.],
                         [0., 0., 1., 1.]])
    expected_drop_idx = np.array([None, 0])

    ohe = OneHotEncoder(drop='if_binary', sparse=False)
    result = ohe.fit_transform(X)
    assert_array_equal(ohe.drop_idx_, expected_drop_idx)
    assert_allclose(result, expected)

    # with only one cat, the behaviour is equivalent to drop=None
    X = [['true', 'a'],
         ['false', 'a'],
         ['false', 'a']]
    expected = np.array([[1., 1.],
                         [0., 1.],
                         [0., 1.]])
    expected_drop_idx = np.array([0, None])

    ohe = OneHotEncoder(drop='if_binary', sparse=False)
    result = ohe.fit_transform(X)
    assert_array_equal(ohe.drop_idx_, expected_drop_idx)
    assert_allclose(result, expected)


@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
                               np.array([['a', np.nan]], dtype=object).T],
                         ids=['numeric', 'object'])
@pytest.mark.parametrize("as_data_frame", [False, True],
                         ids=['array', 'dataframe'])
@pytest.mark.parametrize("handle_unknown", ['error', 'ignore'])
def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
    if as_data_frame:
        pd = pytest.importorskip('pandas')
        X = pd.DataFrame(X)

    ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit(X)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit_transform(X)

    if as_data_frame:
        X_partial = X.iloc[:1, :]
    else:
        X_partial = X[:1, :]

    ohe.fit(X_partial)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.transform(X)


@pytest.mark.parametrize("X", [
    [['abc', 2, 55], ['def', 1, 55]],
    np.array([[10, 2, 55], [20, 1, 55]]),
    np.array([['a', 'B', 'cat'], ['b', 'A', 'cat']], dtype=object)
    ], ids=['mixed', 'numeric', 'object'])
def test_ordinal_encoder(X):
    enc = OrdinalEncoder()
    exp = np.array([[0, 1, 0],
                    [1, 0, 0]], dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp.astype('float64'))
    enc = OrdinalEncoder(dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp)


@pytest.mark.parametrize("X, X2, cats, cat_dtype", [
    (np.array([['a', 'b']], dtype=object).T,
     np.array([['a', 'd']], dtype=object).T,
     [['a', 'b', 'c']], np.object_),
    (np.array([[1, 2]], dtype='int64').T,
     np.array([[1, 4]], dtype='int64').T,
     [[1, 2, 3]], np.int64),
    (np.array([['a', 'b']], dtype=object).T,
     np.array([['a', 'd']], dtype=object).T,
     [np.array(['a', 'b', 'c'])], np.object_),
    ], ids=['object', 'numeric', 'object-string-cat'])
def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
    enc = OrdinalEncoder(categories=cats)
    exp = np.array([[0.], [1.]])
    assert_array_equal(enc.fit_transform(X), exp)
    assert list(enc.categories[0]) == list(cats[0])
    assert enc.categories_[0].tolist() == list(cats[0])
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert enc.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    enc = OrdinalEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        enc.fit(X2)


def test_ordinal_encoder_inverse():
    X = [['abc', 2, 55], ['def', 1, 55]]
    enc = OrdinalEncoder()
    X_tr = enc.fit_transform(X)
    exp = np.array(X, dtype=object)
    assert_array_equal(enc.inverse_transform(X_tr), exp)

    # incorrect shape raises
    X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    msg = re.escape('Shape of the passed X data is not correct')
    with pytest.raises(ValueError, match=msg):
        enc.inverse_transform(X_tr)


@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
                               np.array([['a', np.nan]], dtype=object).T],
                         ids=['numeric', 'object'])
def test_ordinal_encoder_raise_missing(X):
    ohe = OrdinalEncoder()

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit(X)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit_transform(X)

    ohe.fit(X[:1, :])

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.transform(X)


def test_ordinal_encoder_raise_categories_shape():

    X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T
    cats = ['Low', 'Medium', 'High']
    enc = OrdinalEncoder(categories=cats)
    msg = ("Shape mismatch: if categories is an array,")

    with pytest.raises(ValueError, match=msg):
        enc.fit(X)


def test_encoder_dtypes():
    # check that dtypes are preserved when determining categories
    enc = OneHotEncoder(categories='auto')
    exp = np.array([[1., 0., 1., 0.], [0., 1., 0., 1.]], dtype='float64')

    for X in [np.array([[1, 2], [3, 4]], dtype='int64'),
              np.array([[1, 2], [3, 4]], dtype='float64'),
              np.array([['a', 'b'], ['c', 'd']]),  # string dtype
              np.array([[1, 'a'], [3, 'b']], dtype='object')]:
        enc.fit(X)
        assert all([enc.categories_[i].dtype == X.dtype for i in range(2)])
        assert_array_equal(enc.transform(X).toarray(), exp)

    X = [[1, 2], [3, 4]]
    enc.fit(X)
    assert all([np.issubdtype(enc.categories_[i].dtype, np.integer)
                for i in range(2)])
    assert_array_equal(enc.transform(X).toarray(), exp)

    X = [[1, 'a'], [3, 'b']]
    enc.fit(X)
    assert all([enc.categories_[i].dtype == 'object' for i in range(2)])
    assert_array_equal(enc.transform(X).toarray(), exp)


def test_encoder_dtypes_pandas():
    # check dtype (similar to test_categorical_encoder_dtypes for dataframes)
    pd = pytest.importorskip('pandas')

    enc = OneHotEncoder(categories='auto')
    exp = np.array([[1., 0., 1., 0., 1., 0.],
                    [0., 1., 0., 1., 0., 1.]], dtype='float64')

    X = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}, dtype='int64')
    enc.fit(X)
    assert all([enc.categories_[i].dtype == 'int64' for i in range(2)])
    assert_array_equal(enc.transform(X).toarray(), exp)

    X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]})
    X_type = [X['A'].dtype, X['B'].dtype, X['C'].dtype]
    enc.fit(X)
    assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)])
    assert_array_equal(enc.transform(X).toarray(), exp)


def test_one_hot_encoder_warning():
    enc = OneHotEncoder()
    X = [['Male', 1], ['Female', 3]]
    np.testing.assert_no_warnings(enc.fit_transform, X)


def test_one_hot_encoder_drop_manual():
    cats_to_drop = ['def', 12, 3, 56]
    enc = OneHotEncoder(drop=cats_to_drop)
    X = [['abc', 12, 2, 55],
         ['def', 12, 1, 55],
         ['def', 12, 3, 56]]
    trans = enc.fit_transform(X).toarray()
    exp = [[1, 0, 1, 1],
           [0, 1, 0, 1],
           [0, 0, 0, 0]]
    assert_array_equal(trans, exp)
    dropped_cats = [cat[feature]
                    for cat, feature in zip(enc.categories_,
                                            enc.drop_idx_)]
    assert_array_equal(dropped_cats, cats_to_drop)
    assert_array_equal(np.array(X, dtype=object),
                       enc.inverse_transform(trans))


@pytest.mark.parametrize(
    "X_fit, params, err_msg",
    [([["Male"], ["Female"]], {'drop': 'second'},
      "Wrong input for parameter `drop`"),
     ([["Male"], ["Female"]], {'drop': 'first', 'handle_unknown': 'ignore'},
      "`handle_unknown` must be 'error'"),
     ([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
      {'drop': np.asarray('b', dtype=object)},
      "Wrong input for parameter `drop`"),
     ([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
      {'drop': ['ghi', 3, 59]},
      "The following categories were supposed")]
)
def test_one_hot_encoder_invalid_params(X_fit, params, err_msg):
    enc = OneHotEncoder(**params)
    with pytest.raises(ValueError, match=err_msg):
        enc.fit(X_fit)


@pytest.mark.parametrize('drop', [['abc', 3], ['abc', 3, 41, 'a']])
def test_invalid_drop_length(drop):
    enc = OneHotEncoder(drop=drop)
    err_msg = "`drop` should have length equal to the number"
    with pytest.raises(ValueError, match=err_msg):
        enc.fit([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]])


@pytest.mark.parametrize("density", [True, False],
                         ids=['sparse', 'dense'])
@pytest.mark.parametrize("drop", ['first',
                                  ['a', 2, 'b']],
                         ids=['first', 'manual'])
def test_categories(density, drop):
    ohe_base = OneHotEncoder(sparse=density)
    ohe_test = OneHotEncoder(sparse=density, drop=drop)
    X = [['c', 1, 'a'],
         ['a', 2, 'b']]
    ohe_base.fit(X)
    ohe_test.fit(X)
    assert_array_equal(ohe_base.categories_, ohe_test.categories_)
    if drop == 'first':
        assert_array_equal(ohe_test.drop_idx_, 0)
    else:
        for drop_cat, drop_idx, cat_list in zip(drop,
                                                ohe_test.drop_idx_,
                                                ohe_test.categories_):
            assert cat_list[int(drop_idx)] == drop_cat
    assert isinstance(ohe_test.drop_idx_, np.ndarray)
    assert ohe_test.drop_idx_.dtype == np.object


@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
def test_encoders_has_categorical_tags(Encoder):
    assert 'categorical' in Encoder()._get_tags()['X_types']


@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
def test_encoders_does_not_support_none_values(Encoder):
    values = [["a"], [None]]
    with pytest.raises(TypeError, match="Encoders require their input to be "
                                        "uniformly strings or numbers."):
        Encoder().fit(values)
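A minimal sketch of the two encoders exercised above (not part of the uploaded file; the data is illustrative):

import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

X = np.array([['red', 'S'], ['blue', 'M'], ['red', 'L']], dtype=object)

ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
onehot = ohe.fit_transform(X)           # one column per (feature, category) pair
X_back = ohe.inverse_transform(onehot)  # recovers the original labels

ord_enc = OrdinalEncoder()
codes = ord_enc.fit_transform(X)        # one integer-coded column per feature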
160
venv/Lib/site-packages/sklearn/preprocessing/tests/test_function_transformer.py
Normal file
@@ -0,0 +1,160 @@
import pytest
import numpy as np
from scipy import sparse

from sklearn.preprocessing import FunctionTransformer
from sklearn.utils._testing import (assert_array_equal,
                                    assert_allclose_dense_sparse)
from sklearn.utils._testing import assert_warns_message, assert_no_warnings


def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X):
    def _func(X, *args, **kwargs):
        args_store.append(X)
        args_store.extend(args)
        kwargs_store.update(kwargs)
        return func(X)

    return _func


def test_delegate_to_func():
    # (args|kwargs)_store will hold the positional and keyword arguments
    # passed to the function inside the FunctionTransformer.
    args_store = []
    kwargs_store = {}
    X = np.arange(10).reshape((5, 2))
    assert_array_equal(
        FunctionTransformer(_make_func(args_store, kwargs_store)).transform(X),
        X, 'transform should have returned X unchanged',
    )

    # The function should only have received X.
    assert args_store == [X], ('Incorrect positional arguments passed to '
                               'func: {args}'.format(args=args_store))

    assert not kwargs_store, ('Unexpected keyword arguments passed to '
                              'func: {args}'.format(args=kwargs_store))

    # reset the argument stores.
    args_store[:] = []
    kwargs_store.clear()
    transformed = FunctionTransformer(
        _make_func(args_store, kwargs_store),
    ).transform(X)

    assert_array_equal(transformed, X,
                       err_msg='transform should have returned X unchanged')

    # The function should have received X
    assert args_store == [X], ('Incorrect positional arguments passed '
                               'to func: {args}'.format(args=args_store))

    assert not kwargs_store, ('Unexpected keyword arguments passed to '
                              'func: {args}'.format(args=kwargs_store))


def test_np_log():
    X = np.arange(10).reshape((5, 2))

    # Test that the numpy.log example still works.
    assert_array_equal(
        FunctionTransformer(np.log1p).transform(X),
        np.log1p(X),
    )


def test_kw_arg():
    X = np.linspace(0, 1, num=10).reshape((5, 2))

    F = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    # Test that rounding is correct
    assert_array_equal(F.transform(X),
                       np.around(X, decimals=3))


def test_kw_arg_update():
    X = np.linspace(0, 1, num=10).reshape((5, 2))

    F = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    F.kw_args['decimals'] = 1

    # Test that rounding is correct
    assert_array_equal(F.transform(X), np.around(X, decimals=1))


def test_kw_arg_reset():
    X = np.linspace(0, 1, num=10).reshape((5, 2))

    F = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    F.kw_args = dict(decimals=1)

    # Test that rounding is correct
    assert_array_equal(F.transform(X), np.around(X, decimals=1))


def test_inverse_transform():
    X = np.array([1, 4, 9, 16]).reshape((2, 2))

    # Test that inverse_transform works correctly
    F = FunctionTransformer(
        func=np.sqrt,
        inverse_func=np.around, inv_kw_args=dict(decimals=3),
    )
    assert_array_equal(
        F.inverse_transform(F.transform(X)),
        np.around(np.sqrt(X), decimals=3),
    )


def test_check_inverse():
    X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))

    X_list = [X_dense,
              sparse.csr_matrix(X_dense),
              sparse.csc_matrix(X_dense)]

    for X in X_list:
        if sparse.issparse(X):
            accept_sparse = True
        else:
            accept_sparse = False
        trans = FunctionTransformer(func=np.sqrt,
                                    inverse_func=np.around,
                                    accept_sparse=accept_sparse,
                                    check_inverse=True,
                                    validate=True)
        assert_warns_message(UserWarning,
                             "The provided functions are not strictly"
                             " inverse of each other. If you are sure you"
                             " want to proceed regardless, set"
                             " 'check_inverse=False'.",
                             trans.fit, X)

        trans = FunctionTransformer(func=np.expm1,
                                    inverse_func=np.log1p,
                                    accept_sparse=accept_sparse,
                                    check_inverse=True,
                                    validate=True)
        Xt = assert_no_warnings(trans.fit_transform, X)
        assert_allclose_dense_sparse(X, trans.inverse_transform(Xt))

    # check that we don't check inverse when one of the func or inverse is not
    # provided.
    trans = FunctionTransformer(func=np.expm1, inverse_func=None,
                                check_inverse=True, validate=True)
    assert_no_warnings(trans.fit, X_dense)
    trans = FunctionTransformer(func=None, inverse_func=np.expm1,
                                check_inverse=True, validate=True)
    assert_no_warnings(trans.fit, X_dense)


def test_function_transformer_frame():
    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame(np.random.randn(100, 10))
    transformer = FunctionTransformer()
    X_df_trans = transformer.fit_transform(X_df)
    assert hasattr(X_df_trans, 'loc')
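A minimal sketch of the FunctionTransformer round-trip these tests cover (not part of the uploaded file; the log1p/expm1 pair is illustrative):

import numpy as np
from sklearn.preprocessing import FunctionTransformer

log_scale = FunctionTransformer(func=np.log1p, inverse_func=np.expm1,
                                check_inverse=True, validate=True)
X = np.abs(np.random.randn(5, 2))
Xt = log_scale.fit_transform(X)           # elementwise log1p
X_back = log_scale.inverse_transform(Xt)  # round-trips within float tolerance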
656
venv/Lib/site-packages/sklearn/preprocessing/tests/test_label.py
Normal file
@@ -0,0 +1,656 @@
import numpy as np
|
||||
|
||||
import pytest
|
||||
|
||||
from scipy.sparse import issparse
|
||||
from scipy.sparse import coo_matrix
|
||||
from scipy.sparse import csc_matrix
|
||||
from scipy.sparse import csr_matrix
|
||||
from scipy.sparse import dok_matrix
|
||||
from scipy.sparse import lil_matrix
|
||||
|
||||
from sklearn.utils.multiclass import type_of_target
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_warns_message
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
from sklearn.utils import _to_object_array
|
||||
|
||||
from sklearn.preprocessing._label import LabelBinarizer
|
||||
from sklearn.preprocessing._label import MultiLabelBinarizer
|
||||
from sklearn.preprocessing._label import LabelEncoder
|
||||
from sklearn.preprocessing._label import label_binarize
|
||||
|
||||
from sklearn.preprocessing._label import _inverse_binarize_thresholding
|
||||
from sklearn.preprocessing._label import _inverse_binarize_multiclass
|
||||
from sklearn.preprocessing._label import _encode
|
||||
|
||||
from sklearn import datasets
|
||||
|
||||
iris = datasets.load_iris()
|
||||
|
||||
|
||||
def toarray(a):
|
||||
if hasattr(a, "toarray"):
|
||||
a = a.toarray()
|
||||
return a
|
||||
|
||||
|
||||
def test_label_binarizer():
|
||||
# one-class case defaults to negative label
|
||||
# For dense case:
|
||||
inp = ["pos", "pos", "pos", "pos"]
|
||||
lb = LabelBinarizer(sparse_output=False)
|
||||
expected = np.array([[0, 0, 0, 0]]).T
|
||||
got = lb.fit_transform(inp)
|
||||
assert_array_equal(lb.classes_, ["pos"])
|
||||
assert_array_equal(expected, got)
|
||||
assert_array_equal(lb.inverse_transform(got), inp)
|
||||
|
||||
# For sparse case:
|
||||
lb = LabelBinarizer(sparse_output=True)
|
||||
got = lb.fit_transform(inp)
|
||||
assert issparse(got)
|
||||
assert_array_equal(lb.classes_, ["pos"])
|
||||
assert_array_equal(expected, got.toarray())
|
||||
assert_array_equal(lb.inverse_transform(got.toarray()), inp)
|
||||
|
||||
lb = LabelBinarizer(sparse_output=False)
|
||||
# two-class case
|
||||
inp = ["neg", "pos", "pos", "neg"]
|
||||
expected = np.array([[0, 1, 1, 0]]).T
|
||||
got = lb.fit_transform(inp)
|
||||
assert_array_equal(lb.classes_, ["neg", "pos"])
|
||||
assert_array_equal(expected, got)
|
||||
|
||||
to_invert = np.array([[1, 0],
|
||||
[0, 1],
|
||||
[0, 1],
|
||||
[1, 0]])
|
||||
assert_array_equal(lb.inverse_transform(to_invert), inp)
|
||||
|
||||
# multi-class case
|
||||
inp = ["spam", "ham", "eggs", "ham", "0"]
|
||||
expected = np.array([[0, 0, 0, 1],
|
||||
[0, 0, 1, 0],
|
||||
[0, 1, 0, 0],
|
||||
[0, 0, 1, 0],
|
||||
[1, 0, 0, 0]])
|
||||
got = lb.fit_transform(inp)
|
||||
assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam'])
|
||||
assert_array_equal(expected, got)
|
||||
assert_array_equal(lb.inverse_transform(got), inp)
|
||||
|
||||
|
||||
def test_label_binarizer_unseen_labels():
|
||||
lb = LabelBinarizer()
|
||||
|
||||
expected = np.array([[1, 0, 0],
|
||||
[0, 1, 0],
|
||||
[0, 0, 1]])
|
||||
got = lb.fit_transform(['b', 'd', 'e'])
|
||||
assert_array_equal(expected, got)
|
||||
|
||||
expected = np.array([[0, 0, 0],
|
||||
[1, 0, 0],
|
||||
[0, 0, 0],
|
||||
[0, 1, 0],
|
||||
[0, 0, 1],
|
||||
[0, 0, 0]])
|
||||
got = lb.transform(['a', 'b', 'c', 'd', 'e', 'f'])
|
||||
assert_array_equal(expected, got)
|
||||
|
||||
|
||||
def test_label_binarizer_set_label_encoding():
|
||||
lb = LabelBinarizer(neg_label=-2, pos_label=0)
|
||||
|
||||
# two-class case with pos_label=0
|
||||
inp = np.array([0, 1, 1, 0])
|
||||
expected = np.array([[-2, 0, 0, -2]]).T
|
||||
got = lb.fit_transform(inp)
|
||||
assert_array_equal(expected, got)
|
||||
assert_array_equal(lb.inverse_transform(got), inp)
|
||||
|
||||
lb = LabelBinarizer(neg_label=-2, pos_label=2)
|
||||
|
||||
# multi-class case
|
||||
inp = np.array([3, 2, 1, 2, 0])
|
||||
expected = np.array([[-2, -2, -2, +2],
|
||||
[-2, -2, +2, -2],
|
||||
[-2, +2, -2, -2],
|
||||
[-2, -2, +2, -2],
|
||||
[+2, -2, -2, -2]])
|
||||
got = lb.fit_transform(inp)
|
||||
assert_array_equal(expected, got)
|
||||
assert_array_equal(lb.inverse_transform(got), inp)
|
||||
|
||||
|
||||
@ignore_warnings
|
||||
def test_label_binarizer_errors():
|
||||
# Check that invalid arguments yield ValueError
|
||||
one_class = np.array([0, 0, 0, 0])
|
||||
lb = LabelBinarizer().fit(one_class)
|
||||
|
||||
multi_label = [(2, 3), (0,), (0, 2)]
|
||||
with pytest.raises(ValueError):
|
||||
lb.transform(multi_label)
|
||||
|
||||
lb = LabelBinarizer()
|
||||
with pytest.raises(ValueError):
|
||||
lb.transform([])
|
||||
with pytest.raises(ValueError):
|
||||
lb.inverse_transform([])
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
LabelBinarizer(neg_label=2, pos_label=1)
|
||||
with pytest.raises(ValueError):
|
||||
LabelBinarizer(neg_label=2, pos_label=2)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)
|
||||
|
||||
# Fail on y_type
|
||||
with pytest.raises(ValueError):
|
||||
_inverse_binarize_thresholding(y=csr_matrix([[1, 2], [2, 1]]),
|
||||
output_type="foo", classes=[1, 2],
|
||||
threshold=0)
|
||||
|
||||
# Sequence of seq type should raise ValueError
|
||||
y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
|
||||
with pytest.raises(ValueError):
|
||||
LabelBinarizer().fit_transform(y_seq_of_seqs)
|
||||
|
||||
# Fail on the number of classes
|
||||
with pytest.raises(ValueError):
|
||||
_inverse_binarize_thresholding(y=csr_matrix([[1, 2], [2, 1]]),
|
||||
output_type="foo",
|
||||
classes=[1, 2, 3],
|
||||
threshold=0)
|
||||
|
||||
# Fail on the dimension of 'binary'
|
||||
with pytest.raises(ValueError):
|
||||
_inverse_binarize_thresholding(y=np.array([[1, 2, 3], [2, 1, 3]]),
|
||||
output_type="binary",
|
||||
classes=[1, 2, 3],
|
||||
threshold=0)
|
||||
|
||||
# Fail on multioutput data
|
||||
with pytest.raises(ValueError):
|
||||
LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
|
||||
with pytest.raises(ValueError):
|
||||
label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"values, classes, unknown",
|
||||
[(np.array([2, 1, 3, 1, 3], dtype='int64'),
|
||||
np.array([1, 2, 3], dtype='int64'), np.array([4], dtype='int64')),
|
||||
(np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
|
||||
np.array(['a', 'b', 'c'], dtype=object),
|
||||
np.array(['d'], dtype=object)),
|
||||
(np.array(['b', 'a', 'c', 'a', 'c']),
|
||||
np.array(['a', 'b', 'c']), np.array(['d']))],
|
||||
ids=['int64', 'object', 'str'])
|
||||
def test_label_encoder(values, classes, unknown):
|
||||
# Test LabelEncoder's transform, fit_transform and
|
||||
# inverse_transform methods
|
||||
le = LabelEncoder()
|
||||
le.fit(values)
|
||||
assert_array_equal(le.classes_, classes)
|
||||
assert_array_equal(le.transform(values), [1, 0, 2, 0, 2])
|
||||
assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values)
|
||||
le = LabelEncoder()
|
||||
ret = le.fit_transform(values)
|
||||
assert_array_equal(ret, [1, 0, 2, 0, 2])
|
||||
|
||||
with pytest.raises(ValueError, match="unseen labels"):
|
||||
le.transform(unknown)
|
||||
|
||||
|
||||
def test_label_encoder_negative_ints():
|
||||
le = LabelEncoder()
|
||||
le.fit([1, 1, 4, 5, -1, 0])
|
||||
assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
|
||||
assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]),
|
||||
[1, 2, 3, 3, 4, 0, 0])
|
||||
assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]),
|
||||
[0, 1, 4, 4, 5, -1, -1])
|
||||
with pytest.raises(ValueError):
|
||||
le.transform([0, 6])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ['str', 'object'])
|
||||
def test_label_encoder_str_bad_shape(dtype):
|
||||
le = LabelEncoder()
|
||||
le.fit(np.array(["apple", "orange"], dtype=dtype))
|
||||
msg = "should be a 1d array"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
le.transform("apple")
|
||||
|
||||
|
||||
def test_label_encoder_errors():
|
||||
# Check that invalid arguments yield ValueError
|
||||
le = LabelEncoder()
|
||||
with pytest.raises(ValueError):
|
||||
le.transform([])
|
||||
with pytest.raises(ValueError):
|
||||
le.inverse_transform([])
|
||||
|
||||
# Fail on unseen labels
|
||||
le = LabelEncoder()
|
||||
le.fit([1, 2, 3, -1, 1])
|
||||
msg = "contains previously unseen labels"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
le.inverse_transform([-2])
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
le.inverse_transform([-2, -3, -4])
|
||||
|
||||
# Fail on inverse_transform("")
|
||||
msg = r"should be a 1d array.+shape \(\)"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
le.inverse_transform("")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"values",
|
||||
[np.array([2, 1, 3, 1, 3], dtype='int64'),
|
||||
np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
|
||||
np.array(['b', 'a', 'c', 'a', 'c'])],
|
||||
ids=['int64', 'object', 'str'])
|
||||
def test_label_encoder_empty_array(values):
|
||||
le = LabelEncoder()
|
||||
le.fit(values)
|
||||
# test empty transform
|
||||
transformed = le.transform([])
|
||||
assert_array_equal(np.array([]), transformed)
|
||||
# test empty inverse transform
|
||||
inverse_transformed = le.inverse_transform([])
|
||||
assert_array_equal(np.array([]), inverse_transformed)
|
||||
|
||||
|
||||
def test_sparse_output_multilabel_binarizer():
|
||||
# test input as iterable of iterables
|
||||
inputs = [
|
||||
lambda: [(2, 3), (1,), (1, 2)],
|
||||
lambda: ({2, 3}, {1}, {1, 2}),
|
||||
lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
|
||||
]
|
||||
indicator_mat = np.array([[0, 1, 1],
|
||||
[1, 0, 0],
|
||||
[1, 1, 0]])
|
||||
|
||||
inverse = inputs[0]()
|
||||
for sparse_output in [True, False]:
|
||||
for inp in inputs:
|
||||
# With fit_transform
|
||||
mlb = MultiLabelBinarizer(sparse_output=sparse_output)
|
||||
got = mlb.fit_transform(inp())
|
||||
assert issparse(got) == sparse_output
|
||||
if sparse_output:
|
||||
# verify CSR assumption that indices and indptr have same dtype
|
||||
assert got.indices.dtype == got.indptr.dtype
|
||||
got = got.toarray()
|
||||
assert_array_equal(indicator_mat, got)
|
||||
assert_array_equal([1, 2, 3], mlb.classes_)
|
||||
assert mlb.inverse_transform(got) == inverse
|
||||
|
||||
# With fit
|
||||
mlb = MultiLabelBinarizer(sparse_output=sparse_output)
|
||||
got = mlb.fit(inp()).transform(inp())
|
||||
assert issparse(got) == sparse_output
|
||||
if sparse_output:
|
||||
# verify CSR assumption that indices and indptr have same dtype
|
||||
assert got.indices.dtype == got.indptr.dtype
|
||||
got = got.toarray()
|
||||
assert_array_equal(indicator_mat, got)
|
||||
assert_array_equal([1, 2, 3], mlb.classes_)
|
||||
assert mlb.inverse_transform(got) == inverse
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
mlb.inverse_transform(csr_matrix(np.array([[0, 1, 1],
|
||||
[2, 0, 0],
|
||||
[1, 1, 0]])))
|
||||
|
||||
|
def test_multilabel_binarizer():
    # test input as iterable of iterables
    inputs = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    indicator_mat = np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 1, 0]])
    inverse = inputs[0]()
    for inp in inputs:
        # With fit_transform
        mlb = MultiLabelBinarizer()
        got = mlb.fit_transform(inp())
        assert_array_equal(indicator_mat, got)
        assert_array_equal([1, 2, 3], mlb.classes_)
        assert mlb.inverse_transform(got) == inverse

        # With fit
        mlb = MultiLabelBinarizer()
        got = mlb.fit(inp()).transform(inp())
        assert_array_equal(indicator_mat, got)
        assert_array_equal([1, 2, 3], mlb.classes_)
        assert mlb.inverse_transform(got) == inverse


def test_multilabel_binarizer_empty_sample():
    mlb = MultiLabelBinarizer()
    y = [[1, 2], [1], []]
    Y = np.array([[1, 1],
                  [1, 0],
                  [0, 0]])
    assert_array_equal(mlb.fit_transform(y), Y)


def test_multilabel_binarizer_unknown_class():
    mlb = MultiLabelBinarizer()
    y = [[1, 2]]
    Y = np.array([[1, 0], [0, 1]])
    w = 'unknown class(es) [0, 4] will be ignored'
    matrix = assert_warns_message(UserWarning, w,
                                  mlb.fit(y).transform, [[4, 1], [2, 0]])
    assert_array_equal(matrix, Y)

    Y = np.array([[1, 0, 0], [0, 1, 0]])
    mlb = MultiLabelBinarizer(classes=[1, 2, 3])
    matrix = assert_warns_message(UserWarning, w,
                                  mlb.fit(y).transform, [[4, 1], [2, 0]])
    assert_array_equal(matrix, Y)


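# Illustrative sketch (not part of the original suite; the helper name is
# invented): the same "unknown labels are ignored with a warning" behavior,
# expressed with pytest.warns instead of assert_warns_message.
def _example_unknown_labels_ignored():
    mlb = MultiLabelBinarizer(classes=[1, 2]).fit([[1], [2]])
    with pytest.warns(UserWarning):
        got = mlb.transform([[1, 9]])  # 9 was never seen and is dropped
    assert_array_equal(got, [[1, 0]])

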
def test_multilabel_binarizer_given_classes():
    inp = [(2, 3), (1,), (1, 2)]
    indicator_mat = np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 0, 1]])
    # fit_transform()
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit_transform(inp), indicator_mat)
    assert_array_equal(mlb.classes_, [1, 3, 2])

    # fit().transform()
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
    assert_array_equal(mlb.classes_, [1, 3, 2])

    # ensure works with extra class
    mlb = MultiLabelBinarizer(classes=[4, 1, 3, 2])
    assert_array_equal(mlb.fit_transform(inp),
                       np.hstack(([[0], [0], [0]], indicator_mat)))
    assert_array_equal(mlb.classes_, [4, 1, 3, 2])

    # ensure fit is no-op as iterable is not consumed
    inp = iter(inp)
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)

    # ensure a ValueError is thrown if given duplicate classes
    err_msg = "The classes argument contains duplicate classes. Remove " \
              "these duplicates before passing them to MultiLabelBinarizer."
    mlb = MultiLabelBinarizer(classes=[1, 3, 2, 3])
    with pytest.raises(ValueError, match=err_msg):
        mlb.fit(inp)


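# Illustrative sketch (not part of the original suite; the helper name is
# invented): the classes parameter fixes the column order of the indicator
# matrix, so columns follow the user-supplied order rather than sorted order.
def _example_given_classes_column_order():
    mlb = MultiLabelBinarizer(classes=[3, 1, 2])
    got = mlb.fit_transform([(2, 3), (1,)])
    assert_array_equal(mlb.classes_, [3, 1, 2])
    assert_array_equal(got, [[1, 0, 1],
                             [0, 1, 0]])

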
def test_multilabel_binarizer_multiple_calls():
    inp = [(2, 3), (1,), (1, 2)]
    indicator_mat = np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 0, 1]])

    indicator_mat2 = np.array([[0, 1, 1],
                               [1, 0, 0],
                               [1, 1, 0]])

    # first call
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit_transform(inp), indicator_mat)
    # second call changes the classes parameter
    mlb.classes = [1, 2, 3]
    assert_array_equal(mlb.fit_transform(inp), indicator_mat2)


def test_multilabel_binarizer_same_length_sequence():
    # Ensure sequences of the same length are not interpreted as a 2-d array
    inp = [[1], [0], [2]]
    indicator_mat = np.array([[0, 1, 0],
                              [1, 0, 0],
                              [0, 0, 1]])
    # fit_transform()
    mlb = MultiLabelBinarizer()
    assert_array_equal(mlb.fit_transform(inp), indicator_mat)
    assert_array_equal(mlb.inverse_transform(indicator_mat), inp)

    # fit().transform()
    mlb = MultiLabelBinarizer()
    assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
    assert_array_equal(mlb.inverse_transform(indicator_mat), inp)


def test_multilabel_binarizer_non_integer_labels():
    tuple_classes = _to_object_array([(1,), (2,), (3,)])
    inputs = [
        ([('2', '3'), ('1',), ('1', '2')], ['1', '2', '3']),
        ([('b', 'c'), ('a',), ('a', 'b')], ['a', 'b', 'c']),
        ([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes),
    ]
    indicator_mat = np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 1, 0]])
    for inp, classes in inputs:
        # fit_transform()
        mlb = MultiLabelBinarizer()
        assert_array_equal(mlb.fit_transform(inp), indicator_mat)
        assert_array_equal(mlb.classes_, classes)
        assert_array_equal(mlb.inverse_transform(indicator_mat), inp)

        # fit().transform()
        mlb = MultiLabelBinarizer()
        assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
        assert_array_equal(mlb.classes_, classes)
        assert_array_equal(mlb.inverse_transform(indicator_mat), inp)

    mlb = MultiLabelBinarizer()
    with pytest.raises(TypeError):
        mlb.fit_transform([({}), ({}, {'a': 'b'})])


def test_multilabel_binarizer_non_unique():
    inp = [(1, 1, 1, 0)]
    indicator_mat = np.array([[1, 1]])
    mlb = MultiLabelBinarizer()
    assert_array_equal(mlb.fit_transform(inp), indicator_mat)


def test_multilabel_binarizer_inverse_validation():
    inp = [(1, 1, 1, 0)]
    mlb = MultiLabelBinarizer()
    mlb.fit_transform(inp)
    # Not binary
    with pytest.raises(ValueError):
        mlb.inverse_transform(np.array([[1, 3]]))
    # The following binary cases are fine, however
    mlb.inverse_transform(np.array([[0, 0]]))
    mlb.inverse_transform(np.array([[1, 1]]))
    mlb.inverse_transform(np.array([[1, 0]]))

    # Wrong shape
    with pytest.raises(ValueError):
        mlb.inverse_transform(np.array([[1]]))
    with pytest.raises(ValueError):
        mlb.inverse_transform(np.array([[1, 1, 1]]))


def test_label_binarize_with_class_order():
    out = label_binarize([1, 6], classes=[1, 2, 4, 6])
    expected = np.array([[1, 0, 0, 0], [0, 0, 0, 1]])
    assert_array_equal(out, expected)

    # Modified class order
    out = label_binarize([1, 6], classes=[1, 6, 4, 2])
    expected = np.array([[1, 0, 0, 0], [0, 1, 0, 0]])
    assert_array_equal(out, expected)

    out = label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1])
    expected = np.array([[0, 0, 1, 0],
                         [0, 0, 0, 1],
                         [0, 1, 0, 0],
                         [1, 0, 0, 0]])
    assert_array_equal(out, expected)


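# Illustrative sketch (not part of the original suite; the helper name is
# invented): with only two classes, label_binarize collapses to a single
# column for the positive class, and neg_label/pos_label recode the 0/1
# entries of that column.
def _example_label_binarize_binary_labels():
    out = label_binarize([0, 1, 1], classes=[0, 1],
                         neg_label=-1, pos_label=2)
    assert_array_equal(out, [[-1], [2], [2]])

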
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    for sparse_output in [True, False]:
        if (pos_label == 0 or neg_label != 0) and sparse_output:
            with pytest.raises(ValueError):
                label_binarize(y, classes=classes, neg_label=neg_label,
                               pos_label=pos_label,
                               sparse_output=sparse_output)
            continue

        # check label_binarize
        binarized = label_binarize(y, classes=classes, neg_label=neg_label,
                                   pos_label=pos_label,
                                   sparse_output=sparse_output)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output

        # check inverse
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)
        else:
            inversed = _inverse_binarize_thresholding(
                binarized, output_type=y_type, classes=classes,
                threshold=(neg_label + pos_label) / 2.)

        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        lb = LabelBinarizer(neg_label=neg_label, pos_label=pos_label,
                            sparse_output=sparse_output)
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output
        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        assert issparse(inverse_output) == issparse(y)


def test_label_binarize_binary():
    y = [0, 1, 0]
    classes = [0, 1]
    pos_label = 2
    neg_label = -1
    expected = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1))

    check_binarized_results(y, classes, pos_label, neg_label, expected)

    # Binary case where sparse_output = True will not result in a ValueError
    y = [0, 1, 0]
    classes = [0, 1]
    pos_label = 3
    neg_label = 0
    expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1))

    check_binarized_results(y, classes, pos_label, neg_label, expected)


def test_label_binarize_multiclass():
    y = [0, 1, 2]
    classes = [0, 1, 2]
    pos_label = 2
    neg_label = 0
    expected = 2 * np.eye(3)

    check_binarized_results(y, classes, pos_label, neg_label, expected)

    with pytest.raises(ValueError):
        label_binarize(y, classes=classes, neg_label=-1, pos_label=pos_label,
                       sparse_output=True)


def test_label_binarize_multilabel():
    y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
    classes = [0, 1, 2]
    pos_label = 2
    neg_label = 0
    expected = pos_label * y_ind
    y_sparse = [sparse_matrix(y_ind)
                for sparse_matrix in [coo_matrix, csc_matrix, csr_matrix,
                                      dok_matrix, lil_matrix]]

    for y in [y_ind] + y_sparse:
        check_binarized_results(y, classes, pos_label, neg_label,
                                expected)

    with pytest.raises(ValueError):
        label_binarize(y, classes=classes, neg_label=-1, pos_label=pos_label,
                       sparse_output=True)


def test_invalid_input_label_binarize():
    with pytest.raises(ValueError):
        label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1)
    with pytest.raises(ValueError, match="continuous target data is not "):
        label_binarize([1.2, 2.7], classes=[0, 1])
    with pytest.raises(ValueError, match="mismatch with the labels"):
        label_binarize([[1, 3]], classes=[1, 2, 3])


def test_inverse_binarize_multiclass():
    got = _inverse_binarize_multiclass(csr_matrix([[0, 1, 0],
                                                   [-1, 0, -1],
                                                   [0, 0, 0]]),
                                       np.arange(3))
    assert_array_equal(got, np.array([1, 1, 0]))


@pytest.mark.parametrize(
        "values, expected",
        [(np.array([2, 1, 3, 1, 3], dtype='int64'),
          np.array([1, 2, 3], dtype='int64')),
         (np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
          np.array(['a', 'b', 'c'], dtype=object)),
         (np.array(['b', 'a', 'c', 'a', 'c']),
          np.array(['a', 'b', 'c']))],
        ids=['int64', 'object', 'str'])
def test_encode_util(values, expected):
    uniques = _encode(values)
    assert_array_equal(uniques, expected)
    uniques, encoded = _encode(values, encode=True)
    assert_array_equal(uniques, expected)
    assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
    _, encoded = _encode(values, uniques, encode=True)
    assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))


def test_encode_check_unknown():
    # test for the check_unknown parameter of _encode()
    uniques = np.array([1, 2, 3])
    values = np.array([1, 2, 3, 4])

    # Default is True, raise error
    with pytest.raises(ValueError,
                       match='y contains previously unseen labels'):
        _encode(values, uniques, encode=True, check_unknown=True)

    # don't raise an error if False
    _encode(values, uniques, encode=True, check_unknown=False)

    # parameter is ignored for object dtype
    uniques = np.array(['a', 'b', 'c'], dtype=object)
    values = np.array(['a', 'b', 'c', 'd'], dtype=object)
    with pytest.raises(ValueError,
                       match='y contains previously unseen labels'):
        _encode(values, uniques, encode=True, check_unknown=False)
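

# Illustrative sketch (not part of the original suite; the helper name is
# invented, and _encode is a private utility whose behavior is only what
# the tests above exercise): _encode returns the sorted uniques and, with
# encode=True, also the integer codes of each value against them.
def _example_encode_helper():
    uniques, encoded = _encode(np.array(['b', 'a', 'b']), encode=True)
    assert_array_equal(uniques, ['a', 'b'])
    assert_array_equal(encoded, [1, 0, 1])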