Uploaded Test files

Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions

sklearn/preprocessing/__init__.py

@@ -0,0 +1,67 @@
"""
The :mod:`sklearn.preprocessing` module includes scaling, centering,
normalization, and binarization methods.
"""
from ._function_transformer import FunctionTransformer
from ._data import Binarizer
from ._data import KernelCenterer
from ._data import MinMaxScaler
from ._data import MaxAbsScaler
from ._data import Normalizer
from ._data import RobustScaler
from ._data import StandardScaler
from ._data import QuantileTransformer
from ._data import add_dummy_feature
from ._data import binarize
from ._data import normalize
from ._data import scale
from ._data import robust_scale
from ._data import maxabs_scale
from ._data import minmax_scale
from ._data import quantile_transform
from ._data import power_transform
from ._data import PowerTransformer
from ._data import PolynomialFeatures
from ._encoders import OneHotEncoder
from ._encoders import OrdinalEncoder
from ._label import label_binarize
from ._label import LabelBinarizer
from ._label import LabelEncoder
from ._label import MultiLabelBinarizer
from ._discretization import KBinsDiscretizer
__all__ = [
'Binarizer',
'FunctionTransformer',
'KBinsDiscretizer',
'KernelCenterer',
'LabelBinarizer',
'LabelEncoder',
'MultiLabelBinarizer',
'MinMaxScaler',
'MaxAbsScaler',
'QuantileTransformer',
'Normalizer',
'OneHotEncoder',
'OrdinalEncoder',
'PowerTransformer',
'RobustScaler',
'StandardScaler',
'add_dummy_feature',
'PolynomialFeatures',
'binarize',
'normalize',
'scale',
'robust_scale',
'maxabs_scale',
'minmax_scale',
'label_binarize',
'quantile_transform',
'power_transform',
]
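For context, a minimal sketch of how two of the exports listed above are typically used, one through the class API and one through the function API (illustrative, standard scikit-learn usage):

# Illustrative usage of the class API (StandardScaler) and the
# function API (minmax_scale) exported by this module.
import numpy as np
from sklearn.preprocessing import StandardScaler, minmax_scale

X = np.array([[1.0, -1.0], [2.0, 0.0], [3.0, 1.0]])
X_std = StandardScaler().fit_transform(X)  # per-column zero mean, unit variance
X_01 = minmax_scale(X)                     # per-column rescaling to [0, 1]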

File diff suppressed because it is too large.

sklearn/preprocessing/_discretization.py

@@ -0,0 +1,324 @@
# -*- coding: utf-8 -*-
# Author: Henry Lin <hlin117@gmail.com>
# Tom Dupré la Tour
# License: BSD
import numbers
import numpy as np
import warnings
from . import OneHotEncoder
from ..base import BaseEstimator, TransformerMixin
from ..utils.validation import check_array
from ..utils.validation import check_is_fitted
from ..utils.validation import FLOAT_DTYPES
from ..utils.validation import _deprecate_positional_args
class KBinsDiscretizer(TransformerMixin, BaseEstimator):
"""
Bin continuous data into intervals.
Read more in the :ref:`User Guide <preprocessing_discretization>`.
.. versionadded:: 0.20
Parameters
----------
n_bins : int or array-like, shape (n_features,) (default=5)
The number of bins to produce. Raises ValueError if ``n_bins < 2``.
encode : {'onehot', 'onehot-dense', 'ordinal'}, (default='onehot')
Method used to encode the transformed result.
onehot
Encode the transformed result with one-hot encoding
and return a sparse matrix. Ignored features are always
stacked to the right.
onehot-dense
Encode the transformed result with one-hot encoding
and return a dense array. Ignored features are always
stacked to the right.
ordinal
Return the bin identifier encoded as an integer value.
strategy : {'uniform', 'quantile', 'kmeans'}, (default='quantile')
Strategy used to define the widths of the bins.
uniform
All bins in each feature have identical widths.
quantile
All bins in each feature have the same number of points.
kmeans
Values in each bin have the same nearest center of a 1D k-means
cluster.
Attributes
----------
    n_bins_ : int array, shape (n_features,)
        Number of bins per feature. Bins whose widths are too small
        (i.e., <= 1e-8) are removed with a warning.
    bin_edges_ : array of arrays, shape (n_features, )
        The edges of each bin. Contain arrays of varying shapes
        ``(n_bins_ + 1, )``. Ignored features will have empty arrays.
See Also
--------
sklearn.preprocessing.Binarizer : Class used to bin values as ``0`` or
``1`` based on a parameter ``threshold``.
Notes
-----
In bin edges for feature ``i``, the first and last values are used only for
``inverse_transform``. During transform, bin edges are extended to::
np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])
You can combine ``KBinsDiscretizer`` with
:class:`sklearn.compose.ColumnTransformer` if you only want to preprocess
part of the features.
``KBinsDiscretizer`` might produce constant features (e.g., when
``encode = 'onehot'`` and certain bins do not contain any data).
These features can be removed with feature selection algorithms
(e.g., :class:`sklearn.feature_selection.VarianceThreshold`).
Examples
--------
>>> X = [[-2, 1, -4, -1],
... [-1, 2, -3, -0.5],
... [ 0, 3, -2, 0.5],
... [ 1, 4, -1, 2]]
>>> est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
>>> est.fit(X)
KBinsDiscretizer(...)
>>> Xt = est.transform(X)
>>> Xt # doctest: +SKIP
array([[ 0., 0., 0., 0.],
[ 1., 1., 1., 0.],
[ 2., 2., 2., 1.],
[ 2., 2., 2., 2.]])
Sometimes it may be useful to convert the data back into the original
feature space. The ``inverse_transform`` function converts the binned
data into the original feature space. Each value will be equal to the mean
of the two bin edges.
>>> est.bin_edges_[0]
array([-2., -1., 0., 1.])
>>> est.inverse_transform(Xt)
array([[-1.5, 1.5, -3.5, -0.5],
[-0.5, 2.5, -2.5, -0.5],
[ 0.5, 3.5, -1.5, 0.5],
[ 0.5, 3.5, -1.5, 1.5]])
"""
@_deprecate_positional_args
def __init__(self, n_bins=5, *, encode='onehot', strategy='quantile'):
self.n_bins = n_bins
self.encode = encode
self.strategy = strategy
def fit(self, X, y=None):
"""
Fit the estimator.
Parameters
----------
X : numeric array-like, shape (n_samples, n_features)
Data to be discretized.
y : None
Ignored. This parameter exists only for compatibility with
:class:`sklearn.pipeline.Pipeline`.
Returns
-------
self
"""
X = self._validate_data(X, dtype='numeric')
valid_encode = ('onehot', 'onehot-dense', 'ordinal')
if self.encode not in valid_encode:
raise ValueError("Valid options for 'encode' are {}. "
"Got encode={!r} instead."
.format(valid_encode, self.encode))
valid_strategy = ('uniform', 'quantile', 'kmeans')
if self.strategy not in valid_strategy:
raise ValueError("Valid options for 'strategy' are {}. "
"Got strategy={!r} instead."
.format(valid_strategy, self.strategy))
n_features = X.shape[1]
n_bins = self._validate_n_bins(n_features)
bin_edges = np.zeros(n_features, dtype=object)
for jj in range(n_features):
column = X[:, jj]
col_min, col_max = column.min(), column.max()
if col_min == col_max:
warnings.warn("Feature %d is constant and will be "
"replaced with 0." % jj)
n_bins[jj] = 1
bin_edges[jj] = np.array([-np.inf, np.inf])
continue
if self.strategy == 'uniform':
bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)
elif self.strategy == 'quantile':
quantiles = np.linspace(0, 100, n_bins[jj] + 1)
bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
elif self.strategy == 'kmeans':
from ..cluster import KMeans # fixes import loops
# Deterministic initialization with uniform spacing
uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)
init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5
# 1D k-means procedure
km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
centers = km.fit(column[:, None]).cluster_centers_[:, 0]
# Must sort, centers may be unsorted even with sorted init
centers.sort()
bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]
            # Remove bins whose widths are too small (i.e., <= 1e-8)
if self.strategy in ('quantile', 'kmeans'):
mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8
bin_edges[jj] = bin_edges[jj][mask]
if len(bin_edges[jj]) - 1 != n_bins[jj]:
warnings.warn('Bins whose width are too small (i.e., <= '
'1e-8) in feature %d are removed. Consider '
'decreasing the number of bins.' % jj)
n_bins[jj] = len(bin_edges[jj]) - 1
self.bin_edges_ = bin_edges
self.n_bins_ = n_bins
if 'onehot' in self.encode:
self._encoder = OneHotEncoder(
categories=[np.arange(i) for i in self.n_bins_],
sparse=self.encode == 'onehot')
            # Fit the OneHotEncoder with a toy dataset
            # so that it's ready for use after the KBinsDiscretizer is fitted
self._encoder.fit(np.zeros((1, len(self.n_bins_)), dtype=int))
return self
def _validate_n_bins(self, n_features):
"""Returns n_bins_, the number of bins per feature.
"""
orig_bins = self.n_bins
if isinstance(orig_bins, numbers.Number):
if not isinstance(orig_bins, numbers.Integral):
raise ValueError("{} received an invalid n_bins type. "
"Received {}, expected int."
.format(KBinsDiscretizer.__name__,
type(orig_bins).__name__))
if orig_bins < 2:
raise ValueError("{} received an invalid number "
"of bins. Received {}, expected at least 2."
.format(KBinsDiscretizer.__name__, orig_bins))
            return np.full(n_features, orig_bins, dtype=int)
        n_bins = check_array(orig_bins, dtype=int, copy=True,
                             ensure_2d=False)
if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
raise ValueError("n_bins must be a scalar or array "
"of shape (n_features,).")
bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)
violating_indices = np.where(bad_nbins_value)[0]
if violating_indices.shape[0] > 0:
indices = ", ".join(str(i) for i in violating_indices)
raise ValueError("{} received an invalid number "
"of bins at indices {}. Number of bins "
"must be at least 2, and must be an int."
.format(KBinsDiscretizer.__name__, indices))
return n_bins
def transform(self, X):
"""
Discretize the data.
Parameters
----------
X : numeric array-like, shape (n_samples, n_features)
Data to be discretized.
Returns
-------
Xt : numeric array-like or sparse matrix
Data in the binned space.
"""
check_is_fitted(self)
Xt = check_array(X, copy=True, dtype=FLOAT_DTYPES)
n_features = self.n_bins_.shape[0]
if Xt.shape[1] != n_features:
raise ValueError("Incorrect number of features. Expecting {}, "
"received {}.".format(n_features, Xt.shape[1]))
bin_edges = self.bin_edges_
for jj in range(Xt.shape[1]):
# Values which are close to a bin edge are susceptible to numeric
# instability. Add eps to X so these values are binned correctly
# with respect to their decimal truncation. See documentation of
# numpy.isclose for an explanation of ``rtol`` and ``atol``.
rtol = 1.e-5
atol = 1.e-8
eps = atol + rtol * np.abs(Xt[:, jj])
Xt[:, jj] = np.digitize(Xt[:, jj] + eps, bin_edges[jj][1:])
np.clip(Xt, 0, self.n_bins_ - 1, out=Xt)
if self.encode == 'ordinal':
return Xt
return self._encoder.transform(Xt)
def inverse_transform(self, Xt):
"""
Transform discretized data back to original feature space.
Note that this function does not regenerate the original data
due to discretization rounding.
Parameters
----------
        Xt : numeric array-like, shape (n_samples, n_features)
Transformed data in the binned space.
Returns
-------
Xinv : numeric array-like
Data in the original feature space.
"""
check_is_fitted(self)
if 'onehot' in self.encode:
Xt = self._encoder.inverse_transform(Xt)
Xinv = check_array(Xt, copy=True, dtype=FLOAT_DTYPES)
n_features = self.n_bins_.shape[0]
if Xinv.shape[1] != n_features:
raise ValueError("Incorrect number of features. Expecting {}, "
"received {}.".format(n_features, Xinv.shape[1]))
for jj in range(n_features):
bin_edges = self.bin_edges_[jj]
bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5
Xinv[:, jj] = bin_centers[np.int_(Xinv[:, jj])]
return Xinv
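A short sketch contrasting how the three ``strategy`` options place bin edges on the same skewed feature (illustrative; the edges follow from the definitions in ``fit`` above):

# Compare bin edges produced by the three binning strategies.
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)
for strategy in ('uniform', 'quantile', 'kmeans'):
    kbd = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy=strategy)
    kbd.fit(X)
    # 'uniform' spaces edges evenly over [min, max]; 'quantile' equalizes
    # the number of points per bin; 'kmeans' puts edges midway between
    # neighboring 1D cluster centers.
    print(strategy, kbd.bin_edges_[0])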

sklearn/preprocessing/_encoders.py

@@ -0,0 +1,737 @@
# Authors: Andreas Mueller <amueller@ais.uni-bonn.de>
# Joris Van den Bossche <jorisvandenbossche@gmail.com>
# License: BSD 3 clause
import numpy as np
from scipy import sparse
from ..base import BaseEstimator, TransformerMixin
from ..utils import check_array
from ..utils.validation import check_is_fitted
from ..utils.validation import _deprecate_positional_args
from ._label import _encode, _encode_check_unknown
__all__ = [
'OneHotEncoder',
'OrdinalEncoder'
]
class _BaseEncoder(TransformerMixin, BaseEstimator):
"""
Base class for encoders that includes the code to categorize and
transform the input features.
"""
def _check_X(self, X):
"""
Perform custom check_array:
- convert list of strings to object dtype
- check for missing values for object dtype data (check_array does
not do that)
- return list of features (arrays): this list of features is
constructed feature by feature to preserve the data types
of pandas DataFrame columns, as otherwise information is lost
          and cannot be used, e.g. for the `categories_` attribute.
"""
if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):
# if not a dataframe, do normal check_array validation
X_temp = check_array(X, dtype=None)
if (not hasattr(X, 'dtype')
and np.issubdtype(X_temp.dtype, np.str_)):
X = check_array(X, dtype=np.object)
else:
X = X_temp
needs_validation = False
else:
# pandas dataframe, do validation later column by column, in order
# to keep the dtype information to be used in the encoder.
needs_validation = True
n_samples, n_features = X.shape
X_columns = []
for i in range(n_features):
Xi = self._get_feature(X, feature_idx=i)
Xi = check_array(Xi, ensure_2d=False, dtype=None,
force_all_finite=needs_validation)
X_columns.append(Xi)
return X_columns, n_samples, n_features
def _get_feature(self, X, feature_idx):
if hasattr(X, 'iloc'):
# pandas dataframes
return X.iloc[:, feature_idx]
# numpy arrays, sparse arrays
return X[:, feature_idx]
def _fit(self, X, handle_unknown='error'):
X_list, n_samples, n_features = self._check_X(X)
if self.categories != 'auto':
if len(self.categories) != n_features:
raise ValueError("Shape mismatch: if categories is an array,"
" it has to be of shape (n_features,).")
self.categories_ = []
for i in range(n_features):
Xi = X_list[i]
if self.categories == 'auto':
cats = _encode(Xi)
else:
cats = np.array(self.categories[i], dtype=Xi.dtype)
if Xi.dtype != object:
if not np.all(np.sort(cats) == cats):
raise ValueError("Unsorted categories are not "
"supported for numerical categories")
if handle_unknown == 'error':
diff = _encode_check_unknown(Xi, cats)
if diff:
msg = ("Found unknown categories {0} in column {1}"
" during fit".format(diff, i))
raise ValueError(msg)
self.categories_.append(cats)
def _transform(self, X, handle_unknown='error'):
X_list, n_samples, n_features = self._check_X(X)
        X_int = np.zeros((n_samples, n_features), dtype=int)
        X_mask = np.ones((n_samples, n_features), dtype=bool)
if n_features != len(self.categories_):
raise ValueError(
"The number of features in X is different to the number of "
"features of the fitted data. The fitted data had {} features "
"and the X has {} features."
.format(len(self.categories_,), n_features)
)
for i in range(n_features):
Xi = X_list[i]
diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
return_mask=True)
if not np.all(valid_mask):
if handle_unknown == 'error':
msg = ("Found unknown categories {0} in column {1}"
" during transform".format(diff, i))
raise ValueError(msg)
else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
X_mask[:, i] = valid_mask
# cast Xi into the largest string type necessary
# to handle different lengths of numpy strings
if (self.categories_[i].dtype.kind in ('U', 'S')
and self.categories_[i].itemsize > Xi.itemsize):
Xi = Xi.astype(self.categories_[i].dtype)
else:
Xi = Xi.copy()
Xi[~valid_mask] = self.categories_[i][0]
# We use check_unknown=False, since _encode_check_unknown was
# already called above.
_, encoded = _encode(Xi, self.categories_[i], encode=True,
check_unknown=False)
X_int[:, i] = encoded
return X_int, X_mask
def _more_tags(self):
return {'X_types': ['categorical']}
class OneHotEncoder(_BaseEncoder):
"""
Encode categorical features as a one-hot numeric array.
The input to this transformer should be an array-like of integers or
strings, denoting the values taken on by categorical (discrete) features.
The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
encoding scheme. This creates a binary column for each category and
returns a sparse matrix or dense array (depending on the ``sparse``
    parameter).
By default, the encoder derives the categories based on the unique values
in each feature. Alternatively, you can also specify the `categories`
manually.
This encoding is needed for feeding categorical data to many scikit-learn
estimators, notably linear models and SVMs with the standard kernels.
Note: a one-hot encoding of y labels should use a LabelBinarizer
instead.
Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
.. versionchanged:: 0.20
Parameters
----------
categories : 'auto' or a list of array-like, default='auto'
Categories (unique values) per feature:
- 'auto' : Determine categories automatically from the training data.
- list : ``categories[i]`` holds the categories expected in the ith
column. The passed categories should not mix strings and numeric
values within a single feature, and should be sorted in case of
numeric values.
The used categories can be found in the ``categories_`` attribute.
.. versionadded:: 0.20
    drop : {'first', 'if_binary'} or an array-like of shape (n_features,), \
default=None
Specifies a methodology to use to drop one of the categories per
feature. This is useful in situations where perfectly collinear
features cause problems, such as when feeding the resulting data
into a neural network or an unregularized regression.
However, dropping one category breaks the symmetry of the original
representation and can therefore induce a bias in downstream models,
for instance for penalized linear classification or regression models.
- None : retain all features (the default).
- 'first' : drop the first category in each feature. If only one
category is present, the feature will be dropped entirely.
- 'if_binary' : drop the first category in each feature with two
categories. Features with 1 or more than 2 categories are
left intact.
- array : ``drop[i]`` is the category in feature ``X[:, i]`` that
should be dropped.
sparse : bool, default=True
        Will return a sparse matrix if set to True, else will return an array.
    dtype : number type, default=np.float64
Desired dtype of output.
handle_unknown : {'error', 'ignore'}, default='error'
Whether to raise an error or ignore if an unknown categorical feature
is present during transform (default is to raise). When this parameter
is set to 'ignore' and an unknown category is encountered during
transform, the resulting one-hot encoded columns for this feature
will be all zeros. In the inverse transform, an unknown category
will be denoted as None.
Attributes
----------
categories_ : list of arrays
The categories of each feature determined during fitting
(in order of the features in X and corresponding with the output
of ``transform``). This includes the category specified in ``drop``
(if any).
drop_idx_ : array of shape (n_features,)
- ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category
to be dropped for each feature.
- ``drop_idx_[i] = None`` if no category is to be dropped from the
feature with index ``i``, e.g. when `drop='if_binary'` and the
feature isn't binary.
- ``drop_idx_ = None`` if all the transformed features will be
retained.
See Also
--------
sklearn.preprocessing.OrdinalEncoder : Performs an ordinal (integer)
encoding of the categorical features.
sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of
dictionary items (also handles string-valued features).
sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot
encoding of dictionary items or strings.
sklearn.preprocessing.LabelBinarizer : Binarizes labels in a one-vs-all
fashion.
sklearn.preprocessing.MultiLabelBinarizer : Transforms between iterable of
iterables and a multilabel format, e.g. a (samples x classes) binary
matrix indicating the presence of a class label.
Examples
--------
Given a dataset with two features, we let the encoder find the unique
values per feature and transform the data to a binary one-hot encoding.
>>> from sklearn.preprocessing import OneHotEncoder
One can discard categories not seen during `fit`:
>>> enc = OneHotEncoder(handle_unknown='ignore')
>>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
>>> enc.fit(X)
OneHotEncoder(handle_unknown='ignore')
>>> enc.categories_
[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
>>> enc.transform([['Female', 1], ['Male', 4]]).toarray()
array([[1., 0., 1., 0., 0.],
[0., 1., 0., 0., 0.]])
>>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
array([['Male', 1],
[None, 2]], dtype=object)
>>> enc.get_feature_names(['gender', 'group'])
array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'],
dtype=object)
One can always drop the first column for each feature:
>>> drop_enc = OneHotEncoder(drop='first').fit(X)
>>> drop_enc.categories_
[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
>>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray()
array([[0., 0., 0.],
[1., 1., 0.]])
    Or drop a column for features having only 2 categories:
>>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X)
>>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray()
array([[0., 1., 0., 0.],
[1., 0., 1., 0.]])
"""
@_deprecate_positional_args
def __init__(self, *, categories='auto', drop=None, sparse=True,
dtype=np.float64, handle_unknown='error'):
self.categories = categories
self.sparse = sparse
self.dtype = dtype
self.handle_unknown = handle_unknown
self.drop = drop
def _validate_keywords(self):
if self.handle_unknown not in ('error', 'ignore'):
msg = ("handle_unknown should be either 'error' or 'ignore', "
"got {0}.".format(self.handle_unknown))
raise ValueError(msg)
# If we have both dropped columns and ignored unknown
# values, there will be ambiguous cells. This creates difficulties
# in interpreting the model.
if self.drop is not None and self.handle_unknown != 'error':
raise ValueError(
"`handle_unknown` must be 'error' when the drop parameter is "
"specified, as both would create categories that are all "
"zero.")
def _compute_drop_idx(self):
if self.drop is None:
return None
elif isinstance(self.drop, str):
if self.drop == 'first':
                return np.zeros(len(self.categories_), dtype=object)
elif self.drop == 'if_binary':
return np.array([0 if len(cats) == 2 else None
                                 for cats in self.categories_], dtype=object)
else:
msg = (
"Wrong input for parameter `drop`. Expected "
"'first', 'if_binary', None or array of objects, got {}"
)
raise ValueError(msg.format(type(self.drop)))
else:
try:
self.drop = np.asarray(self.drop, dtype=object)
droplen = len(self.drop)
except (ValueError, TypeError):
msg = (
"Wrong input for parameter `drop`. Expected "
"'first', 'if_binary', None or array of objects, got {}"
)
raise ValueError(msg.format(type(self.drop)))
if droplen != len(self.categories_):
msg = ("`drop` should have length equal to the number "
"of features ({}), got {}")
raise ValueError(msg.format(len(self.categories_),
len(self.drop)))
missing_drops = [(i, val) for i, val in enumerate(self.drop)
if val not in self.categories_[i]]
if any(missing_drops):
msg = ("The following categories were supposed to be "
"dropped, but were not found in the training "
"data.\n{}".format(
"\n".join(
["Category: {}, Feature: {}".format(c, v)
for c, v in missing_drops])))
raise ValueError(msg)
return np.array([np.where(cat_list == val)[0][0]
for (val, cat_list) in
zip(self.drop, self.categories_)],
                            dtype=object)
def fit(self, X, y=None):
"""
Fit OneHotEncoder to X.
Parameters
----------
X : array-like, shape [n_samples, n_features]
The data to determine the categories of each feature.
y : None
Ignored. This parameter exists only for compatibility with
:class:`sklearn.pipeline.Pipeline`.
Returns
-------
self
"""
self._validate_keywords()
self._fit(X, handle_unknown=self.handle_unknown)
self.drop_idx_ = self._compute_drop_idx()
return self
def fit_transform(self, X, y=None):
"""
Fit OneHotEncoder to X, then transform X.
Equivalent to fit(X).transform(X) but more convenient.
Parameters
----------
X : array-like, shape [n_samples, n_features]
The data to encode.
y : None
Ignored. This parameter exists only for compatibility with
:class:`sklearn.pipeline.Pipeline`.
Returns
-------
X_out : sparse matrix if sparse=True else a 2-d array
Transformed input.
"""
self._validate_keywords()
return super().fit_transform(X, y)
def transform(self, X):
"""
Transform X using one-hot encoding.
Parameters
----------
X : array-like, shape [n_samples, n_features]
The data to encode.
Returns
-------
X_out : sparse matrix if sparse=True else a 2-d array
Transformed input.
"""
check_is_fitted(self)
# validation of X happens in _check_X called by _transform
X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
n_samples, n_features = X_int.shape
if self.drop_idx_ is not None:
to_drop = self.drop_idx_.copy()
# We remove all the dropped categories from mask, and decrement all
# categories that occur after them to avoid an empty column.
keep_cells = X_int != to_drop
n_values = []
for i, cats in enumerate(self.categories_):
n_cats = len(cats)
# drop='if_binary' but feature isn't binary
if to_drop[i] is None:
# set to cardinality to not drop from X_int
to_drop[i] = n_cats
n_values.append(n_cats)
else: # dropped
n_values.append(n_cats - 1)
to_drop = to_drop.reshape(1, -1)
X_int[X_int > to_drop] -= 1
X_mask &= keep_cells
else:
n_values = [len(cats) for cats in self.categories_]
mask = X_mask.ravel()
feature_indices = np.cumsum([0] + n_values)
indices = (X_int + feature_indices[:-1]).ravel()[mask]
        indptr = np.empty(n_samples + 1, dtype=int)
indptr[0] = 0
np.sum(X_mask, axis=1, out=indptr[1:])
np.cumsum(indptr[1:], out=indptr[1:])
data = np.ones(indptr[-1])
out = sparse.csr_matrix((data, indices, indptr),
shape=(n_samples, feature_indices[-1]),
dtype=self.dtype)
if not self.sparse:
return out.toarray()
else:
return out
def inverse_transform(self, X):
"""
Convert the data back to the original representation.
In case unknown categories are encountered (all zeros in the
one-hot encoding), ``None`` is used to represent this category.
Parameters
----------
X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
The transformed data.
Returns
-------
X_tr : array-like, shape [n_samples, n_features]
Inverse transformed array.
"""
check_is_fitted(self)
X = check_array(X, accept_sparse='csr')
n_samples, _ = X.shape
n_features = len(self.categories_)
if self.drop_idx_ is None:
n_transformed_features = sum(len(cats)
for cats in self.categories_)
else:
n_transformed_features = sum(
len(cats) - 1 if to_drop is not None else len(cats)
for cats, to_drop in zip(self.categories_, self.drop_idx_)
)
# validate shape of passed X
msg = ("Shape of the passed X data is not correct. Expected {0} "
"columns, got {1}.")
if X.shape[1] != n_transformed_features:
raise ValueError(msg.format(n_transformed_features, X.shape[1]))
# create resulting array of appropriate dtype
dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
X_tr = np.empty((n_samples, n_features), dtype=dt)
j = 0
found_unknown = {}
for i in range(n_features):
if self.drop_idx_ is None or self.drop_idx_[i] is None:
cats = self.categories_[i]
else:
cats = np.delete(self.categories_[i], self.drop_idx_[i])
n_categories = len(cats)
# Only happens if there was a column with a unique
# category. In this case we just fill the column with this
# unique category value.
if n_categories == 0:
X_tr[:, i] = self.categories_[i][self.drop_idx_[i]]
j += n_categories
continue
sub = X[:, j:j + n_categories]
# for sparse X argmax returns 2D matrix, ensure 1D array
labels = np.asarray(sub.argmax(axis=1)).flatten()
X_tr[:, i] = cats[labels]
if self.handle_unknown == 'ignore':
unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
# ignored unknown categories: we have a row of all zero
if unknown.any():
found_unknown[i] = unknown
# drop will either be None or handle_unknown will be error. If
# self.drop_idx_ is not None, then we can safely assume that all of
# the nulls in each column are the dropped value
elif self.drop_idx_ is not None:
dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
if dropped.any():
X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]]
j += n_categories
# if ignored are found: potentially need to upcast result to
# insert None values
if found_unknown:
if X_tr.dtype != object:
X_tr = X_tr.astype(object)
for idx, mask in found_unknown.items():
X_tr[mask, idx] = None
return X_tr
def get_feature_names(self, input_features=None):
"""
Return feature names for output features.
Parameters
----------
input_features : list of str of shape (n_features,)
String names for input features if available. By default,
"x0", "x1", ... "xn_features" is used.
Returns
-------
output_feature_names : ndarray of shape (n_output_features,)
Array of feature names.
"""
check_is_fitted(self)
cats = self.categories_
if input_features is None:
input_features = ['x%d' % i for i in range(len(cats))]
elif len(input_features) != len(self.categories_):
raise ValueError(
"input_features should have length equal to number of "
"features ({}), got {}".format(len(self.categories_),
len(input_features)))
feature_names = []
for i in range(len(cats)):
names = [
input_features[i] + '_' + str(t) for t in cats[i]]
if self.drop_idx_ is not None and self.drop_idx_[i] is not None:
names.pop(self.drop_idx_[i])
feature_names.extend(names)
return np.array(feature_names, dtype=object)
class OrdinalEncoder(_BaseEncoder):
"""
Encode categorical features as an integer array.
The input to this transformer should be an array-like of integers or
strings, denoting the values taken on by categorical (discrete) features.
The features are converted to ordinal integers. This results in
a single column of integers (0 to n_categories - 1) per feature.
Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
.. versionadded:: 0.20
Parameters
----------
categories : 'auto' or a list of array-like, default='auto'
Categories (unique values) per feature:
- 'auto' : Determine categories automatically from the training data.
- list : ``categories[i]`` holds the categories expected in the ith
column. The passed categories should not mix strings and numeric
values, and should be sorted in case of numeric values.
The used categories can be found in the ``categories_`` attribute.
dtype : number type, default np.float64
Desired dtype of output.
Attributes
----------
categories_ : list of arrays
The categories of each feature determined during fitting
(in order of the features in X and corresponding with the output
of ``transform``).
See Also
--------
sklearn.preprocessing.OneHotEncoder : Performs a one-hot encoding of
categorical features.
sklearn.preprocessing.LabelEncoder : Encodes target labels with values
between 0 and n_classes-1.
Examples
--------
Given a dataset with two features, we let the encoder find the unique
values per feature and transform the data to an ordinal encoding.
>>> from sklearn.preprocessing import OrdinalEncoder
>>> enc = OrdinalEncoder()
>>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
>>> enc.fit(X)
OrdinalEncoder()
>>> enc.categories_
[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
>>> enc.transform([['Female', 3], ['Male', 1]])
array([[0., 2.],
[1., 0.]])
>>> enc.inverse_transform([[1, 0], [0, 1]])
array([['Male', 1],
['Female', 2]], dtype=object)
"""
@_deprecate_positional_args
def __init__(self, *, categories='auto', dtype=np.float64):
self.categories = categories
self.dtype = dtype
def fit(self, X, y=None):
"""
Fit the OrdinalEncoder to X.
Parameters
----------
X : array-like, shape [n_samples, n_features]
The data to determine the categories of each feature.
y : None
Ignored. This parameter exists only for compatibility with
:class:`sklearn.pipeline.Pipeline`.
Returns
-------
self
"""
self._fit(X)
return self
def transform(self, X):
"""
Transform X to ordinal codes.
Parameters
----------
X : array-like, shape [n_samples, n_features]
The data to encode.
Returns
-------
X_out : sparse matrix or a 2-d array
Transformed input.
"""
X_int, _ = self._transform(X)
return X_int.astype(self.dtype, copy=False)
def inverse_transform(self, X):
"""
Convert the data back to the original representation.
Parameters
----------
X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
The transformed data.
Returns
-------
X_tr : array-like, shape [n_samples, n_features]
Inverse transformed array.
"""
check_is_fitted(self)
X = check_array(X, accept_sparse='csr')
n_samples, _ = X.shape
n_features = len(self.categories_)
# validate shape of passed X
msg = ("Shape of the passed X data is not correct. Expected {0} "
"columns, got {1}.")
if X.shape[1] != n_features:
raise ValueError(msg.format(n_features, X.shape[1]))
# create resulting array of appropriate dtype
dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
X_tr = np.empty((n_samples, n_features), dtype=dt)
for i in range(n_features):
labels = X[:, i].astype('int64', copy=False)
X_tr[:, i] = self.categories_[i][labels]
return X_tr
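A brief sketch of the ``drop='if_binary'`` behavior documented above (illustrative; all names are real parameters of this class):

# drop='if_binary' drops one column only for features with two categories.
from sklearn.preprocessing import OneHotEncoder

X = [['Male', 1], ['Female', 3], ['Female', 2]]
enc = OneHotEncoder(drop='if_binary', sparse=False).fit(X)
print(enc.drop_idx_)                           # [0 None]
print(enc.get_feature_names(['gender', 'group']))
# ['gender_Male' 'group_1' 'group_2' 'group_3']
print(enc.transform([['Female', 2]]))          # [[0. 0. 1. 0.]]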

sklearn/preprocessing/_function_transformer.py

@@ -0,0 +1,175 @@
import warnings
from ..base import BaseEstimator, TransformerMixin
from ..utils.validation import _allclose_dense_sparse
from ..utils.validation import _deprecate_positional_args
def _identity(X):
"""The identity function.
"""
return X
class FunctionTransformer(TransformerMixin, BaseEstimator):
"""Constructs a transformer from an arbitrary callable.
A FunctionTransformer forwards its X (and optionally y) arguments to a
user-defined function or function object and returns the result of this
function. This is useful for stateless transformations such as taking the
log of frequencies, doing custom scaling, etc.
Note: If a lambda is used as the function, then the resulting
transformer will not be pickleable.
.. versionadded:: 0.17
Read more in the :ref:`User Guide <function_transformer>`.
Parameters
----------
    func : callable, default=None
        The callable to use for the transformation. This will be passed
        the same arguments as transform, with args and kwargs forwarded.
        If func is None, then func will be the identity function.
    inverse_func : callable, default=None
        The callable to use for the inverse transformation. This will be
        passed the same arguments as inverse transform, with args and
        kwargs forwarded. If inverse_func is None, then inverse_func
        will be the identity function.
    validate : bool, default=False
Indicate that the input X array should be checked before calling
``func``. The possibilities are:
- If False, there is no input validation.
- If True, then X will be converted to a 2-dimensional NumPy array or
sparse matrix. If the conversion is not possible an exception is
raised.
.. versionchanged:: 0.22
The default of ``validate`` changed from True to False.
    accept_sparse : bool, default=False
        Indicate that func accepts a sparse matrix as input. If validate is
        False, this has no effect. Otherwise, if accept_sparse is False,
        sparse matrix inputs will cause an exception to be raised.
    check_inverse : bool, default=True
        Whether to check that ``func`` followed by ``inverse_func`` leads to
        the original inputs. It can be used for a sanity check, raising a
        warning when the condition is not fulfilled.
        .. versionadded:: 0.20
    kw_args : dict, default=None
        Dictionary of additional keyword arguments to pass to func.
        .. versionadded:: 0.18
    inv_kw_args : dict, default=None
        Dictionary of additional keyword arguments to pass to inverse_func.
        .. versionadded:: 0.18
Examples
--------
>>> import numpy as np
>>> from sklearn.preprocessing import FunctionTransformer
>>> transformer = FunctionTransformer(np.log1p)
>>> X = np.array([[0, 1], [2, 3]])
>>> transformer.transform(X)
array([[0. , 0.6931...],
[1.0986..., 1.3862...]])
"""
@_deprecate_positional_args
def __init__(self, func=None, inverse_func=None, *, validate=False,
accept_sparse=False, check_inverse=True, kw_args=None,
inv_kw_args=None):
self.func = func
self.inverse_func = inverse_func
self.validate = validate
self.accept_sparse = accept_sparse
self.check_inverse = check_inverse
self.kw_args = kw_args
self.inv_kw_args = inv_kw_args
def _check_input(self, X):
if self.validate:
return self._validate_data(X, accept_sparse=self.accept_sparse)
return X
def _check_inverse_transform(self, X):
"""Check that func and inverse_func are the inverse."""
idx_selected = slice(None, None, max(1, X.shape[0] // 100))
X_round_trip = self.inverse_transform(self.transform(X[idx_selected]))
if not _allclose_dense_sparse(X[idx_selected], X_round_trip):
warnings.warn("The provided functions are not strictly"
" inverse of each other. If you are sure you"
" want to proceed regardless, set"
" 'check_inverse=False'.", UserWarning)
def fit(self, X, y=None):
"""Fit transformer by checking X.
If ``validate`` is ``True``, ``X`` will be checked.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Input array.
Returns
-------
self
"""
X = self._check_input(X)
if (self.check_inverse and not (self.func is None or
self.inverse_func is None)):
self._check_inverse_transform(X)
return self
def transform(self, X):
"""Transform X using the forward function.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Input array.
Returns
-------
X_out : array-like, shape (n_samples, n_features)
Transformed input.
"""
return self._transform(X, func=self.func, kw_args=self.kw_args)
def inverse_transform(self, X):
"""Transform X using the inverse function.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Input array.
Returns
-------
X_out : array-like, shape (n_samples, n_features)
Transformed input.
"""
return self._transform(X, func=self.inverse_func,
kw_args=self.inv_kw_args)
def _transform(self, X, func=None, kw_args=None):
X = self._check_input(X)
if func is None:
func = _identity
return func(X, **(kw_args if kw_args else {}))
def _more_tags(self):
return {'no_validation': not self.validate,
'stateless': True}
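A quick sketch of the round-trip check that ``fit`` performs when ``check_inverse=True`` (illustrative):

# fit() compares inverse_func(func(X)) against X and warns on mismatch.
import numpy as np
from sklearn.preprocessing import FunctionTransformer

X = np.array([[0.0, 1.0], [2.0, 3.0]])
FunctionTransformer(np.log1p, inverse_func=np.expm1).fit(X)  # silent round-trip
FunctionTransformer(np.log1p, inverse_func=np.exp).fit(X)
# UserWarning: The provided functions are not strictly inverse of each other.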

File diff suppressed because it is too large.

sklearn/preprocessing/data.py

@@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _data # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.preprocessing.data'
correct_import_path = 'sklearn.preprocessing'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_data, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

sklearn/preprocessing/label.py

@@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _label # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.preprocessing.label'
correct_import_path = 'sklearn.preprocessing'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_label, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)
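Both generated shims rely on module-level ``__getattr__`` (PEP 562). A generic sketch of the pattern, using hypothetical names (``mypkg`` and ``_new_home`` are placeholders, not sklearn modules):

# Deprecation-shim pattern: forward attribute access to the module that
# actually holds the code, warning on use of the old import path.
import warnings
from mypkg import _new_home  # hypothetical new location of the code

def __getattr__(name):
    # PEP 562: invoked for attributes not found in this module (Python 3.7+).
    warnings.warn("mypkg.old_home is deprecated; import from mypkg instead",
                  FutureWarning)
    return getattr(_new_home, name)

On Python < 3.7, module-level ``__getattr__`` is not honored, which is why the generated files above fall back to wrapping the module with ``Pep562``.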

sklearn/preprocessing/setup.py

@@ -0,0 +1,20 @@
import os
def configuration(parent_package='', top_path=None):
import numpy
from numpy.distutils.misc_util import Configuration
config = Configuration('preprocessing', parent_package, top_path)
libraries = []
if os.name == 'posix':
libraries.append('m')
config.add_extension('_csr_polynomial_expansion',
sources=['_csr_polynomial_expansion.pyx'],
include_dirs=[numpy.get_include()],
libraries=libraries)
config.add_subpackage('tests')
return config
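Such ``configuration`` functions follow the numpy.distutils convention; when the file is run directly, the standard entry point (the same pattern scikit-learn uses in its other setup.py files) is:

if __name__ == '__main__':
    from numpy.distutils.core import setup
    # Build this subpackage standalone, e.g. `python setup.py build_ext --inplace`.
    setup(**configuration(top_path='').todict())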

sklearn/preprocessing/tests/test_common.py

@@ -0,0 +1,158 @@
import warnings
import pytest
import numpy as np
from scipy import sparse
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.base import clone
from sklearn.preprocessing import maxabs_scale
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import scale
from sklearn.preprocessing import power_transform
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import robust_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose
iris = load_iris()
def _get_valid_samples_by_column(X, col):
"""Get non NaN samples in column of X"""
return X[:, [col]][~np.isnan(X[:, col])]
@pytest.mark.parametrize(
"est, func, support_sparse, strictly_positive",
[(MaxAbsScaler(), maxabs_scale, True, False),
(MinMaxScaler(), minmax_scale, False, False),
(StandardScaler(), scale, False, False),
(StandardScaler(with_mean=False), scale, True, False),
(PowerTransformer('yeo-johnson'), power_transform, False, False),
(PowerTransformer('box-cox'), power_transform, False, True),
(QuantileTransformer(n_quantiles=10), quantile_transform, True, False),
(RobustScaler(), robust_scale, False, False),
(RobustScaler(with_centering=False), robust_scale, True, False)]
)
def test_missing_value_handling(est, func, support_sparse, strictly_positive):
    # check that the preprocessing method lets NaN pass through
rng = np.random.RandomState(42)
X = iris.data.copy()
n_missing = 50
X[rng.randint(X.shape[0], size=n_missing),
rng.randint(X.shape[1], size=n_missing)] = np.nan
if strictly_positive:
X += np.nanmin(X) + 0.1
X_train, X_test = train_test_split(X, random_state=1)
# sanity check
assert not np.all(np.isnan(X_train), axis=0).any()
assert np.any(np.isnan(X_train), axis=0).all()
assert np.any(np.isnan(X_test), axis=0).all()
X_test[:, 0] = np.nan # make sure this boundary case is tested
with pytest.warns(None) as records:
Xt = est.fit(X_train).transform(X_test)
# ensure no warnings are raised
assert len(records) == 0
# missing values should still be missing, and only them
assert_array_equal(np.isnan(Xt), np.isnan(X_test))
# check that the function leads to the same results as the class
with pytest.warns(None) as records:
Xt_class = est.transform(X_train)
assert len(records) == 0
Xt_func = func(X_train, **est.get_params())
assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)])
    # check that the inverse transform keeps NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can use equal_nan=True in recent versions of numpy.
    # For the moment, we just check that non-NaN values are almost equal.
assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])
for i in range(X.shape[1]):
# train only on non-NaN
est.fit(_get_valid_samples_by_column(X_train, i))
# check transforming with NaN works even when training without NaN
with pytest.warns(None) as records:
Xt_col = est.transform(X_test[:, [i]])
assert len(records) == 0
assert_allclose(Xt_col, Xt[:, [i]])
# check non-NaN is handled as before - the 1st column is all nan
if not np.isnan(X_test[:, i]).all():
Xt_col_nonan = est.transform(
_get_valid_samples_by_column(X_test, i))
assert_array_equal(Xt_col_nonan,
Xt_col[~np.isnan(Xt_col.squeeze())])
if support_sparse:
est_dense = clone(est)
est_sparse = clone(est)
with pytest.warns(None) as records:
Xt_dense = est_dense.fit(X_train).transform(X_test)
Xt_inv_dense = est_dense.inverse_transform(Xt_dense)
assert len(records) == 0
for sparse_constructor in (sparse.csr_matrix, sparse.csc_matrix,
sparse.bsr_matrix, sparse.coo_matrix,
sparse.dia_matrix, sparse.dok_matrix,
sparse.lil_matrix):
# check that the dense and sparse inputs lead to the same results
# precompute the matrix to avoid catching side warnings
X_train_sp = sparse_constructor(X_train)
X_test_sp = sparse_constructor(X_test)
with pytest.warns(None) as records:
warnings.simplefilter('ignore', PendingDeprecationWarning)
Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp)
assert len(records) == 0
assert_allclose(Xt_sp.A, Xt_dense)
with pytest.warns(None) as records:
warnings.simplefilter('ignore', PendingDeprecationWarning)
Xt_inv_sp = est_sparse.inverse_transform(Xt_sp)
assert len(records) == 0
assert_allclose(Xt_inv_sp.A, Xt_inv_dense)
@pytest.mark.parametrize(
"est, func",
[(MaxAbsScaler(), maxabs_scale),
(MinMaxScaler(), minmax_scale),
(StandardScaler(), scale),
(StandardScaler(with_mean=False), scale),
(PowerTransformer('yeo-johnson'), power_transform),
(PowerTransformer('box-cox'), power_transform,),
(QuantileTransformer(n_quantiles=3), quantile_transform),
(RobustScaler(), robust_scale),
(RobustScaler(with_centering=False), robust_scale)]
)
def test_missing_value_pandas_na_support(est, func):
# Test pandas IntegerArray with pd.NA
pd = pytest.importorskip('pandas', minversion="1.0")
X = np.array([[1, 2, 3, np.nan, np.nan, 4, 5, 1],
[np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8],
[1, 2, 3, 4, 5, 6, 7, 8]]).T
# Creates dataframe with IntegerArrays with pd.NA
X_df = pd.DataFrame(X, dtype="Int16", columns=['a', 'b', 'c'])
X_df['c'] = X_df['c'].astype('int')
X_trans = est.fit_transform(X)
X_df_trans = est.fit_transform(X_df)
assert_allclose(X_trans, X_df_trans)
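Condensed, the core property these tests assert is that NaN entries pass through fit/transform untouched while the fitted statistics ignore them; for example:

# NaNs keep their positions through transform; statistics are NaN-aware.
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0, np.nan], [2.0, 0.0], [3.0, 1.0]])
Xt = StandardScaler().fit_transform(X)
assert np.array_equal(np.isnan(Xt), np.isnan(X))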

File diff suppressed because it is too large.

sklearn/preprocessing/tests/test_discretization.py

@@ -0,0 +1,283 @@
import pytest
import numpy as np
import scipy.sparse as sp
import warnings
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils._testing import (
assert_array_almost_equal,
assert_array_equal,
assert_warns_message
)
X = [[-2, 1.5, -4, -1],
[-1, 2.5, -3, -0.5],
[0, 3.5, -2, 0.5],
[1, 4.5, -1, 2]]
@pytest.mark.parametrize(
'strategy, expected',
[('uniform', [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]]),
('kmeans', [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]),
('quantile', [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]])])
def test_fit_transform(strategy, expected):
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy=strategy)
est.fit(X)
assert_array_equal(expected, est.transform(X))
def test_valid_n_bins():
KBinsDiscretizer(n_bins=2).fit_transform(X)
KBinsDiscretizer(n_bins=np.array([2])[0]).fit_transform(X)
    assert KBinsDiscretizer(n_bins=2).fit(X).n_bins_.dtype == np.dtype(int)
def test_invalid_n_bins():
est = KBinsDiscretizer(n_bins=1)
err_msg = ("KBinsDiscretizer received an invalid "
"number of bins. Received 1, expected at least 2.")
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
est = KBinsDiscretizer(n_bins=1.1)
err_msg = ("KBinsDiscretizer received an invalid "
"n_bins type. Received float, expected int.")
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
def test_invalid_n_bins_array():
# Bad shape
n_bins = np.full((2, 4), 2.)
est = KBinsDiscretizer(n_bins=n_bins)
err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
# Incorrect number of features
n_bins = [1, 2, 2]
est = KBinsDiscretizer(n_bins=n_bins)
err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
# Bad bin values
n_bins = [1, 2, 2, 1]
est = KBinsDiscretizer(n_bins=n_bins)
err_msg = ("KBinsDiscretizer received an invalid number of bins "
"at indices 0, 3. Number of bins must be at least 2, "
"and must be an int.")
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
# Float bin values
n_bins = [2.1, 2, 2.1, 2]
est = KBinsDiscretizer(n_bins=n_bins)
err_msg = ("KBinsDiscretizer received an invalid number of bins "
"at indices 0, 2. Number of bins must be at least 2, "
"and must be an int.")
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
@pytest.mark.parametrize(
'strategy, expected',
[('uniform', [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]]),
('kmeans', [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]]),
('quantile', [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]])])
def test_fit_transform_n_bins_array(strategy, expected):
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode='ordinal',
strategy=strategy).fit(X)
assert_array_equal(expected, est.transform(X))
# test the shape of bin_edges_
n_features = np.array(X).shape[1]
assert est.bin_edges_.shape == (n_features, )
for bin_edges, n_bins in zip(est.bin_edges_, est.n_bins_):
assert bin_edges.shape == (n_bins + 1, )
def test_invalid_n_features():
est = KBinsDiscretizer(n_bins=3).fit(X)
bad_X = np.arange(25).reshape(5, -1)
err_msg = "Incorrect number of features. Expecting 4, received 5"
with pytest.raises(ValueError, match=err_msg):
est.transform(bad_X)
@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
def test_same_min_max(strategy):
warnings.simplefilter("always")
X = np.array([[1, -2],
[1, -1],
[1, 0],
[1, 1]])
est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode='ordinal')
assert_warns_message(UserWarning,
"Feature 0 is constant and will be replaced "
"with 0.", est.fit, X)
assert est.n_bins_[0] == 1
# replace the feature with zeros
Xt = est.transform(X)
assert_array_equal(Xt[:, 0], np.zeros(X.shape[0]))
def test_transform_1d_behavior():
X = np.arange(4)
est = KBinsDiscretizer(n_bins=2)
with pytest.raises(ValueError):
est.fit(X)
est = KBinsDiscretizer(n_bins=2)
est.fit(X.reshape(-1, 1))
with pytest.raises(ValueError):
est.transform(X)
@pytest.mark.parametrize('i', range(1, 9))
def test_numeric_stability(i):
X_init = np.array([2., 4., 6., 8., 10.]).reshape(-1, 1)
Xt_expected = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)
# Test up to discretizing nano units
X = X_init / 10**i
Xt = KBinsDiscretizer(n_bins=2, encode='ordinal').fit_transform(X)
assert_array_equal(Xt_expected, Xt)
def test_invalid_encode_option():
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode='invalid-encode')
err_msg = (r"Valid options for 'encode' are "
r"\('onehot', 'onehot-dense', 'ordinal'\). "
r"Got encode='invalid-encode' instead.")
with pytest.raises(ValueError, match=err_msg):
est.fit(X)
def test_encode_options():
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
encode='ordinal').fit(X)
Xt_1 = est.transform(X)
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
encode='onehot-dense').fit(X)
Xt_2 = est.transform(X)
assert not sp.issparse(Xt_2)
assert_array_equal(OneHotEncoder(
categories=[np.arange(i) for i in [2, 3, 3, 3]],
sparse=False)
.fit_transform(Xt_1), Xt_2)
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
encode='onehot').fit(X)
Xt_3 = est.transform(X)
assert sp.issparse(Xt_3)
assert_array_equal(OneHotEncoder(
categories=[np.arange(i) for i in [2, 3, 3, 3]],
sparse=True)
.fit_transform(Xt_1).toarray(),
Xt_3.toarray())
def test_invalid_strategy_option():
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], strategy='invalid-strategy')
err_msg = (r"Valid options for 'strategy' are "
r"\('uniform', 'quantile', 'kmeans'\). "
r"Got strategy='invalid-strategy' instead.")
with pytest.raises(ValueError, match=err_msg):
est.fit(X)
@pytest.mark.parametrize(
'strategy, expected_2bins, expected_3bins, expected_5bins',
[('uniform', [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]),
('kmeans', [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]),
('quantile', [0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2], [0, 1, 2, 3, 4, 4])])
def test_nonuniform_strategies(
strategy, expected_2bins, expected_3bins, expected_5bins):
X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)
# with 2 bins
est = KBinsDiscretizer(n_bins=2, strategy=strategy, encode='ordinal')
Xt = est.fit_transform(X)
assert_array_equal(expected_2bins, Xt.ravel())
# with 3 bins
est = KBinsDiscretizer(n_bins=3, strategy=strategy, encode='ordinal')
Xt = est.fit_transform(X)
assert_array_equal(expected_3bins, Xt.ravel())
# with 5 bins
est = KBinsDiscretizer(n_bins=5, strategy=strategy, encode='ordinal')
Xt = est.fit_transform(X)
assert_array_equal(expected_5bins, Xt.ravel())
@pytest.mark.parametrize(
'strategy, expected_inv',
[('uniform', [[-1.5, 2., -3.5, -0.5], [-0.5, 3., -2.5, -0.5],
[0.5, 4., -1.5, 0.5], [0.5, 4., -1.5, 1.5]]),
('kmeans', [[-1.375, 2.125, -3.375, -0.5625],
[-1.375, 2.125, -3.375, -0.5625],
[-0.125, 3.375, -2.125, 0.5625],
[0.75, 4.25, -1.25, 1.625]]),
('quantile', [[-1.5, 2., -3.5, -0.75], [-0.5, 3., -2.5, 0.],
[0.5, 4., -1.5, 1.25], [0.5, 4., -1.5, 1.25]])])
@pytest.mark.parametrize('encode', ['ordinal', 'onehot', 'onehot-dense'])
def test_inverse_transform(strategy, encode, expected_inv):
kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
Xt = kbd.fit_transform(X)
Xinv = kbd.inverse_transform(Xt)
assert_array_almost_equal(expected_inv, Xinv)
@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
def test_transform_outside_fit_range(strategy):
X = np.array([0, 1, 2, 3])[:, None]
kbd = KBinsDiscretizer(n_bins=4, strategy=strategy, encode='ordinal')
kbd.fit(X)
X2 = np.array([-2, 5])[:, None]
X2t = kbd.transform(X2)
assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
assert_array_equal(X2t.min(axis=0), [0])
def test_overwrite():
X = np.array([0, 1, 2, 3])[:, None]
X_before = X.copy()
est = KBinsDiscretizer(n_bins=3, encode="ordinal")
Xt = est.fit_transform(X)
assert_array_equal(X, X_before)
Xt_before = Xt.copy()
Xinv = est.inverse_transform(Xt)
assert_array_equal(Xt, Xt_before)
assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))
@pytest.mark.parametrize(
'strategy, expected_bin_edges',
[('quantile', [0, 1, 3]), ('kmeans', [0, 1.5, 3])])
def test_redundant_bins(strategy, expected_bin_edges):
X = [[0], [0], [0], [0], [3], [3]]
kbd = KBinsDiscretizer(n_bins=3, strategy=strategy)
msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 "
"are removed. Consider decreasing the number of bins.")
assert_warns_message(UserWarning, msg, kbd.fit, X)
assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges)
def test_percentile_numeric_stability():
X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
bin_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
Xt = np.array([0, 0, 4]).reshape(-1, 1)
kbd = KBinsDiscretizer(n_bins=10, encode='ordinal',
strategy='quantile')
msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 "
"are removed. Consider decreasing the number of bins.")
assert_warns_message(UserWarning, msg, kbd.fit, X)
assert_array_almost_equal(kbd.bin_edges_[0], bin_edges)
assert_array_almost_equal(kbd.transform(X), Xt)
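As ``test_overwrite`` and ``test_inverse_transform`` above illustrate, ``inverse_transform`` maps each ordinal code to the midpoint of its bin; a compact check:

# Each code is inverse-mapped to the center of its bin.
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X = np.array([0, 1, 2, 3])[:, None]
est = KBinsDiscretizer(n_bins=3, encode='ordinal').fit(X)  # strategy='quantile'
print(est.bin_edges_[0])                                   # [0. 1. 2. 3.]
print(est.inverse_transform(est.transform(X)).ravel())     # [0.5 1.5 2.5 2.5]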

sklearn/preprocessing/tests/test_encoders.py

@@ -0,0 +1,698 @@
# -*- coding: utf-8 -*-
import re
import numpy as np
from scipy import sparse
import pytest
from sklearn.exceptions import NotFittedError
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
def test_one_hot_encoder_sparse_dense():
# check that sparse and dense will give the same results
X = np.array([[3, 2, 1], [0, 1, 1]])
enc_sparse = OneHotEncoder()
enc_dense = OneHotEncoder(sparse=False)
X_trans_sparse = enc_sparse.fit_transform(X)
X_trans_dense = enc_dense.fit_transform(X)
assert X_trans_sparse.shape == (2, 5)
assert X_trans_dense.shape == (2, 5)
assert sparse.issparse(X_trans_sparse)
assert not sparse.issparse(X_trans_dense)
# check outcome
assert_array_equal(X_trans_sparse.toarray(), [[0., 1., 0., 1., 1.],
[1., 0., 1., 0., 1.]])
assert_array_equal(X_trans_sparse.toarray(), X_trans_dense)
def test_one_hot_encoder_diff_n_features():
X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
X2 = np.array([[1, 0]])
enc = OneHotEncoder()
enc.fit(X)
err_msg = ("The number of features in X is different to the number of "
"features of the fitted data.")
with pytest.raises(ValueError, match=err_msg):
enc.transform(X2)
def test_one_hot_encoder_handle_unknown():
X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
X2 = np.array([[4, 1, 1]])
# Test that one hot encoder raises error for unknown features
# present during transform.
oh = OneHotEncoder(handle_unknown='error')
oh.fit(X)
with pytest.raises(ValueError, match='Found unknown categories'):
oh.transform(X2)
# Test the ignore option, ignores unknown features (giving all 0's)
oh = OneHotEncoder(handle_unknown='ignore')
oh.fit(X)
X2_passed = X2.copy()
assert_array_equal(
oh.transform(X2_passed).toarray(),
np.array([[0., 0., 0., 0., 1., 0., 0.]]))
# ensure transformed data was not modified in place
assert_allclose(X2, X2_passed)
# Raise error if handle_unknown is neither ignore or error.
oh = OneHotEncoder(handle_unknown='42')
with pytest.raises(ValueError, match='handle_unknown should be either'):
oh.fit(X)
def test_one_hot_encoder_not_fitted():
X = np.array([['a'], ['b']])
enc = OneHotEncoder(categories=['a', 'b'])
msg = ("This OneHotEncoder instance is not fitted yet. "
"Call 'fit' with appropriate arguments before using this "
"estimator.")
with pytest.raises(NotFittedError, match=msg):
enc.transform(X)
def test_one_hot_encoder_handle_unknown_strings():
X = np.array(['11111111', '22', '333', '4444']).reshape((-1, 1))
X2 = np.array(['55555', '22']).reshape((-1, 1))
# Non-regression test for issue #12470
# Test the ignore option when categories are a numpy string dtype,
# particularly when the known category strings are longer
# than the unknown category strings
oh = OneHotEncoder(handle_unknown='ignore')
oh.fit(X)
X2_passed = X2.copy()
assert_array_equal(
oh.transform(X2_passed).toarray(),
np.array([[0., 0., 0., 0.], [0., 1., 0., 0.]]))
# ensure transformed data was not modified in place
assert_array_equal(X2, X2_passed)
@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
@pytest.mark.parametrize("input_dtype", [np.int32, np.float32, np.float64])
def test_one_hot_encoder_dtype(input_dtype, output_dtype):
X = np.asarray([[0, 1]], dtype=input_dtype).T
X_expected = np.asarray([[1, 0], [0, 1]], dtype=output_dtype)
oh = OneHotEncoder(categories='auto', dtype=output_dtype)
assert_array_equal(oh.fit_transform(X).toarray(), X_expected)
assert_array_equal(oh.fit(X).transform(X).toarray(), X_expected)
oh = OneHotEncoder(categories='auto', dtype=output_dtype, sparse=False)
assert_array_equal(oh.fit_transform(X), X_expected)
assert_array_equal(oh.fit(X).transform(X), X_expected)
@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
def test_one_hot_encoder_dtype_pandas(output_dtype):
pd = pytest.importorskip('pandas')
X_df = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})
X_expected = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=output_dtype)
oh = OneHotEncoder(dtype=output_dtype)
assert_array_equal(oh.fit_transform(X_df).toarray(), X_expected)
assert_array_equal(oh.fit(X_df).transform(X_df).toarray(), X_expected)
oh = OneHotEncoder(dtype=output_dtype, sparse=False)
assert_array_equal(oh.fit_transform(X_df), X_expected)
assert_array_equal(oh.fit(X_df).transform(X_df), X_expected)
def test_one_hot_encoder_feature_names():
enc = OneHotEncoder()
X = [['Male', 1, 'girl', 2, 3],
['Female', 41, 'girl', 1, 10],
['Male', 51, 'boy', 12, 3],
['Male', 91, 'girl', 21, 30]]
enc.fit(X)
feature_names = enc.get_feature_names()
assert isinstance(feature_names, np.ndarray)
assert_array_equal(['x0_Female', 'x0_Male',
'x1_1', 'x1_41', 'x1_51', 'x1_91',
'x2_boy', 'x2_girl',
'x3_1', 'x3_2', 'x3_12', 'x3_21',
'x4_3',
'x4_10', 'x4_30'], feature_names)
feature_names2 = enc.get_feature_names(['one', 'two',
'three', 'four', 'five'])
assert_array_equal(['one_Female', 'one_Male',
'two_1', 'two_41', 'two_51', 'two_91',
'three_boy', 'three_girl',
'four_1', 'four_2', 'four_12', 'four_21',
'five_3', 'five_10', 'five_30'], feature_names2)
with pytest.raises(ValueError, match="input_features should have length"):
enc.get_feature_names(['one', 'two'])
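# A minimal sketch, not an upstream test: get_feature_names emits one name per
# category, prefixed by the default 'x<i>' or by caller-supplied column names.
def _sketch_feature_name_prefixes():
    enc = OneHotEncoder().fit([['cat', 0], ['dog', 1]])
    assert_array_equal(enc.get_feature_names(),
                       ['x0_cat', 'x0_dog', 'x1_0', 'x1_1'])
    assert_array_equal(enc.get_feature_names(['pet', 'flag']),
                       ['pet_cat', 'pet_dog', 'flag_0', 'flag_1'])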
def test_one_hot_encoder_feature_names_unicode():
enc = OneHotEncoder()
X = np.array([['c❤t1', 'dat2']], dtype=object).T
enc.fit(X)
feature_names = enc.get_feature_names()
assert_array_equal(['x0_c❤t1', 'x0_dat2'], feature_names)
feature_names = enc.get_feature_names(input_features=['n👍me'])
assert_array_equal(['n👍me_c❤t1', 'n👍me_dat2'], feature_names)
def test_one_hot_encoder_set_params():
X = np.array([[1, 2]]).T
oh = OneHotEncoder()
# set params on not yet fitted object
oh.set_params(categories=[[0, 1, 2, 3]])
assert oh.get_params()['categories'] == [[0, 1, 2, 3]]
assert oh.fit_transform(X).toarray().shape == (2, 4)
# set params on already fitted object
oh.set_params(categories=[[0, 1, 2, 3, 4]])
assert oh.fit_transform(X).toarray().shape == (2, 5)
def check_categorical_onehot(X):
enc = OneHotEncoder(categories='auto')
Xtr1 = enc.fit_transform(X)
enc = OneHotEncoder(categories='auto', sparse=False)
Xtr2 = enc.fit_transform(X)
assert_allclose(Xtr1.toarray(), Xtr2)
assert sparse.isspmatrix_csr(Xtr1)
return Xtr1.toarray()
@pytest.mark.parametrize("X", [
[['def', 1, 55], ['abc', 2, 55]],
np.array([[10, 1, 55], [5, 2, 55]]),
np.array([['b', 'A', 'cat'], ['a', 'B', 'cat']], dtype=object)
], ids=['mixed', 'numeric', 'object'])
def test_one_hot_encoder(X):
Xtr = check_categorical_onehot(np.array(X)[:, [0]])
assert_allclose(Xtr, [[0, 1], [1, 0]])
Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]])
assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]])
Xtr = OneHotEncoder(categories='auto').fit_transform(X)
assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]])
@pytest.mark.parametrize('sparse_', [False, True])
@pytest.mark.parametrize('drop', [None, 'first'])
def test_one_hot_encoder_inverse(sparse_, drop):
X = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]]
enc = OneHotEncoder(sparse=sparse_, drop=drop)
X_tr = enc.fit_transform(X)
exp = np.array(X, dtype=object)
assert_array_equal(enc.inverse_transform(X_tr), exp)
X = [[2, 55], [1, 55], [3, 55]]
enc = OneHotEncoder(sparse=sparse_, categories='auto',
drop=drop)
X_tr = enc.fit_transform(X)
exp = np.array(X)
assert_array_equal(enc.inverse_transform(X_tr), exp)
if drop is None:
# with unknown categories
# drop is incompatible with handle_unknown=ignore
X = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]]
enc = OneHotEncoder(sparse=sparse_, handle_unknown='ignore',
categories=[['abc', 'def'], [1, 2],
[54, 55, 56]])
X_tr = enc.fit_transform(X)
exp = np.array(X, dtype=object)
exp[2, 1] = None
assert_array_equal(enc.inverse_transform(X_tr), exp)
# with an otherwise numerical output, still object if unknown
X = [[2, 55], [1, 55], [3, 55]]
enc = OneHotEncoder(sparse=sparse_, categories=[[1, 2], [54, 56]],
handle_unknown='ignore')
X_tr = enc.fit_transform(X)
exp = np.array(X, dtype=object)
exp[2, 0] = None
exp[:, 1] = None
assert_array_equal(enc.inverse_transform(X_tr), exp)
# incorrect shape raises
X_tr = np.array([[0, 1, 1], [1, 0, 1]])
msg = re.escape('Shape of the passed X data is not correct')
with pytest.raises(ValueError, match=msg):
enc.inverse_transform(X_tr)
def test_one_hot_encoder_inverse_if_binary():
X = np.array([['Male', 1],
['Female', 3],
['Female', 2]], dtype=object)
ohe = OneHotEncoder(drop='if_binary', sparse=False)
X_tr = ohe.fit_transform(X)
assert_array_equal(ohe.inverse_transform(X_tr), X)
# check that resetting the drop option without refitting does not throw an error
@pytest.mark.parametrize('drop', ['if_binary', 'first', None])
@pytest.mark.parametrize('reset_drop', ['if_binary', 'first', None])
def test_one_hot_encoder_drop_reset(drop, reset_drop):
X = np.array([['Male', 1],
['Female', 3],
['Female', 2]], dtype=object)
ohe = OneHotEncoder(drop=drop, sparse=False)
ohe.fit(X)
X_tr = ohe.transform(X)
feature_names = ohe.get_feature_names()
ohe.set_params(drop=reset_drop)
assert_array_equal(ohe.inverse_transform(X_tr), X)
assert_allclose(ohe.transform(X), X_tr)
assert_array_equal(ohe.get_feature_names(), feature_names)
@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
@pytest.mark.parametrize("X", [
[1, 2],
np.array([3., 4.])
])
def test_X_is_not_1D(X, method):
oh = OneHotEncoder()
msg = ("Expected 2D array, got 1D array instead")
with pytest.raises(ValueError, match=msg):
getattr(oh, method)(X)
@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
def test_X_is_not_1D_pandas(method):
pd = pytest.importorskip('pandas')
X = pd.Series([6, 3, 4, 6])
oh = OneHotEncoder()
msg = ("Expected 2D array, got 1D array instead")
with pytest.raises(ValueError, match=msg):
getattr(oh, method)(X)
@pytest.mark.parametrize("X, cat_exp, cat_dtype", [
([['abc', 55], ['def', 55]], [['abc', 'def'], [55]], np.object_),
(np.array([[1, 2], [3, 2]]), [[1, 3], [2]], np.integer),
(np.array([['A', 'cat'], ['B', 'cat']], dtype=object),
[['A', 'B'], ['cat']], np.object_),
(np.array([['A', 'cat'], ['B', 'cat']]),
[['A', 'B'], ['cat']], np.str_)
], ids=['mixed', 'numeric', 'object', 'string'])
def test_one_hot_encoder_categories(X, cat_exp, cat_dtype):
# order of categories should not depend on order of samples
for Xi in [X, X[::-1]]:
enc = OneHotEncoder(categories='auto')
enc.fit(Xi)
# assert enc.categories == 'auto'
assert isinstance(enc.categories_, list)
for res, exp in zip(enc.categories_, cat_exp):
assert res.tolist() == exp
assert np.issubdtype(res.dtype, cat_dtype)
@pytest.mark.parametrize("X, X2, cats, cat_dtype", [
(np.array([['a', 'b']], dtype=object).T,
np.array([['a', 'd']], dtype=object).T,
[['a', 'b', 'c']], np.object_),
(np.array([[1, 2]], dtype='int64').T,
np.array([[1, 4]], dtype='int64').T,
[[1, 2, 3]], np.int64),
(np.array([['a', 'b']], dtype=object).T,
np.array([['a', 'd']], dtype=object).T,
[np.array(['a', 'b', 'c'])], np.object_),
], ids=['object', 'numeric', 'object-string-cat'])
def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
enc = OneHotEncoder(categories=cats)
exp = np.array([[1., 0., 0.],
[0., 1., 0.]])
assert_array_equal(enc.fit_transform(X).toarray(), exp)
assert list(enc.categories[0]) == list(cats[0])
assert enc.categories_[0].tolist() == list(cats[0])
# manually specified categories should have same dtype as
# the data when coerced from lists
assert enc.categories_[0].dtype == cat_dtype
# when specifying categories manually, unknown categories should already
# raise when fitting
enc = OneHotEncoder(categories=cats)
with pytest.raises(ValueError, match="Found unknown categories"):
enc.fit(X2)
enc = OneHotEncoder(categories=cats, handle_unknown='ignore')
exp = np.array([[1., 0., 0.], [0., 0., 0.]])
assert_array_equal(enc.fit(X2).transform(X2).toarray(), exp)
def test_one_hot_encoder_unsorted_categories():
X = np.array([['a', 'b']], dtype=object).T
enc = OneHotEncoder(categories=[['b', 'a', 'c']])
exp = np.array([[0., 1., 0.],
[1., 0., 0.]])
assert_array_equal(enc.fit(X).transform(X).toarray(), exp)
assert_array_equal(enc.fit_transform(X).toarray(), exp)
assert enc.categories_[0].tolist() == ['b', 'a', 'c']
assert np.issubdtype(enc.categories_[0].dtype, np.object_)
# unsorted passed categories still raise for numerical values
X = np.array([[1, 2]]).T
enc = OneHotEncoder(categories=[[2, 1, 3]])
msg = 'Unsorted categories are not supported'
with pytest.raises(ValueError, match=msg):
enc.fit_transform(X)
def test_one_hot_encoder_specified_categories_mixed_columns():
# multiple columns
X = np.array([['a', 'b'], [0, 2]], dtype=object).T
enc = OneHotEncoder(categories=[['a', 'b', 'c'], [0, 1, 2]])
exp = np.array([[1., 0., 0., 1., 0., 0.],
[0., 1., 0., 0., 0., 1.]])
assert_array_equal(enc.fit_transform(X).toarray(), exp)
assert enc.categories_[0].tolist() == ['a', 'b', 'c']
assert np.issubdtype(enc.categories_[0].dtype, np.object_)
assert enc.categories_[1].tolist() == [0, 1, 2]
# integer categories but from object dtype data
assert np.issubdtype(enc.categories_[1].dtype, np.object_)
def test_one_hot_encoder_pandas():
pd = pytest.importorskip('pandas')
X_df = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})
Xtr = check_categorical_onehot(X_df)
assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])
@pytest.mark.parametrize("drop, expected_names",
[('first', ['x0_c', 'x2_b']),
('if_binary', ['x0_c', 'x1_2', 'x2_b']),
(['c', 2, 'b'], ['x0_b', 'x2_a'])],
ids=['first', 'binary', 'manual'])
def test_one_hot_encoder_feature_names_drop(drop, expected_names):
X = [['c', 2, 'a'],
['b', 2, 'b']]
ohe = OneHotEncoder(drop=drop)
ohe.fit(X)
feature_names = ohe.get_feature_names()
assert isinstance(feature_names, np.ndarray)
assert_array_equal(expected_names, feature_names)
def test_one_hot_encoder_drop_equals_if_binary():
# Canonical case
X = [[10, 'yes'],
[20, 'no'],
[30, 'yes']]
expected = np.array([[1., 0., 0., 1.],
[0., 1., 0., 0.],
[0., 0., 1., 1.]])
expected_drop_idx = np.array([None, 0])
ohe = OneHotEncoder(drop='if_binary', sparse=False)
result = ohe.fit_transform(X)
assert_array_equal(ohe.drop_idx_, expected_drop_idx)
assert_allclose(result, expected)
# with only one cat, the behaviour is equivalent to drop=None
X = [['true', 'a'],
['false', 'a'],
['false', 'a']]
expected = np.array([[1., 1.],
[0., 1.],
[0., 1.]])
expected_drop_idx = np.array([0, None])
ohe = OneHotEncoder(drop='if_binary', sparse=False)
result = ohe.fit_transform(X)
assert_array_equal(ohe.drop_idx_, expected_drop_idx)
assert_allclose(result, expected)
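# A minimal sketch, not an upstream test: drop='if_binary' keeps one column
# for two-category features and leaves other cardinalities fully expanded.
def _sketch_if_binary_column_count():
    X = [['yes', 'a'], ['no', 'b'], ['yes', 'c']]
    ohe = OneHotEncoder(drop='if_binary', sparse=False)
    # feature 0 is binary -> 1 column; feature 1 has 3 categories -> 3 columns
    assert ohe.fit_transform(X).shape == (3, 4)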
@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
np.array([['a', np.nan]], dtype=object).T],
ids=['numeric', 'object'])
@pytest.mark.parametrize("as_data_frame", [False, True],
ids=['array', 'dataframe'])
@pytest.mark.parametrize("handle_unknown", ['error', 'ignore'])
def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
if as_data_frame:
pd = pytest.importorskip('pandas')
X = pd.DataFrame(X)
ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown)
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.fit(X)
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.fit_transform(X)
if as_data_frame:
X_partial = X.iloc[:1, :]
else:
X_partial = X[:1, :]
ohe.fit(X_partial)
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.transform(X)
@pytest.mark.parametrize("X", [
[['abc', 2, 55], ['def', 1, 55]],
np.array([[10, 2, 55], [20, 1, 55]]),
np.array([['a', 'B', 'cat'], ['b', 'A', 'cat']], dtype=object)
], ids=['mixed', 'numeric', 'object'])
def test_ordinal_encoder(X):
enc = OrdinalEncoder()
exp = np.array([[0, 1, 0],
[1, 0, 0]], dtype='int64')
assert_array_equal(enc.fit_transform(X), exp.astype('float64'))
enc = OrdinalEncoder(dtype='int64')
assert_array_equal(enc.fit_transform(X), exp)
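# A minimal sketch, not an upstream test: OrdinalEncoder maps every value to
# its index within the per-feature sorted category list, one output column per
# input column.
def _sketch_ordinal_codes():
    enc = OrdinalEncoder()
    codes = enc.fit_transform([['low'], ['high'], ['medium']])
    # lexicographic category order: 'high' < 'low' < 'medium'
    assert_array_equal(codes.ravel(), [1., 0., 2.])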
@pytest.mark.parametrize("X, X2, cats, cat_dtype", [
(np.array([['a', 'b']], dtype=object).T,
np.array([['a', 'd']], dtype=object).T,
[['a', 'b', 'c']], np.object_),
(np.array([[1, 2]], dtype='int64').T,
np.array([[1, 4]], dtype='int64').T,
[[1, 2, 3]], np.int64),
(np.array([['a', 'b']], dtype=object).T,
np.array([['a', 'd']], dtype=object).T,
[np.array(['a', 'b', 'c'])], np.object_),
], ids=['object', 'numeric', 'object-string-cat'])
def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
enc = OrdinalEncoder(categories=cats)
exp = np.array([[0.], [1.]])
assert_array_equal(enc.fit_transform(X), exp)
assert list(enc.categories[0]) == list(cats[0])
assert enc.categories_[0].tolist() == list(cats[0])
# manually specified categories should have same dtype as
# the data when coerced from lists
assert enc.categories_[0].dtype == cat_dtype
# when specifying categories manually, unknown categories should already
# raise when fitting
enc = OrdinalEncoder(categories=cats)
with pytest.raises(ValueError, match="Found unknown categories"):
enc.fit(X2)
def test_ordinal_encoder_inverse():
X = [['abc', 2, 55], ['def', 1, 55]]
enc = OrdinalEncoder()
X_tr = enc.fit_transform(X)
exp = np.array(X, dtype=object)
assert_array_equal(enc.inverse_transform(X_tr), exp)
# incorrect shape raises
X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
msg = re.escape('Shape of the passed X data is not correct')
with pytest.raises(ValueError, match=msg):
enc.inverse_transform(X_tr)
@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
np.array([['a', np.nan]], dtype=object).T],
ids=['numeric', 'object'])
def test_ordinal_encoder_raise_missing(X):
ohe = OrdinalEncoder()
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.fit(X)
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.fit_transform(X)
ohe.fit(X[:1, :])
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.transform(X)
def test_ordinal_encoder_raise_categories_shape():
X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T
cats = ['Low', 'Medium', 'High']
enc = OrdinalEncoder(categories=cats)
msg = ("Shape mismatch: if categories is an array,")
with pytest.raises(ValueError, match=msg):
enc.fit(X)
def test_encoder_dtypes():
# check that dtypes are preserved when determining categories
enc = OneHotEncoder(categories='auto')
exp = np.array([[1., 0., 1., 0.], [0., 1., 0., 1.]], dtype='float64')
for X in [np.array([[1, 2], [3, 4]], dtype='int64'),
np.array([[1, 2], [3, 4]], dtype='float64'),
np.array([['a', 'b'], ['c', 'd']]), # string dtype
np.array([[1, 'a'], [3, 'b']], dtype='object')]:
enc.fit(X)
assert all([enc.categories_[i].dtype == X.dtype for i in range(2)])
assert_array_equal(enc.transform(X).toarray(), exp)
X = [[1, 2], [3, 4]]
enc.fit(X)
assert all([np.issubdtype(enc.categories_[i].dtype, np.integer)
for i in range(2)])
assert_array_equal(enc.transform(X).toarray(), exp)
X = [[1, 'a'], [3, 'b']]
enc.fit(X)
assert all([enc.categories_[i].dtype == 'object' for i in range(2)])
assert_array_equal(enc.transform(X).toarray(), exp)
def test_encoder_dtypes_pandas():
# check dtype (similar to test_categorical_encoder_dtypes for dataframes)
pd = pytest.importorskip('pandas')
enc = OneHotEncoder(categories='auto')
exp = np.array([[1., 0., 1., 0., 1., 0.],
[0., 1., 0., 1., 0., 1.]], dtype='float64')
X = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}, dtype='int64')
enc.fit(X)
assert all([enc.categories_[i].dtype == 'int64' for i in range(2)])
assert_array_equal(enc.transform(X).toarray(), exp)
X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]})
X_type = [X['A'].dtype, X['B'].dtype, X['C'].dtype]
enc.fit(X)
assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)])
assert_array_equal(enc.transform(X).toarray(), exp)
def test_one_hot_encoder_warning():
enc = OneHotEncoder()
X = [['Male', 1], ['Female', 3]]
np.testing.assert_no_warnings(enc.fit_transform, X)
def test_one_hot_encoder_drop_manual():
cats_to_drop = ['def', 12, 3, 56]
enc = OneHotEncoder(drop=cats_to_drop)
X = [['abc', 12, 2, 55],
['def', 12, 1, 55],
['def', 12, 3, 56]]
trans = enc.fit_transform(X).toarray()
exp = [[1, 0, 1, 1],
[0, 1, 0, 1],
[0, 0, 0, 0]]
assert_array_equal(trans, exp)
dropped_cats = [cat[feature]
for cat, feature in zip(enc.categories_,
enc.drop_idx_)]
assert_array_equal(dropped_cats, cats_to_drop)
assert_array_equal(np.array(X, dtype=object),
enc.inverse_transform(trans))
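# A minimal sketch, not an upstream test: drop_idx_ stores, per feature, the
# position of the dropped category inside categories_, so the dropped label
# itself stays recoverable after fitting.
def _sketch_drop_idx_lookup():
    ohe = OneHotEncoder(drop=['b']).fit([['a'], ['b']])
    assert ohe.categories_[0][ohe.drop_idx_[0]] == 'b'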
@pytest.mark.parametrize(
"X_fit, params, err_msg",
[([["Male"], ["Female"]], {'drop': 'second'},
"Wrong input for parameter `drop`"),
([["Male"], ["Female"]], {'drop': 'first', 'handle_unknown': 'ignore'},
"`handle_unknown` must be 'error'"),
([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
{'drop': np.asarray('b', dtype=object)},
"Wrong input for parameter `drop`"),
([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
{'drop': ['ghi', 3, 59]},
"The following categories were supposed")]
)
def test_one_hot_encoder_invalid_params(X_fit, params, err_msg):
enc = OneHotEncoder(**params)
with pytest.raises(ValueError, match=err_msg):
enc.fit(X_fit)
@pytest.mark.parametrize('drop', [['abc', 3], ['abc', 3, 41, 'a']])
def test_invalid_drop_length(drop):
enc = OneHotEncoder(drop=drop)
err_msg = "`drop` should have length equal to the number"
with pytest.raises(ValueError, match=err_msg):
enc.fit([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]])
@pytest.mark.parametrize("density", [True, False],
ids=['sparse', 'dense'])
@pytest.mark.parametrize("drop", ['first',
['a', 2, 'b']],
ids=['first', 'manual'])
def test_categories(density, drop):
ohe_base = OneHotEncoder(sparse=density)
ohe_test = OneHotEncoder(sparse=density, drop=drop)
X = [['c', 1, 'a'],
['a', 2, 'b']]
ohe_base.fit(X)
ohe_test.fit(X)
assert_array_equal(ohe_base.categories_, ohe_test.categories_)
if drop == 'first':
assert_array_equal(ohe_test.drop_idx_, 0)
else:
for drop_cat, drop_idx, cat_list in zip(drop,
ohe_test.drop_idx_,
ohe_test.categories_):
assert cat_list[int(drop_idx)] == drop_cat
assert isinstance(ohe_test.drop_idx_, np.ndarray)
assert ohe_test.drop_idx_.dtype == np.object
@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
def test_encoders_has_categorical_tags(Encoder):
assert 'categorical' in Encoder()._get_tags()['X_types']
@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
def test_encoders_does_not_support_none_values(Encoder):
values = [["a"], [None]]
with pytest.raises(TypeError, match="Encoders require their input to be "
"uniformly strings or numbers."):
Encoder().fit(values)

View file

@ -0,0 +1,160 @@
import pytest
import numpy as np
from scipy import sparse
from sklearn.preprocessing import FunctionTransformer
from sklearn.utils._testing import (assert_array_equal,
assert_allclose_dense_sparse)
from sklearn.utils._testing import assert_warns_message, assert_no_warnings
def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X):
def _func(X, *args, **kwargs):
args_store.append(X)
args_store.extend(args)
kwargs_store.update(kwargs)
return func(X)
return _func
def test_delegate_to_func():
# (args|kwargs)_store will hold the positional and keyword arguments
# passed to the function inside the FunctionTransformer.
args_store = []
kwargs_store = {}
X = np.arange(10).reshape((5, 2))
assert_array_equal(
FunctionTransformer(_make_func(args_store, kwargs_store)).transform(X),
X, 'transform should have returned X unchanged',
)
# The function should only have received X.
assert args_store == [X], ('Incorrect positional arguments passed to '
'func: {args}'.format(args=args_store))
assert not kwargs_store, ('Unexpected keyword arguments passed to '
'func: {args}'.format(args=kwargs_store))
# reset the argument stores.
args_store[:] = []
kwargs_store.clear()
transformed = FunctionTransformer(
_make_func(args_store, kwargs_store),
).transform(X)
assert_array_equal(transformed, X,
err_msg='transform should have returned X unchanged')
# The function should have received X
assert args_store == [X], ('Incorrect positional arguments passed '
'to func: {args}'.format(args=args_store))
assert not kwargs_store, ('Unexpected keyword arguments passed to '
'func: {args}'.format(args=kwargs_store))
def test_np_log():
X = np.arange(10).reshape((5, 2))
# Test that the numpy.log1p example still works.
assert_array_equal(
FunctionTransformer(np.log1p).transform(X),
np.log1p(X),
)
def test_kw_arg():
X = np.linspace(0, 1, num=10).reshape((5, 2))
F = FunctionTransformer(np.around, kw_args=dict(decimals=3))
# Test that rounding is correct
assert_array_equal(F.transform(X),
np.around(X, decimals=3))
def test_kw_arg_update():
X = np.linspace(0, 1, num=10).reshape((5, 2))
F = FunctionTransformer(np.around, kw_args=dict(decimals=3))
F.kw_args['decimals'] = 1
# Test that rounding is correct
assert_array_equal(F.transform(X), np.around(X, decimals=1))
def test_kw_arg_reset():
X = np.linspace(0, 1, num=10).reshape((5, 2))
F = FunctionTransformer(np.around, kw_args=dict(decimals=3))
F.kw_args = dict(decimals=1)
# Test that rounding is correct
assert_array_equal(F.transform(X), np.around(X, decimals=1))
def test_inverse_transform():
X = np.array([1, 4, 9, 16]).reshape((2, 2))
# Test that inverse_transform works correctly
F = FunctionTransformer(
func=np.sqrt,
inverse_func=np.around, inv_kw_args=dict(decimals=3),
)
assert_array_equal(
F.inverse_transform(F.transform(X)),
np.around(np.sqrt(X), decimals=3),
)
def test_check_inverse():
X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
X_list = [X_dense,
sparse.csr_matrix(X_dense),
sparse.csc_matrix(X_dense)]
for X in X_list:
if sparse.issparse(X):
accept_sparse = True
else:
accept_sparse = False
trans = FunctionTransformer(func=np.sqrt,
inverse_func=np.around,
accept_sparse=accept_sparse,
check_inverse=True,
validate=True)
assert_warns_message(UserWarning,
"The provided functions are not strictly"
" inverse of each other. If you are sure you"
" want to proceed regardless, set"
" 'check_inverse=False'.",
trans.fit, X)
trans = FunctionTransformer(func=np.expm1,
inverse_func=np.log1p,
accept_sparse=accept_sparse,
check_inverse=True,
validate=True)
Xt = assert_no_warnings(trans.fit_transform, X)
assert_allclose_dense_sparse(X, trans.inverse_transform(Xt))
# check that the inverse check is skipped when either func or inverse_func
# is not provided.
trans = FunctionTransformer(func=np.expm1, inverse_func=None,
check_inverse=True, validate=True)
assert_no_warnings(trans.fit, X_dense)
trans = FunctionTransformer(func=None, inverse_func=np.expm1,
check_inverse=True, validate=True)
assert_no_warnings(trans.fit, X_dense)
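# A minimal sketch, not an upstream test: an exact func/inverse_func pair such
# as log1p/expm1 passes the (subsampled) check_inverse consistency probe with
# no warning and round-trips the data up to float tolerance.
def _sketch_exact_inverse_pair():
    X = np.abs(np.random.RandomState(0).randn(10, 2))
    ft = FunctionTransformer(func=np.log1p, inverse_func=np.expm1,
                             check_inverse=True, validate=True)
    Xt = assert_no_warnings(ft.fit_transform, X)
    assert_allclose_dense_sparse(X, ft.inverse_transform(Xt))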
def test_function_transformer_frame():
pd = pytest.importorskip('pandas')
X_df = pd.DataFrame(np.random.randn(100, 10))
transformer = FunctionTransformer()
X_df_trans = transformer.fit_transform(X_df)
assert hasattr(X_df_trans, 'loc')

View file

@ -0,0 +1,656 @@
import numpy as np
import pytest
from scipy.sparse import issparse
from scipy.sparse import coo_matrix
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import dok_matrix
from scipy.sparse import lil_matrix
from sklearn.utils.multiclass import type_of_target
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_warns_message
from sklearn.utils._testing import ignore_warnings
from sklearn.utils import _to_object_array
from sklearn.preprocessing._label import LabelBinarizer
from sklearn.preprocessing._label import MultiLabelBinarizer
from sklearn.preprocessing._label import LabelEncoder
from sklearn.preprocessing._label import label_binarize
from sklearn.preprocessing._label import _inverse_binarize_thresholding
from sklearn.preprocessing._label import _inverse_binarize_multiclass
from sklearn.preprocessing._label import _encode
from sklearn import datasets
iris = datasets.load_iris()
def toarray(a):
if hasattr(a, "toarray"):
a = a.toarray()
return a
def test_label_binarizer():
# one-class case defaults to negative label
# For dense case:
inp = ["pos", "pos", "pos", "pos"]
lb = LabelBinarizer(sparse_output=False)
expected = np.array([[0, 0, 0, 0]]).T
got = lb.fit_transform(inp)
assert_array_equal(lb.classes_, ["pos"])
assert_array_equal(expected, got)
assert_array_equal(lb.inverse_transform(got), inp)
# For sparse case:
lb = LabelBinarizer(sparse_output=True)
got = lb.fit_transform(inp)
assert issparse(got)
assert_array_equal(lb.classes_, ["pos"])
assert_array_equal(expected, got.toarray())
assert_array_equal(lb.inverse_transform(got.toarray()), inp)
lb = LabelBinarizer(sparse_output=False)
# two-class case
inp = ["neg", "pos", "pos", "neg"]
expected = np.array([[0, 1, 1, 0]]).T
got = lb.fit_transform(inp)
assert_array_equal(lb.classes_, ["neg", "pos"])
assert_array_equal(expected, got)
to_invert = np.array([[1, 0],
[0, 1],
[0, 1],
[1, 0]])
assert_array_equal(lb.inverse_transform(to_invert), inp)
# multi-class case
inp = ["spam", "ham", "eggs", "ham", "0"]
expected = np.array([[0, 0, 0, 1],
[0, 0, 1, 0],
[0, 1, 0, 0],
[0, 0, 1, 0],
[1, 0, 0, 0]])
got = lb.fit_transform(inp)
assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam'])
assert_array_equal(expected, got)
assert_array_equal(lb.inverse_transform(got), inp)
def test_label_binarizer_unseen_labels():
lb = LabelBinarizer()
expected = np.array([[1, 0, 0],
[0, 1, 0],
[0, 0, 1]])
got = lb.fit_transform(['b', 'd', 'e'])
assert_array_equal(expected, got)
expected = np.array([[0, 0, 0],
[1, 0, 0],
[0, 0, 0],
[0, 1, 0],
[0, 0, 1],
[0, 0, 0]])
got = lb.transform(['a', 'b', 'c', 'd', 'e', 'f'])
assert_array_equal(expected, got)
def test_label_binarizer_set_label_encoding():
lb = LabelBinarizer(neg_label=-2, pos_label=0)
# two-class case with pos_label=0
inp = np.array([0, 1, 1, 0])
expected = np.array([[-2, 0, 0, -2]]).T
got = lb.fit_transform(inp)
assert_array_equal(expected, got)
assert_array_equal(lb.inverse_transform(got), inp)
lb = LabelBinarizer(neg_label=-2, pos_label=2)
# multi-class case
inp = np.array([3, 2, 1, 2, 0])
expected = np.array([[-2, -2, -2, +2],
[-2, -2, +2, -2],
[-2, +2, -2, -2],
[-2, -2, +2, -2],
[+2, -2, -2, -2]])
got = lb.fit_transform(inp)
assert_array_equal(expected, got)
assert_array_equal(lb.inverse_transform(got), inp)
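# A minimal sketch, not an upstream test: neg_label/pos_label only rescale the
# 0/1 indicator matrix, so inverse_transform still recovers the original
# labels unchanged.
def _sketch_neg_pos_rescaling():
    lb = LabelBinarizer(neg_label=-1, pos_label=1)
    got = lb.fit_transform([0, 1, 2])
    assert_array_equal(got, [[1, -1, -1], [-1, 1, -1], [-1, -1, 1]])
    assert_array_equal(lb.inverse_transform(got), [0, 1, 2])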
@ignore_warnings
def test_label_binarizer_errors():
# Check that invalid arguments yield ValueError
one_class = np.array([0, 0, 0, 0])
lb = LabelBinarizer().fit(one_class)
multi_label = [(2, 3), (0,), (0, 2)]
with pytest.raises(ValueError):
lb.transform(multi_label)
lb = LabelBinarizer()
with pytest.raises(ValueError):
lb.transform([])
with pytest.raises(ValueError):
lb.inverse_transform([])
with pytest.raises(ValueError):
LabelBinarizer(neg_label=2, pos_label=1)
with pytest.raises(ValueError):
LabelBinarizer(neg_label=2, pos_label=2)
with pytest.raises(ValueError):
LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)
# Fail on y_type
with pytest.raises(ValueError):
_inverse_binarize_thresholding(y=csr_matrix([[1, 2], [2, 1]]),
output_type="foo", classes=[1, 2],
threshold=0)
# Sequence of seq type should raise ValueError
y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
with pytest.raises(ValueError):
LabelBinarizer().fit_transform(y_seq_of_seqs)
# Fail on the number of classes
with pytest.raises(ValueError):
_inverse_binarize_thresholding(y=csr_matrix([[1, 2], [2, 1]]),
output_type="foo",
classes=[1, 2, 3],
threshold=0)
# Fail on the dimension of 'binary'
with pytest.raises(ValueError):
_inverse_binarize_thresholding(y=np.array([[1, 2, 3], [2, 1, 3]]),
output_type="binary",
classes=[1, 2, 3],
threshold=0)
# Fail on multioutput data
with pytest.raises(ValueError):
LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
with pytest.raises(ValueError):
label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
@pytest.mark.parametrize(
"values, classes, unknown",
[(np.array([2, 1, 3, 1, 3], dtype='int64'),
np.array([1, 2, 3], dtype='int64'), np.array([4], dtype='int64')),
(np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
np.array(['a', 'b', 'c'], dtype=object),
np.array(['d'], dtype=object)),
(np.array(['b', 'a', 'c', 'a', 'c']),
np.array(['a', 'b', 'c']), np.array(['d']))],
ids=['int64', 'object', 'str'])
def test_label_encoder(values, classes, unknown):
# Test LabelEncoder's transform, fit_transform and
# inverse_transform methods
le = LabelEncoder()
le.fit(values)
assert_array_equal(le.classes_, classes)
assert_array_equal(le.transform(values), [1, 0, 2, 0, 2])
assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values)
le = LabelEncoder()
ret = le.fit_transform(values)
assert_array_equal(ret, [1, 0, 2, 0, 2])
with pytest.raises(ValueError, match="unseen labels"):
le.transform(unknown)
def test_label_encoder_negative_ints():
le = LabelEncoder()
le.fit([1, 1, 4, 5, -1, 0])
assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]),
[1, 2, 3, 3, 4, 0, 0])
assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]),
[0, 1, 4, 4, 5, -1, -1])
with pytest.raises(ValueError):
le.transform([0, 6])
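# A minimal sketch, not an upstream test: classes_ is the sorted unique set of
# the fitted labels, and transform is a positional lookup into it.
def _sketch_label_encoder_lookup():
    le = LabelEncoder().fit(['paris', 'tokyo', 'paris'])
    assert_array_equal(le.classes_, ['paris', 'tokyo'])
    assert_array_equal(le.transform(['tokyo', 'paris']), [1, 0])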
@pytest.mark.parametrize("dtype", ['str', 'object'])
def test_label_encoder_str_bad_shape(dtype):
le = LabelEncoder()
le.fit(np.array(["apple", "orange"], dtype=dtype))
msg = "should be a 1d array"
with pytest.raises(ValueError, match=msg):
le.transform("apple")
def test_label_encoder_errors():
# Check that invalid arguments yield ValueError
le = LabelEncoder()
with pytest.raises(ValueError):
le.transform([])
with pytest.raises(ValueError):
le.inverse_transform([])
# Fail on unseen labels
le = LabelEncoder()
le.fit([1, 2, 3, -1, 1])
msg = "contains previously unseen labels"
with pytest.raises(ValueError, match=msg):
le.inverse_transform([-2])
with pytest.raises(ValueError, match=msg):
le.inverse_transform([-2, -3, -4])
# Fail on inverse_transform("")
msg = r"should be a 1d array.+shape \(\)"
with pytest.raises(ValueError, match=msg):
le.inverse_transform("")
@pytest.mark.parametrize(
"values",
[np.array([2, 1, 3, 1, 3], dtype='int64'),
np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
np.array(['b', 'a', 'c', 'a', 'c'])],
ids=['int64', 'object', 'str'])
def test_label_encoder_empty_array(values):
le = LabelEncoder()
le.fit(values)
# test empty transform
transformed = le.transform([])
assert_array_equal(np.array([]), transformed)
# test empty inverse transform
inverse_transformed = le.inverse_transform([])
assert_array_equal(np.array([]), inverse_transformed)
def test_sparse_output_multilabel_binarizer():
# test input as iterable of iterables
inputs = [
lambda: [(2, 3), (1,), (1, 2)],
lambda: ({2, 3}, {1}, {1, 2}),
lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
]
indicator_mat = np.array([[0, 1, 1],
[1, 0, 0],
[1, 1, 0]])
inverse = inputs[0]()
for sparse_output in [True, False]:
for inp in inputs:
# With fit_transform
mlb = MultiLabelBinarizer(sparse_output=sparse_output)
got = mlb.fit_transform(inp())
assert issparse(got) == sparse_output
if sparse_output:
# verify CSR assumption that indices and indptr have same dtype
assert got.indices.dtype == got.indptr.dtype
got = got.toarray()
assert_array_equal(indicator_mat, got)
assert_array_equal([1, 2, 3], mlb.classes_)
assert mlb.inverse_transform(got) == inverse
# With fit
mlb = MultiLabelBinarizer(sparse_output=sparse_output)
got = mlb.fit(inp()).transform(inp())
assert issparse(got) == sparse_output
if sparse_output:
# verify CSR assumption that indices and indptr have same dtype
assert got.indices.dtype == got.indptr.dtype
got = got.toarray()
assert_array_equal(indicator_mat, got)
assert_array_equal([1, 2, 3], mlb.classes_)
assert mlb.inverse_transform(got) == inverse
with pytest.raises(ValueError):
mlb.inverse_transform(csr_matrix(np.array([[0, 1, 1],
[2, 0, 0],
[1, 1, 0]])))
def test_multilabel_binarizer():
# test input as iterable of iterables
inputs = [
lambda: [(2, 3), (1,), (1, 2)],
lambda: ({2, 3}, {1}, {1, 2}),
lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
]
indicator_mat = np.array([[0, 1, 1],
[1, 0, 0],
[1, 1, 0]])
inverse = inputs[0]()
for inp in inputs:
# With fit_transform
mlb = MultiLabelBinarizer()
got = mlb.fit_transform(inp())
assert_array_equal(indicator_mat, got)
assert_array_equal([1, 2, 3], mlb.classes_)
assert mlb.inverse_transform(got) == inverse
# With fit
mlb = MultiLabelBinarizer()
got = mlb.fit(inp()).transform(inp())
assert_array_equal(indicator_mat, got)
assert_array_equal([1, 2, 3], mlb.classes_)
assert mlb.inverse_transform(got) == inverse
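# A minimal sketch, not an upstream test: MultiLabelBinarizer treats each
# sample as a set of labels, so one row may switch on several columns at once.
def _sketch_multilabel_rows():
    mlb = MultiLabelBinarizer()
    got = mlb.fit_transform([{'news', 'sports'}, {'news'}])
    assert_array_equal(mlb.classes_, ['news', 'sports'])
    assert_array_equal(got, [[1, 1], [1, 0]])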
def test_multilabel_binarizer_empty_sample():
mlb = MultiLabelBinarizer()
y = [[1, 2], [1], []]
Y = np.array([[1, 1],
[1, 0],
[0, 0]])
assert_array_equal(mlb.fit_transform(y), Y)
def test_multilabel_binarizer_unknown_class():
mlb = MultiLabelBinarizer()
y = [[1, 2]]
Y = np.array([[1, 0], [0, 1]])
w = 'unknown class(es) [0, 4] will be ignored'
matrix = assert_warns_message(UserWarning, w,
mlb.fit(y).transform, [[4, 1], [2, 0]])
assert_array_equal(matrix, Y)
Y = np.array([[1, 0, 0], [0, 1, 0]])
mlb = MultiLabelBinarizer(classes=[1, 2, 3])
matrix = assert_warns_message(UserWarning, w,
mlb.fit(y).transform, [[4, 1], [2, 0]])
assert_array_equal(matrix, Y)
def test_multilabel_binarizer_given_classes():
inp = [(2, 3), (1,), (1, 2)]
indicator_mat = np.array([[0, 1, 1],
[1, 0, 0],
[1, 0, 1]])
# fit_transform()
mlb = MultiLabelBinarizer(classes=[1, 3, 2])
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
assert_array_equal(mlb.classes_, [1, 3, 2])
# fit().transform()
mlb = MultiLabelBinarizer(classes=[1, 3, 2])
assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
assert_array_equal(mlb.classes_, [1, 3, 2])
# ensure works with extra class
mlb = MultiLabelBinarizer(classes=[4, 1, 3, 2])
assert_array_equal(mlb.fit_transform(inp),
np.hstack(([[0], [0], [0]], indicator_mat)))
assert_array_equal(mlb.classes_, [4, 1, 3, 2])
# ensure fit is no-op as iterable is not consumed
inp = iter(inp)
mlb = MultiLabelBinarizer(classes=[1, 3, 2])
assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
# ensure a ValueError is thrown if given duplicate classes
err_msg = "The classes argument contains duplicate classes. Remove " \
"these duplicates before passing them to MultiLabelBinarizer."
mlb = MultiLabelBinarizer(classes=[1, 3, 2, 3])
with pytest.raises(ValueError, match=err_msg):
mlb.fit(inp)
def test_multilabel_binarizer_multiple_calls():
inp = [(2, 3), (1,), (1, 2)]
indicator_mat = np.array([[0, 1, 1],
[1, 0, 0],
[1, 0, 1]])
indicator_mat2 = np.array([[0, 1, 1],
[1, 0, 0],
[1, 1, 0]])
# first call
mlb = MultiLabelBinarizer(classes=[1, 3, 2])
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
# second call change class
mlb.classes = [1, 2, 3]
assert_array_equal(mlb.fit_transform(inp), indicator_mat2)
def test_multilabel_binarizer_same_length_sequence():
# Ensure sequences of the same length are not interpreted as a 2-d array
inp = [[1], [0], [2]]
indicator_mat = np.array([[0, 1, 0],
[1, 0, 0],
[0, 0, 1]])
# fit_transform()
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
# fit().transform()
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
def test_multilabel_binarizer_non_integer_labels():
tuple_classes = _to_object_array([(1,), (2,), (3,)])
inputs = [
([('2', '3'), ('1',), ('1', '2')], ['1', '2', '3']),
([('b', 'c'), ('a',), ('a', 'b')], ['a', 'b', 'c']),
([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes),
]
indicator_mat = np.array([[0, 1, 1],
[1, 0, 0],
[1, 1, 0]])
for inp, classes in inputs:
# fit_transform()
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
assert_array_equal(mlb.classes_, classes)
assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
# fit().transform()
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
assert_array_equal(mlb.classes_, classes)
assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
mlb = MultiLabelBinarizer()
with pytest.raises(TypeError):
mlb.fit_transform([({}), ({}, {'a': 'b'})])
def test_multilabel_binarizer_non_unique():
inp = [(1, 1, 1, 0)]
indicator_mat = np.array([[1, 1]])
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
def test_multilabel_binarizer_inverse_validation():
inp = [(1, 1, 1, 0)]
mlb = MultiLabelBinarizer()
mlb.fit_transform(inp)
# Not binary
with pytest.raises(ValueError):
mlb.inverse_transform(np.array([[1, 3]]))
# The following binary cases are fine, however
mlb.inverse_transform(np.array([[0, 0]]))
mlb.inverse_transform(np.array([[1, 1]]))
mlb.inverse_transform(np.array([[1, 0]]))
# Wrong shape
with pytest.raises(ValueError):
mlb.inverse_transform(np.array([[1]]))
with pytest.raises(ValueError):
mlb.inverse_transform(np.array([[1, 1, 1]]))
def test_label_binarize_with_class_order():
out = label_binarize([1, 6], classes=[1, 2, 4, 6])
expected = np.array([[1, 0, 0, 0], [0, 0, 0, 1]])
assert_array_equal(out, expected)
# Modified class order
out = label_binarize([1, 6], classes=[1, 6, 4, 2])
expected = np.array([[1, 0, 0, 0], [0, 1, 0, 0]])
assert_array_equal(out, expected)
out = label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1])
expected = np.array([[0, 0, 1, 0],
[0, 0, 0, 1],
[0, 1, 0, 0],
[1, 0, 0, 0]])
assert_array_equal(out, expected)
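# A minimal sketch, not an upstream test: the column layout of label_binarize
# follows the classes argument verbatim, even for unsorted string classes.
def _sketch_label_binarize_columns():
    out = label_binarize(['b', 'a'], classes=['b', 'c', 'a'])
    assert_array_equal(out, [[1, 0, 0], [0, 0, 1]])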
def check_binarized_results(y, classes, pos_label, neg_label, expected):
for sparse_output in [True, False]:
if ((pos_label == 0 or neg_label != 0) and sparse_output):
with pytest.raises(ValueError):
label_binarize(y, classes=classes, neg_label=neg_label,
pos_label=pos_label,
sparse_output=sparse_output)
continue
# check label_binarize
binarized = label_binarize(y, classes=classes, neg_label=neg_label,
pos_label=pos_label,
sparse_output=sparse_output)
assert_array_equal(toarray(binarized), expected)
assert issparse(binarized) == sparse_output
# check inverse
y_type = type_of_target(y)
if y_type == "multiclass":
inversed = _inverse_binarize_multiclass(binarized, classes=classes)
else:
inversed = _inverse_binarize_thresholding(binarized,
output_type=y_type,
classes=classes,
threshold=((neg_label +
pos_label) /
2.))
assert_array_equal(toarray(inversed), toarray(y))
# Check label binarizer
lb = LabelBinarizer(neg_label=neg_label, pos_label=pos_label,
sparse_output=sparse_output)
binarized = lb.fit_transform(y)
assert_array_equal(toarray(binarized), expected)
assert issparse(binarized) == sparse_output
inverse_output = lb.inverse_transform(binarized)
assert_array_equal(toarray(inverse_output), toarray(y))
assert issparse(inverse_output) == issparse(y)
def test_label_binarize_binary():
y = [0, 1, 0]
classes = [0, 1]
pos_label = 2
neg_label = -1
expected = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1))
check_binarized_results(y, classes, pos_label, neg_label, expected)
# Binary case where sparse_output = True will not result in a ValueError
y = [0, 1, 0]
classes = [0, 1]
pos_label = 3
neg_label = 0
expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1))
check_binarized_results(y, classes, pos_label, neg_label, expected)
def test_label_binarize_multiclass():
y = [0, 1, 2]
classes = [0, 1, 2]
pos_label = 2
neg_label = 0
expected = 2 * np.eye(3)
check_binarized_results(y, classes, pos_label, neg_label, expected)
with pytest.raises(ValueError):
label_binarize(y, classes=classes, neg_label=-1, pos_label=pos_label,
sparse_output=True)
def test_label_binarize_multilabel():
y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
classes = [0, 1, 2]
pos_label = 2
neg_label = 0
expected = pos_label * y_ind
y_sparse = [sparse_matrix(y_ind)
for sparse_matrix in [coo_matrix, csc_matrix, csr_matrix,
dok_matrix, lil_matrix]]
for y in [y_ind] + y_sparse:
check_binarized_results(y, classes, pos_label, neg_label,
expected)
with pytest.raises(ValueError):
label_binarize(y, classes=classes, neg_label=-1, pos_label=pos_label,
sparse_output=True)
def test_invalid_input_label_binarize():
with pytest.raises(ValueError):
label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1)
with pytest.raises(ValueError, match="continuous target data is not "):
label_binarize([1.2, 2.7], classes=[0, 1])
with pytest.raises(ValueError, match="mismatch with the labels"):
label_binarize([[1, 3]], classes=[1, 2, 3])
def test_inverse_binarize_multiclass():
got = _inverse_binarize_multiclass(csr_matrix([[0, 1, 0],
[-1, 0, -1],
[0, 0, 0]]),
np.arange(3))
assert_array_equal(got, np.array([1, 1, 0]))
@pytest.mark.parametrize(
"values, expected",
[(np.array([2, 1, 3, 1, 3], dtype='int64'),
np.array([1, 2, 3], dtype='int64')),
(np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
np.array(['a', 'b', 'c'], dtype=object)),
(np.array(['b', 'a', 'c', 'a', 'c']),
np.array(['a', 'b', 'c']))],
ids=['int64', 'object', 'str'])
def test_encode_util(values, expected):
uniques = _encode(values)
assert_array_equal(uniques, expected)
uniques, encoded = _encode(values, encode=True)
assert_array_equal(uniques, expected)
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
_, encoded = _encode(values, uniques, encode=True)
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
def test_encode_check_unknown():
# test for the check_unknown parameter of _encode()
uniques = np.array([1, 2, 3])
values = np.array([1, 2, 3, 4])
# Default is True, raise error
with pytest.raises(ValueError,
match='y contains previously unseen labels'):
_encode(values, uniques, encode=True, check_unknown=True)
# don't raise an error if check_unknown=False
_encode(values, uniques, encode=True, check_unknown=False)
# parameter is ignored for object dtype
uniques = np.array(['a', 'b', 'c'], dtype=object)
values = np.array(['a', 'b', 'c', 'd'], dtype=object)
with pytest.raises(ValueError,
match='y contains previously unseen labels'):
_encode(values, uniques, encode=True, check_unknown=False)
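# A minimal sketch, not an upstream test: the public LabelEncoder exposes the
# same contract _encode enforces above -- unseen values at transform time
# raise instead of silently mapping to some code.
def _sketch_unseen_raises():
    le = LabelEncoder().fit([1, 2, 3])
    with pytest.raises(ValueError, match='unseen labels'):
        le.transform([4])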