Uploaded Test files
This commit is contained in:
parent
f584ad9d97
commit
2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,158 @@
|
|||
import warnings
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
||||
from scipy import sparse
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
from sklearn.base import clone
|
||||
|
||||
from sklearn.preprocessing import maxabs_scale
|
||||
from sklearn.preprocessing import minmax_scale
|
||||
from sklearn.preprocessing import scale
|
||||
from sklearn.preprocessing import power_transform
|
||||
from sklearn.preprocessing import quantile_transform
|
||||
from sklearn.preprocessing import robust_scale
|
||||
|
||||
from sklearn.preprocessing import MaxAbsScaler
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.preprocessing import PowerTransformer
|
||||
from sklearn.preprocessing import QuantileTransformer
|
||||
from sklearn.preprocessing import RobustScaler
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
|
||||
# Iris dataset loaded once at import time; test_missing_value_handling works on a copy of iris.data.
iris = load_iris()
|
||||
|
||||
|
||||
def _get_valid_samples_by_column(X, col):
    """Return the non-NaN entries of column ``col`` of ``X`` as a 2D column.

    The column is selected with a list index so the result keeps two
    dimensions; rows whose value in that column is NaN are dropped.
    """
    keep = ~np.isnan(X[:, col])
    column = X[:, [col]]
    return column[keep]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive",
    [(MaxAbsScaler(), maxabs_scale, True, False),
     (MinMaxScaler(), minmax_scale, False, False),
     (StandardScaler(), scale, False, False),
     (StandardScaler(with_mean=False), scale, True, False),
     (PowerTransformer('yeo-johnson'), power_transform, False, False),
     (PowerTransformer('box-cox'), power_transform, False, True),
     (QuantileTransformer(n_quantiles=10), quantile_transform, True, False),
     (RobustScaler(), robust_scale, False, False),
     (RobustScaler(with_centering=False), robust_scale, True, False)]
)
def test_missing_value_handling(est, func, support_sparse, strictly_positive):
    """Check that NaN values pass through the estimator and its function form.

    Parameters
    ----------
    est : estimator instance under test.
    func : function counterpart, called with ``est.get_params()``.
    support_sparse : whether the sparse-input comparison is also run.
    strictly_positive : whether the data is shifted before fitting.

    Warnings are recorded with ``warnings.catch_warnings(record=True)``
    rather than ``pytest.warns(None)``, which is deprecated in pytest 7 and
    rejected by later releases.
    """
    # check that the preprocessing method let pass nan
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[rng.randint(X.shape[0], size=n_missing),
      rng.randint(X.shape[1], size=n_missing)] = np.nan
    if strictly_positive:
        X += np.nanmin(X) + 0.1
    X_train, X_test = train_test_split(X, random_state=1)
    # sanity check
    assert not np.all(np.isnan(X_train), axis=0).any()
    assert np.any(np.isnan(X_train), axis=0).all()
    assert np.any(np.isnan(X_test), axis=0).all()
    X_test[:, 0] = np.nan  # make sure this boundary case is tested

    with warnings.catch_warnings(record=True) as records:
        # record every warning, regardless of the filters already installed
        warnings.simplefilter("always")
        Xt = est.fit(X_train).transform(X_test)
    # ensure no warnings are raised
    assert len(records) == 0
    # missing values should still be missing, and only them
    assert_array_equal(np.isnan(Xt), np.isnan(X_test))

    # check that the function leads to the same results as the class
    with warnings.catch_warnings(record=True) as records:
        warnings.simplefilter("always")
        Xt_class = est.transform(X_train)
    assert len(records) == 0
    Xt_func = func(X_train, **est.get_params())
    assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
    assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)])

    # check that the inverse transform keep NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can introduce equal_nan=True in recent version of numpy.
    # For the moment which just check that non-NaN values are almost equal.
    assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])

    for i in range(X.shape[1]):
        # train only on non-NaN
        est.fit(_get_valid_samples_by_column(X_train, i))
        # check transforming with NaN works even when training without NaN
        with warnings.catch_warnings(record=True) as records:
            warnings.simplefilter("always")
            Xt_col = est.transform(X_test[:, [i]])
        assert len(records) == 0
        assert_allclose(Xt_col, Xt[:, [i]])
        # check non-NaN is handled as before - the 1st column is all nan
        if not np.isnan(X_test[:, i]).all():
            Xt_col_nonan = est.transform(
                _get_valid_samples_by_column(X_test, i))
            assert_array_equal(Xt_col_nonan,
                               Xt_col[~np.isnan(Xt_col.squeeze())])

    if support_sparse:
        est_dense = clone(est)
        est_sparse = clone(est)

        with warnings.catch_warnings(record=True) as records:
            warnings.simplefilter("always")
            Xt_dense = est_dense.fit(X_train).transform(X_test)
            Xt_inv_dense = est_dense.inverse_transform(Xt_dense)
        assert len(records) == 0
        for sparse_constructor in (sparse.csr_matrix, sparse.csc_matrix,
                                   sparse.bsr_matrix, sparse.coo_matrix,
                                   sparse.dia_matrix, sparse.dok_matrix,
                                   sparse.lil_matrix):
            # check that the dense and sparse inputs lead to the same results
            # precompute the matrix to avoid catching side warnings
            X_train_sp = sparse_constructor(X_train)
            X_test_sp = sparse_constructor(X_test)
            with warnings.catch_warnings(record=True) as records:
                warnings.simplefilter("always")
                # added after "always", so it takes precedence over it
                warnings.simplefilter('ignore', PendingDeprecationWarning)
                Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp)
            assert len(records) == 0
            # .toarray() is the explicit spelling of the ``.A`` shorthand
            assert_allclose(Xt_sp.toarray(), Xt_dense)
            with warnings.catch_warnings(record=True) as records:
                warnings.simplefilter("always")
                warnings.simplefilter('ignore', PendingDeprecationWarning)
                Xt_inv_sp = est_sparse.inverse_transform(Xt_sp)
            assert len(records) == 0
            assert_allclose(Xt_inv_sp.toarray(), Xt_inv_dense)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "est, func",
    [(MaxAbsScaler(), maxabs_scale),
     (MinMaxScaler(), minmax_scale),
     (StandardScaler(), scale),
     (StandardScaler(with_mean=False), scale),
     (PowerTransformer('yeo-johnson'), power_transform),
     (PowerTransformer('box-cox'), power_transform,),
     (QuantileTransformer(n_quantiles=3), quantile_transform),
     (RobustScaler(), robust_scale),
     (RobustScaler(with_centering=False), robust_scale)]
)
def test_missing_value_pandas_na_support(est, func):
    # Fitting on a DataFrame of nullable-integer columns (pd.NA) must give
    # the same output as fitting on the equivalent NaN-holding ndarray.
    pd = pytest.importorskip('pandas', minversion="1.0")

    rows = [[1, 2, 3, np.nan, np.nan, 4, 5, 1],
            [np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8],
            [1, 2, 3, 4, 5, 6, 7, 8]]
    X = np.array(rows).T

    # Columns 'a' and 'b' stay "Int16"; column 'c' is cast to plain int.
    X_df = pd.DataFrame(X, dtype="Int16", columns=['a', 'b', 'c'])
    X_df['c'] = X_df['c'].astype('int')

    from_array = est.fit_transform(X)
    from_frame = est.fit_transform(X_df)

    assert_allclose(from_array, from_frame)
|
||||
2508
venv/Lib/site-packages/sklearn/preprocessing/tests/test_data.py
Normal file
2508
venv/Lib/site-packages/sklearn/preprocessing/tests/test_data.py
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -0,0 +1,283 @@
|
|||
|
||||
import pytest
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
import warnings
|
||||
|
||||
from sklearn.preprocessing import KBinsDiscretizer
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.utils._testing import (
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
assert_warns_message
|
||||
)
|
||||
|
||||
# Shared 4-sample, 4-feature input used by the tests below.
X = [
    [-2, 1.5, -4, -1],
    [-1, 2.5, -3, -0.5],
    [0, 3.5, -2, 0.5],
    [1, 4.5, -1, 2],
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    'strategy, expected',
    [('uniform', [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]]),
     ('kmeans', [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]),
     ('quantile', [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]])])
def test_fit_transform(strategy, expected):
    # Ordinal 3-bin discretization of the shared X, per strategy.
    discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal',
                                   strategy=strategy)
    discretizer.fit(X)
    binned = discretizer.transform(X)
    assert_array_equal(expected, binned)
|
||||
|
||||
|
||||
def test_valid_n_bins():
    """Check that integer ``n_bins`` values are accepted and stored as ints."""
    KBinsDiscretizer(n_bins=2).fit_transform(X)
    # a NumPy integer scalar must be accepted as well as a Python int
    KBinsDiscretizer(n_bins=np.array([2])[0]).fit_transform(X)
    # ``np.int`` was only an alias of the builtin ``int``; it was deprecated
    # in NumPy 1.20 and later removed, so the builtin is used directly.
    assert KBinsDiscretizer(n_bins=2).fit(X).n_bins_.dtype == np.dtype(int)
|
||||
|
||||
|
||||
def test_invalid_n_bins():
    # A scalar n_bins below 2, or of float type, is rejected with ValueError.
    cases = [
        (1, ("KBinsDiscretizer received an invalid "
             "number of bins. Received 1, expected at least 2.")),
        (1.1, ("KBinsDiscretizer received an invalid "
               "n_bins type. Received float, expected int.")),
    ]
    for bad_n_bins, err_msg in cases:
        est = KBinsDiscretizer(n_bins=bad_n_bins)
        with pytest.raises(ValueError, match=err_msg):
            est.fit_transform(X)
|
||||
|
||||
|
||||
def test_invalid_n_bins_array():
    # Array-like n_bins values that must be rejected with ValueError.
    shape_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
    cases = [
        # Bad shape
        (np.full((2, 4), 2.), shape_msg),
        # Incorrect number of features
        ([1, 2, 2], shape_msg),
        # Bad bin values
        ([1, 2, 2, 1],
         ("KBinsDiscretizer received an invalid number of bins "
          "at indices 0, 3. Number of bins must be at least 2, "
          "and must be an int.")),
        # Float bin values
        ([2.1, 2, 2.1, 2],
         ("KBinsDiscretizer received an invalid number of bins "
          "at indices 0, 2. Number of bins must be at least 2, "
          "and must be an int.")),
    ]
    for n_bins, err_msg in cases:
        est = KBinsDiscretizer(n_bins=n_bins)
        with pytest.raises(ValueError, match=err_msg):
            est.fit_transform(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    'strategy, expected',
    [('uniform', [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]]),
     ('kmeans', [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]]),
     ('quantile', [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]])])
def test_fit_transform_n_bins_array(strategy, expected):
    # Per-feature n_bins: check the ordinal codes and the bin_edges_ layout.
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode='ordinal',
                           strategy=strategy)
    est = est.fit(X)
    assert_array_equal(expected, est.transform(X))

    # one edge array per feature, each holding n_bins + 1 edges
    n_features = np.array(X).shape[1]
    assert est.bin_edges_.shape == (n_features, )
    for feature_edges, feature_bins in zip(est.bin_edges_, est.n_bins_):
        assert feature_edges.shape == (feature_bins + 1, )
|
||||
|
||||
|
||||
def test_invalid_n_features():
    # transform must reject input whose feature count differs from fit's.
    fitted = KBinsDiscretizer(n_bins=3).fit(X)
    five_feature_X = np.arange(25).reshape(5, -1)
    err_msg = "Incorrect number of features. Expecting 4, received 5"
    with pytest.raises(ValueError, match=err_msg):
        fitted.transform(five_feature_X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
def test_same_min_max(strategy):
    """Check the handling of a constant feature.

    Fitting must emit a ``UserWarning`` about feature 0, leave a single bin
    for it, and ``transform`` must then map that feature to zeros.

    The warning is captured with a scoped ``pytest.warns`` block, so the
    process-wide warning filters are no longer modified by this test.
    """
    X = np.array([[1, -2],
                  [1, -1],
                  [1, 0],
                  [1, 1]])
    est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode='ordinal')
    warning_message = ("Feature 0 is constant and will be replaced "
                       "with 0.")
    with pytest.warns(UserWarning, match=warning_message):
        est.fit(X)
    assert est.n_bins_[0] == 1
    # replace the feature with zeros
    Xt = est.transform(X)
    assert_array_equal(Xt[:, 0], np.zeros(X.shape[0]))
|
||||
|
||||
|
||||
def test_transform_1d_behavior():
    # 1D input is rejected by both fit and transform.
    flat = np.arange(4)

    unfitted = KBinsDiscretizer(n_bins=2)
    with pytest.raises(ValueError):
        unfitted.fit(flat)

    fitted = KBinsDiscretizer(n_bins=2)
    fitted.fit(flat.reshape(-1, 1))
    with pytest.raises(ValueError):
        fitted.transform(flat)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('i', range(1, 9))
def test_numeric_stability(i):
    # The same bin codes are expected whatever power of ten the data is
    # divided by.
    base = np.array([2., 4., 6., 8., 10.]).reshape(-1, 1)
    expected_codes = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)

    # Test up to discretizing nano units
    scaled = base / 10**i
    discretizer = KBinsDiscretizer(n_bins=2, encode='ordinal')
    codes = discretizer.fit_transform(scaled)
    assert_array_equal(expected_codes, codes)
|
||||
|
||||
|
||||
def test_invalid_encode_option():
    # An unknown `encode` value is reported when fitting.
    expected_error = (r"Valid options for 'encode' are "
                      r"\('onehot', 'onehot-dense', 'ordinal'\). "
                      r"Got encode='invalid-encode' instead.")
    discretizer = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                                   encode='invalid-encode')
    with pytest.raises(ValueError, match=expected_error):
        discretizer.fit(X)
|
||||
|
||||
|
||||
def test_encode_options():
    # The one-hot encodings must equal OneHotEncoder applied to the ordinal
    # output, in dense and in sparse form.
    ordinal = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                               encode='ordinal').fit(X)
    Xt_ordinal = ordinal.transform(X)

    dense = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                             encode='onehot-dense').fit(X)
    Xt_dense = dense.transform(X)
    assert not sp.issparse(Xt_dense)
    dense_encoder = OneHotEncoder(
        categories=[np.arange(i) for i in [2, 3, 3, 3]],
        sparse=False)
    assert_array_equal(dense_encoder.fit_transform(Xt_ordinal), Xt_dense)

    onehot = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                              encode='onehot').fit(X)
    Xt_sparse = onehot.transform(X)
    assert sp.issparse(Xt_sparse)
    sparse_encoder = OneHotEncoder(
        categories=[np.arange(i) for i in [2, 3, 3, 3]],
        sparse=True)
    assert_array_equal(sparse_encoder.fit_transform(Xt_ordinal).toarray(),
                       Xt_sparse.toarray())
|
||||
|
||||
|
||||
def test_invalid_strategy_option():
    # An unknown `strategy` value is reported when fitting.
    expected_error = (r"Valid options for 'strategy' are "
                      r"\('uniform', 'quantile', 'kmeans'\). "
                      r"Got strategy='invalid-strategy' instead.")
    discretizer = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                                   strategy='invalid-strategy')
    with pytest.raises(ValueError, match=expected_error):
        discretizer.fit(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    'strategy, expected_2bins, expected_3bins, expected_5bins',
    [('uniform', [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]),
     ('kmeans', [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]),
     ('quantile', [0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2], [0, 1, 2, 3, 4, 4])])
def test_nonuniform_strategies(
        strategy, expected_2bins, expected_3bins, expected_5bins):
    # Unevenly spaced single feature, discretized with 2, 3 and then 5 bins.
    X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)

    expectations = ((2, expected_2bins),
                    (3, expected_3bins),
                    (5, expected_5bins))
    for n_bins, expected in expectations:
        est = KBinsDiscretizer(n_bins=n_bins, strategy=strategy,
                               encode='ordinal')
        Xt = est.fit_transform(X)
        assert_array_equal(expected, Xt.ravel())
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    'strategy, expected_inv',
    [('uniform', [[-1.5, 2., -3.5, -0.5], [-0.5, 3., -2.5, -0.5],
                  [0.5, 4., -1.5, 0.5], [0.5, 4., -1.5, 1.5]]),
     ('kmeans', [[-1.375, 2.125, -3.375, -0.5625],
                 [-1.375, 2.125, -3.375, -0.5625],
                 [-0.125, 3.375, -2.125, 0.5625],
                 [0.75, 4.25, -1.25, 1.625]]),
     ('quantile', [[-1.5, 2., -3.5, -0.75], [-0.5, 3., -2.5, 0.],
                   [0.5, 4., -1.5, 1.25], [0.5, 4., -1.5, 1.25]])])
@pytest.mark.parametrize('encode', ['ordinal', 'onehot', 'onehot-dense'])
def test_inverse_transform(strategy, encode, expected_inv):
    # inverse_transform of the transformed X must match the expected values
    # for every encoding.
    discretizer = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    encoded = discretizer.fit_transform(X)
    recovered = discretizer.inverse_transform(encoded)
    assert_array_almost_equal(expected_inv, recovered)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
def test_transform_outside_fit_range(strategy):
    # Values below and above the fitted range fall in the first and last bin.
    X_fit = np.array([0, 1, 2, 3])[:, None]
    kbd = KBinsDiscretizer(n_bins=4, strategy=strategy, encode='ordinal')
    kbd.fit(X_fit)

    X_outside = np.array([-2, 5])[:, None]
    codes = kbd.transform(X_outside)
    highest = codes.max(axis=0)
    lowest = codes.min(axis=0)
    assert_array_equal(highest + 1, kbd.n_bins_)
    assert_array_equal(lowest, [0])
|
||||
|
||||
|
||||
def test_overwrite():
    # Neither fit_transform nor inverse_transform may modify its input.
    X = np.array([0, 1, 2, 3])[:, None]
    X_snapshot = X.copy()

    est = KBinsDiscretizer(n_bins=3, encode="ordinal")
    Xt = est.fit_transform(X)
    assert_array_equal(X, X_snapshot)

    Xt_snapshot = Xt.copy()
    Xinv = est.inverse_transform(Xt)
    assert_array_equal(Xt, Xt_snapshot)
    expected_inverse = np.array([[0.5], [1.5], [2.5], [2.5]])
    assert_array_equal(Xinv, expected_inverse)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    'strategy, expected_bin_edges',
    [('quantile', [0, 1, 3]), ('kmeans', [0, 1.5, 3])])
def test_redundant_bins(strategy, expected_bin_edges):
    # Fitting warns that too-narrow bins are removed; check the edges left.
    samples = [[0], [0], [0], [0], [3], [3]]
    msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 "
           "are removed. Consider decreasing the number of bins.")
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy)
    assert_warns_message(UserWarning, msg, kbd.fit, samples)
    first_feature_edges = kbd.bin_edges_[0]
    assert_array_almost_equal(first_feature_edges, expected_bin_edges)
|
||||
|
||||
|
||||
def test_percentile_numeric_stability():
    # Quantile strategy with 10 requested bins on three samples: fitting
    # warns about removed bins; check the edges and codes that remain.
    X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
    expected_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
    expected_codes = np.array([0, 0, 4]).reshape(-1, 1)
    msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 "
           "are removed. Consider decreasing the number of bins.")
    kbd = KBinsDiscretizer(n_bins=10, encode='ordinal',
                           strategy='quantile')
    assert_warns_message(UserWarning, msg, kbd.fit, X)
    assert_array_almost_equal(kbd.bin_edges_[0], expected_edges)
    assert_array_almost_equal(kbd.transform(X), expected_codes)
|
||||
|
|
@ -0,0 +1,698 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
import pytest
|
||||
|
||||
from sklearn.exceptions import NotFittedError
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.preprocessing import OrdinalEncoder
|
||||
|
||||
|
||||
def test_one_hot_encoder_sparse_dense():
    # check that sparse and dense will give the same results
    X = np.array([[3, 2, 1], [0, 1, 1]])

    sparse_out = OneHotEncoder().fit_transform(X)
    dense_out = OneHotEncoder(sparse=False).fit_transform(X)

    for encoded in (sparse_out, dense_out):
        assert encoded.shape == (2, 5)

    assert sparse.issparse(sparse_out)
    assert not sparse.issparse(dense_out)

    # check outcome
    expected = [[0., 1., 0., 1., 1.],
                [1., 0., 1., 0., 1.]]
    assert_array_equal(sparse_out.toarray(), expected)
    assert_array_equal(sparse_out.toarray(), dense_out)
|
||||
|
||||
|
||||
def test_one_hot_encoder_diff_n_features():
    # transform rejects input with a different number of features than fit.
    X_fit = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
    X_narrow = np.array([[1, 0]])
    enc = OneHotEncoder().fit(X_fit)
    err_msg = ("The number of features in X is different to the number of "
               "features of the fitted data.")
    with pytest.raises(ValueError, match=err_msg):
        enc.transform(X_narrow)
|
||||
|
||||
|
||||
def test_one_hot_encoder_handle_unknown():
    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
    X2 = np.array([[4, 1, 1]])

    # handle_unknown='error': categories unseen during fit raise at transform.
    strict = OneHotEncoder(handle_unknown='error')
    strict.fit(X)
    with pytest.raises(ValueError, match='Found unknown categories'):
        strict.transform(X2)

    # handle_unknown='ignore': unknown categories are encoded as all zeros.
    lenient = OneHotEncoder(handle_unknown='ignore')
    lenient.fit(X)
    X2_passed = X2.copy()
    encoded = lenient.transform(X2_passed).toarray()
    assert_array_equal(encoded,
                       np.array([[0., 0., 0., 0., 1., 0., 0.]]))
    # ensure transformed data was not modified in place
    assert_allclose(X2, X2_passed)

    # Raise error if handle_unknown is neither ignore or error.
    invalid = OneHotEncoder(handle_unknown='42')
    with pytest.raises(ValueError, match='handle_unknown should be either'):
        invalid.fit(X)
|
||||
|
||||
|
||||
def test_one_hot_encoder_not_fitted():
    # Calling transform before fit raises NotFittedError.
    unfitted = OneHotEncoder(categories=['a', 'b'])
    data = np.array([['a'], ['b']])
    msg = ("This OneHotEncoder instance is not fitted yet. "
           "Call 'fit' with appropriate arguments before using this "
           "estimator.")
    with pytest.raises(NotFittedError, match=msg):
        unfitted.transform(data)
|
||||
|
||||
|
||||
def test_one_hot_encoder_handle_unknown_strings():
    # Non Regression test for the issue #12470
    # Test the ignore option, when categories are numpy string dtype
    # particularly when the known category strings are larger
    # than the unknown category strings
    known = np.array(['11111111', '22', '333', '4444']).reshape((-1, 1))
    X2 = np.array(['55555', '22']).reshape((-1, 1))

    oh = OneHotEncoder(handle_unknown='ignore')
    oh.fit(known)
    X2_passed = X2.copy()
    encoded = oh.transform(X2_passed).toarray()
    expected = np.array([[0., 0., 0., 0.], [0., 1., 0., 0.]])
    assert_array_equal(encoded, expected)
    # ensure transformed data was not modified in place
    assert_array_equal(X2, X2_passed)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
@pytest.mark.parametrize("input_dtype", [np.int32, np.float32, np.float64])
def test_one_hot_encoder_dtype(input_dtype, output_dtype):
    # The encoded output matches an expected array built with output_dtype,
    # for both the default (sparse) and the dense encoder.
    X = np.asarray([[0, 1]], dtype=input_dtype).T
    X_expected = np.asarray([[1, 0], [0, 1]], dtype=output_dtype)

    sparse_enc = OneHotEncoder(categories='auto', dtype=output_dtype)
    one_step = sparse_enc.fit_transform(X).toarray()
    assert_array_equal(one_step, X_expected)
    two_step = sparse_enc.fit(X).transform(X).toarray()
    assert_array_equal(two_step, X_expected)

    dense_enc = OneHotEncoder(categories='auto', dtype=output_dtype,
                              sparse=False)
    one_step = dense_enc.fit_transform(X)
    assert_array_equal(one_step, X_expected)
    two_step = dense_enc.fit(X).transform(X)
    assert_array_equal(two_step, X_expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
def test_one_hot_encoder_dtype_pandas(output_dtype):
    # Same dtype check as above, on a DataFrame mixing a string and an
    # integer column.
    pd = pytest.importorskip('pandas')

    frame = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})
    X_expected = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=output_dtype)

    sparse_enc = OneHotEncoder(dtype=output_dtype)
    one_step = sparse_enc.fit_transform(frame).toarray()
    assert_array_equal(one_step, X_expected)
    two_step = sparse_enc.fit(frame).transform(frame).toarray()
    assert_array_equal(two_step, X_expected)

    dense_enc = OneHotEncoder(dtype=output_dtype, sparse=False)
    one_step = dense_enc.fit_transform(frame)
    assert_array_equal(one_step, X_expected)
    two_step = dense_enc.fit(frame).transform(frame)
    assert_array_equal(two_step, X_expected)
|
||||
|
||||
|
||||
def test_one_hot_encoder_feature_names():
    # get_feature_names with default and with user-supplied input names,
    # plus the error raised for a list of the wrong length.
    X = [['Male', 1, 'girl', 2, 3],
         ['Female', 41, 'girl', 1, 10],
         ['Male', 51, 'boy', 12, 3],
         ['Male', 91, 'girl', 21, 30]]
    enc = OneHotEncoder()
    enc.fit(X)

    default_names = enc.get_feature_names()
    assert isinstance(default_names, np.ndarray)
    expected_default = ['x0_Female', 'x0_Male',
                        'x1_1', 'x1_41', 'x1_51', 'x1_91',
                        'x2_boy', 'x2_girl',
                        'x3_1', 'x3_2', 'x3_12', 'x3_21',
                        'x4_3',
                        'x4_10', 'x4_30']
    assert_array_equal(expected_default, default_names)

    custom_names = enc.get_feature_names(['one', 'two',
                                          'three', 'four', 'five'])
    expected_custom = ['one_Female', 'one_Male',
                       'two_1', 'two_41', 'two_51', 'two_91',
                       'three_boy', 'three_girl',
                       'four_1', 'four_2', 'four_12', 'four_21',
                       'five_3', 'five_10', 'five_30']
    assert_array_equal(expected_custom, custom_names)

    with pytest.raises(ValueError, match="input_features should have length"):
        enc.get_feature_names(['one', 'two'])
|
||||
|
||||
|
||||
def test_one_hot_encoder_feature_names_unicode():
    # Non-ASCII characters in categories and in input feature names are
    # kept in the generated feature names.
    X = np.array([['c❤t1', 'dat2']], dtype=object).T
    enc = OneHotEncoder()
    enc.fit(X)

    default_names = enc.get_feature_names()
    assert_array_equal(['x0_c❤t1', 'x0_dat2'], default_names)

    custom_names = enc.get_feature_names(input_features=['n👍me'])
    assert_array_equal(['n👍me_c❤t1', 'n👍me_dat2'], custom_names)
|
||||
|
||||
|
||||
def test_one_hot_encoder_set_params():
    # set_params(categories=...) takes effect both before the first fit
    # and on an already fitted encoder.
    X = np.array([[1, 2]]).T
    oh = OneHotEncoder()

    # set params on not yet fitted object
    oh.set_params(categories=[[0, 1, 2, 3]])
    assert oh.get_params()['categories'] == [[0, 1, 2, 3]]
    first_shape = oh.fit_transform(X).toarray().shape
    assert first_shape == (2, 4)

    # set params on already fitted object
    oh.set_params(categories=[[0, 1, 2, 3, 4]])
    second_shape = oh.fit_transform(X).toarray().shape
    assert second_shape == (2, 5)
|
||||
|
||||
|
||||
def check_categorical_onehot(X):
    """Encode X with a sparse and a dense OneHotEncoder and compare them.

    Asserts that both outputs are close and that the sparse one is a CSR
    matrix, then returns the sparse output converted to a dense array.
    """
    sparse_result = OneHotEncoder(categories='auto').fit_transform(X)
    dense_result = OneHotEncoder(categories='auto',
                                 sparse=False).fit_transform(X)

    assert_allclose(sparse_result.toarray(), dense_result)

    assert sparse.isspmatrix_csr(sparse_result)
    return sparse_result.toarray()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("X", [
    [['def', 1, 55], ['abc', 2, 55]],
    np.array([[10, 1, 55], [5, 2, 55]]),
    np.array([['b', 'A', 'cat'], ['a', 'B', 'cat']], dtype=object)
    ], ids=['mixed', 'numeric', 'object'])
def test_one_hot_encoder(X):
    # Encode the first column, the first two columns, then the full input.
    first_column = np.array(X)[:, [0]]
    Xtr = check_categorical_onehot(first_column)
    assert_allclose(Xtr, [[0, 1], [1, 0]])

    first_two_columns = np.array(X)[:, [0, 1]]
    Xtr = check_categorical_onehot(first_two_columns)
    assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]])

    encoder = OneHotEncoder(categories='auto')
    Xtr = encoder.fit_transform(X)
    assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize('sparse_', [False, True])
@pytest.mark.parametrize('drop', [None, 'first'])
def test_one_hot_encoder_inverse(sparse_, drop):
    # inverse_transform must give back the fitted input, for mixed and
    # numeric data.
    X = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]]
    enc = OneHotEncoder(sparse=sparse_, drop=drop)
    X_tr = enc.fit_transform(X)
    expected = np.array(X, dtype=object)
    assert_array_equal(enc.inverse_transform(X_tr), expected)

    X = [[2, 55], [1, 55], [3, 55]]
    enc = OneHotEncoder(sparse=sparse_, categories='auto',
                        drop=drop)
    X_tr = enc.fit_transform(X)
    expected = np.array(X)
    assert_array_equal(enc.inverse_transform(X_tr), expected)

    if drop is None:
        # with unknown categories
        # drop is incompatible with handle_unknown=ignore
        X = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]]
        enc = OneHotEncoder(sparse=sparse_, handle_unknown='ignore',
                            categories=[['abc', 'def'], [1, 2],
                                        [54, 55, 56]])
        X_tr = enc.fit_transform(X)
        expected = np.array(X, dtype=object)
        expected[2, 1] = None
        assert_array_equal(enc.inverse_transform(X_tr), expected)

        # with an otherwise numerical output, still object if unknown
        X = [[2, 55], [1, 55], [3, 55]]
        enc = OneHotEncoder(sparse=sparse_, categories=[[1, 2], [54, 56]],
                            handle_unknown='ignore')
        X_tr = enc.fit_transform(X)
        expected = np.array(X, dtype=object)
        expected[2, 0] = None
        expected[:, 1] = None
        assert_array_equal(enc.inverse_transform(X_tr), expected)

    # incorrect shape raises
    wrong_shape = np.array([[0, 1, 1], [1, 0, 1]])
    msg = re.escape('Shape of the passed X data is not correct')
    with pytest.raises(ValueError, match=msg):
        enc.inverse_transform(wrong_shape)
|
||||
|
||||
|
||||
def test_one_hot_encoder_inverse_if_binary():
    # With drop='if_binary', inverse_transform still returns the input.
    original = np.array([['Male', 1],
                         ['Female', 3],
                         ['Female', 2]], dtype=object)
    ohe = OneHotEncoder(drop='if_binary', sparse=False)
    encoded = ohe.fit_transform(original)
    decoded = ohe.inverse_transform(encoded)
    assert_array_equal(decoded, original)
|
||||
|
||||
|
||||
# check that resetting drop option without refitting does not throw an error
@pytest.mark.parametrize('drop', ['if_binary', 'first', None])
@pytest.mark.parametrize('reset_drop', ['if_binary', 'first', None])
def test_one_hot_encoder_drop_reset(drop, reset_drop):
    # Changing `drop` through set_params after fit must leave
    # inverse_transform, transform and get_feature_names unchanged.
    X = np.array([['Male', 1],
                  ['Female', 3],
                  ['Female', 2]], dtype=object)
    ohe = OneHotEncoder(drop=drop, sparse=False)
    ohe.fit(X)
    encoded_before = ohe.transform(X)
    names_before = ohe.get_feature_names()

    ohe.set_params(drop=reset_drop)

    assert_array_equal(ohe.inverse_transform(encoded_before), X)
    assert_allclose(ohe.transform(X), encoded_before)
    assert_array_equal(ohe.get_feature_names(), names_before)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
@pytest.mark.parametrize("X", [
    [1, 2],
    np.array([3., 4.])
    ])
def test_X_is_not_1D(X, method):
    # 1D lists and arrays are rejected by fit and fit_transform.
    encoder = OneHotEncoder()
    bound_method = getattr(encoder, method)

    msg = ("Expected 2D array, got 1D array instead")
    with pytest.raises(ValueError, match=msg):
        bound_method(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
def test_X_is_not_1D_pandas(method):
    # A pandas Series is rejected by fit and fit_transform.
    pd = pytest.importorskip('pandas')
    series = pd.Series([6, 3, 4, 6])
    encoder = OneHotEncoder()
    bound_method = getattr(encoder, method)

    msg = ("Expected 2D array, got 1D array instead")
    with pytest.raises(ValueError, match=msg):
        bound_method(series)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("X, cat_exp, cat_dtype", [
    ([['abc', 55], ['def', 55]], [['abc', 'def'], [55]], np.object_),
    (np.array([[1, 2], [3, 2]]), [[1, 3], [2]], np.integer),
    (np.array([['A', 'cat'], ['B', 'cat']], dtype=object),
     [['A', 'B'], ['cat']], np.object_),
    (np.array([['A', 'cat'], ['B', 'cat']]),
     [['A', 'B'], ['cat']], np.str_)
    ], ids=['mixed', 'numeric', 'object', 'string'])
def test_one_hot_encoder_categories(X, cat_exp, cat_dtype):
    # order of categories should not depend on order of samples
    for samples in (X, X[::-1]):
        enc = OneHotEncoder(categories='auto')
        enc.fit(samples)
        assert isinstance(enc.categories_, list)
        for found, wanted in zip(enc.categories_, cat_exp):
            assert found.tolist() == wanted
            assert np.issubdtype(found.dtype, cat_dtype)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("X, X2, cats, cat_dtype", [
    (np.array([['a', 'b']], dtype=object).T,
     np.array([['a', 'd']], dtype=object).T,
     [['a', 'b', 'c']], np.object_),
    (np.array([[1, 2]], dtype='int64').T,
     np.array([[1, 4]], dtype='int64').T,
     [[1, 2, 3]], np.int64),
    (np.array([['a', 'b']], dtype=object).T,
     np.array([['a', 'd']], dtype=object).T,
     [np.array(['a', 'b', 'c'])], np.object_),
    ], ids=['object', 'numeric', 'object-string-cat'])
def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
    # Encoding with user-specified categories.
    enc = OneHotEncoder(categories=cats)
    expected = np.array([[1., 0., 0.],
                         [0., 1., 0.]])
    assert_array_equal(enc.fit_transform(X).toarray(), expected)
    requested = list(cats[0])
    assert list(enc.categories[0]) == requested
    assert enc.categories_[0].tolist() == requested
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert enc.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    strict = OneHotEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        strict.fit(X2)

    lenient = OneHotEncoder(categories=cats, handle_unknown='ignore')
    expected = np.array([[1., 0., 0.], [0., 0., 0.]])
    assert_array_equal(lenient.fit(X2).transform(X2).toarray(), expected)
|
||||
|
||||
|
||||
def test_one_hot_encoder_unsorted_categories():
    """Unsorted user categories: kept for objects, rejected for numbers."""
    data = np.array([['a', 'b']], dtype=object).T
    user_cats = ['b', 'a', 'c']

    encoder = OneHotEncoder(categories=[user_cats])
    expected = np.array([[0., 1., 0.],
                         [1., 0., 0.]])
    assert_array_equal(encoder.fit(data).transform(data).toarray(), expected)
    assert_array_equal(encoder.fit_transform(data).toarray(), expected)
    fitted_cats = encoder.categories_[0]
    assert fitted_cats.tolist() == ['b', 'a', 'c']
    assert np.issubdtype(fitted_cats.dtype, np.object_)

    # numerical categories passed unsorted are still an error
    data = np.array([[1, 2]]).T
    encoder = OneHotEncoder(categories=[[2, 1, 3]])
    expected_msg = 'Unsorted categories are not supported'
    with pytest.raises(ValueError, match=expected_msg):
        encoder.fit_transform(data)
|
||||
|
||||
|
||||
def test_one_hot_encoder_specified_categories_mixed_columns():
    """Explicit categories on an object array mixing strings and ints."""
    # two columns: one with strings, one with integers
    data = np.array([['a', 'b'], [0, 2]], dtype=object).T
    encoder = OneHotEncoder(categories=[['a', 'b', 'c'], [0, 1, 2]])
    expected = np.array([[1., 0., 0., 1., 0., 0.],
                         [0., 1., 0., 0., 0., 1.]])
    assert_array_equal(encoder.fit_transform(data).toarray(), expected)

    str_cats, int_cats = encoder.categories_[0], encoder.categories_[1]
    assert str_cats.tolist() == ['a', 'b', 'c']
    assert np.issubdtype(str_cats.dtype, np.object_)
    assert int_cats.tolist() == [0, 1, 2]
    # the categories are integers, but the data had object dtype
    assert np.issubdtype(int_cats.dtype, np.object_)
|
||||
|
||||
|
||||
def test_one_hot_encoder_pandas():
    """Run the shared one-hot check on a two-column pandas DataFrame."""
    pd = pytest.importorskip('pandas')

    frame = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})
    encoded = check_categorical_onehot(frame)

    assert_allclose(encoded, [[1, 0, 1, 0], [0, 1, 0, 1]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("drop, expected_names",
                         [('first', ['x0_c', 'x2_b']),
                          ('if_binary', ['x0_c', 'x1_2', 'x2_b']),
                          (['c', 2, 'b'], ['x0_b', 'x2_a'])],
                         ids=['first', 'binary', 'manual'])
def test_one_hot_encoder_feature_names_drop(drop, expected_names):
    """``get_feature_names`` must reflect the categories left after drop."""
    samples = [['c', 2, 'a'],
               ['b', 2, 'b']]

    encoder = OneHotEncoder(drop=drop)
    encoder.fit(samples)
    names = encoder.get_feature_names()

    assert isinstance(names, np.ndarray)
    assert_array_equal(expected_names, names)
|
||||
|
||||
|
||||
def test_one_hot_encoder_drop_equals_if_binary():
    """``drop='if_binary'`` only drops a column for two-category features."""
    # Canonical case: first feature has three categories, second has two
    samples = [[10, 'yes'],
               [20, 'no'],
               [30, 'yes']]
    want = np.array([[1., 0., 0., 1.],
                     [0., 1., 0., 0.],
                     [0., 0., 1., 1.]])
    want_drop_idx = np.array([None, 0])

    encoder = OneHotEncoder(drop='if_binary', sparse=False)
    got = encoder.fit_transform(samples)
    assert_array_equal(encoder.drop_idx_, want_drop_idx)
    assert_allclose(got, want)

    # with only one cat, the behaviour is equivalent to drop=None
    samples = [['true', 'a'],
               ['false', 'a'],
               ['false', 'a']]
    want = np.array([[1., 1.],
                     [0., 1.],
                     [0., 1.]])
    want_drop_idx = np.array([0, None])

    encoder = OneHotEncoder(drop='if_binary', sparse=False)
    got = encoder.fit_transform(samples)
    assert_array_equal(encoder.drop_idx_, want_drop_idx)
    assert_allclose(got, want)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
                               np.array([['a', np.nan]], dtype=object).T],
                         ids=['numeric', 'object'])
@pytest.mark.parametrize("as_data_frame", [False, True],
                         ids=['array', 'dataframe'])
@pytest.mark.parametrize("handle_unknown", ['error', 'ignore'])
def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
    """NaN in the input makes fit, fit_transform and transform raise."""
    if as_data_frame:
        pd = pytest.importorskip('pandas')
        X = pd.DataFrame(X)

    nan_msg = "Input contains NaN"
    encoder = OneHotEncoder(categories='auto',
                            handle_unknown=handle_unknown)

    for fit_method in (encoder.fit, encoder.fit_transform):
        with pytest.raises(ValueError, match=nan_msg):
            fit_method(X)

    # fit on the first row only (it holds no NaN), then transform everything
    first_row = X.iloc[:1, :] if as_data_frame else X[:1, :]
    encoder.fit(first_row)

    with pytest.raises(ValueError, match=nan_msg):
        encoder.transform(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("X", [
    [['abc', 2, 55], ['def', 1, 55]],
    np.array([[10, 2, 55], [20, 1, 55]]),
    np.array([['a', 'B', 'cat'], ['b', 'A', 'cat']], dtype=object)
], ids=['mixed', 'numeric', 'object'])
def test_ordinal_encoder(X):
    """Ordinal codes are correct for the default and the int64 dtype."""
    codes = np.array([[0, 1, 0],
                      [1, 0, 0]], dtype='int64')

    # default output dtype: compare against the codes as float64
    default_enc = OrdinalEncoder()
    assert_array_equal(default_enc.fit_transform(X), codes.astype('float64'))

    # explicitly requested integer output
    int_enc = OrdinalEncoder(dtype='int64')
    assert_array_equal(int_enc.fit_transform(X), codes)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("X, X2, cats, cat_dtype", [
    (np.array([['a', 'b']], dtype=object).T,
     np.array([['a', 'd']], dtype=object).T,
     [['a', 'b', 'c']], np.object_),
    (np.array([[1, 2]], dtype='int64').T,
     np.array([[1, 4]], dtype='int64').T,
     [[1, 2, 3]], np.int64),
    (np.array([['a', 'b']], dtype=object).T,
     np.array([['a', 'd']], dtype=object).T,
     [np.array(['a', 'b', 'c'])], np.object_),
], ids=['object', 'numeric', 'object-string-cat'])
def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
    """Check OrdinalEncoder when ``categories`` is given explicitly."""
    first_cats = list(cats[0])

    encoder = OrdinalEncoder(categories=cats)
    expected = np.array([[0.], [1.]])
    assert_array_equal(encoder.fit_transform(X), expected)
    assert list(encoder.categories[0]) == first_cats
    assert encoder.categories_[0].tolist() == first_cats
    # categories coerced from lists should take the dtype of the data
    assert encoder.categories_[0].dtype == cat_dtype

    # with manually specified categories, an unknown category in the
    # training data is already an error at fit time
    encoder = OrdinalEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        encoder.fit(X2)
|
||||
|
||||
|
||||
def test_ordinal_encoder_inverse():
    """``inverse_transform`` round-trips and rejects a wrong column count."""
    samples = [['abc', 2, 55], ['def', 1, 55]]
    encoder = OrdinalEncoder()
    encoded = encoder.fit_transform(samples)
    assert_array_equal(encoder.inverse_transform(encoded),
                       np.array(samples, dtype=object))

    # four columns although the encoder was fitted on three
    wrong_shape = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    expected_msg = re.escape('Shape of the passed X data is not correct')
    with pytest.raises(ValueError, match=expected_msg):
        encoder.inverse_transform(wrong_shape)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
                               np.array([['a', np.nan]], dtype=object).T],
                         ids=['numeric', 'object'])
def test_ordinal_encoder_raise_missing(X):
    """NaN in the input makes fit, fit_transform and transform raise."""
    nan_msg = "Input contains NaN"
    encoder = OrdinalEncoder()

    for fit_method in (encoder.fit, encoder.fit_transform):
        with pytest.raises(ValueError, match=nan_msg):
            fit_method(X)

    # fit on the first row only (it holds no NaN), then transform everything
    encoder.fit(X[:1, :])

    with pytest.raises(ValueError, match=nan_msg):
        encoder.transform(X)
|
||||
|
||||
|
||||
def test_ordinal_encoder_raise_categories_shape():
    """A flat list of categories (not one list per feature) must raise."""
    levels = ['Low', 'Medium', 'High']
    column = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']],
                      dtype=object).T
    encoder = OrdinalEncoder(categories=levels)

    expected_msg = "Shape mismatch: if categories is an array,"
    with pytest.raises(ValueError, match=expected_msg):
        encoder.fit(column)
|
||||
|
||||
|
||||
def test_encoder_dtypes():
    """Learned categories keep the dtype of the data they came from."""
    encoder = OneHotEncoder(categories='auto')
    expected = np.array([[1., 0., 1., 0.], [0., 1., 0., 1.]],
                        dtype='float64')

    typed_inputs = [
        np.array([[1, 2], [3, 4]], dtype='int64'),
        np.array([[1, 2], [3, 4]], dtype='float64'),
        np.array([['a', 'b'], ['c', 'd']]),  # string dtype
        np.array([[1, 'a'], [3, 'b']], dtype='object'),
    ]
    for X in typed_inputs:
        encoder.fit(X)
        assert all(encoder.categories_[col].dtype == X.dtype
                   for col in range(2))
        assert_array_equal(encoder.transform(X).toarray(), expected)

    # plain nested list of integers
    X = [[1, 2], [3, 4]]
    encoder.fit(X)
    assert all(np.issubdtype(encoder.categories_[col].dtype, np.integer)
               for col in range(2))
    assert_array_equal(encoder.transform(X).toarray(), expected)

    # nested list mixing integers and strings
    X = [[1, 'a'], [3, 'b']]
    encoder.fit(X)
    assert all(encoder.categories_[col].dtype == 'object'
               for col in range(2))
    assert_array_equal(encoder.transform(X).toarray(), expected)
|
||||
|
||||
|
||||
def test_encoder_dtypes_pandas():
    """Learned categories keep the dtype of each DataFrame column."""
    # check dtype (similar to test_encoder_dtypes, but for dataframes)
    pd = pytest.importorskip('pandas')

    enc = OneHotEncoder(categories='auto')
    exp = np.array([[1., 0., 1., 0., 1., 0.],
                    [0., 1., 0., 1., 0., 1.]], dtype='float64')

    X = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}, dtype='int64')
    enc.fit(X)
    # the frame has three columns; check all of them (the previous
    # range(2) silently skipped column 'C')
    assert all([enc.categories_[i].dtype == 'int64' for i in range(3)])
    assert_array_equal(enc.transform(X).toarray(), exp)

    # columns of different dtypes: each category array follows its column
    X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]})
    X_type = [X['A'].dtype, X['B'].dtype, X['C'].dtype]
    enc.fit(X)
    assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)])
    assert_array_equal(enc.transform(X).toarray(), exp)
|
||||
|
||||
|
||||
def test_one_hot_encoder_warning():
    """``fit_transform`` on mixed string/int data must not warn."""
    samples = [['Male', 1], ['Female', 3]]
    encoder = OneHotEncoder()
    np.testing.assert_no_warnings(encoder.fit_transform, samples)
|
||||
|
||||
|
||||
def test_one_hot_encoder_drop_manual():
    """Manually dropped categories are removed and restored on inverse."""
    cats_to_drop = ['def', 12, 3, 56]
    samples = [['abc', 12, 2, 55],
               ['def', 12, 1, 55],
               ['def', 12, 3, 56]]
    encoder = OneHotEncoder(drop=cats_to_drop)

    encoded = encoder.fit_transform(samples).toarray()
    expected = [[1, 0, 1, 1],
                [0, 1, 0, 1],
                [0, 0, 0, 0]]
    assert_array_equal(encoded, expected)

    # look up, per feature, the category stored at the dropped index
    dropped = []
    for feature_cats, idx in zip(encoder.categories_, encoder.drop_idx_):
        dropped.append(feature_cats[idx])
    assert_array_equal(dropped, cats_to_drop)

    assert_array_equal(np.array(samples, dtype=object),
                       encoder.inverse_transform(encoded))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "X_fit, params, err_msg",
    [([["Male"], ["Female"]], {'drop': 'second'},
      "Wrong input for parameter `drop`"),
     ([["Male"], ["Female"]], {'drop': 'first', 'handle_unknown': 'ignore'},
      "`handle_unknown` must be 'error'"),
     ([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
      {'drop': np.asarray('b', dtype=object)},
      "Wrong input for parameter `drop`"),
     ([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
      {'drop': ['ghi', 3, 59]},
      "The following categories were supposed")]
)
def test_one_hot_encoder_invalid_params(X_fit, params, err_msg):
    """Invalid constructor parameters are reported as ValueError by fit."""
    encoder = OneHotEncoder(**params)
    with pytest.raises(ValueError, match=err_msg):
        encoder.fit(X_fit)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('drop', [['abc', 3], ['abc', 3, 41, 'a']])
def test_invalid_drop_length(drop):
    """``drop`` lists shorter or longer than the feature count must raise."""
    three_feature_data = [['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]]
    encoder = OneHotEncoder(drop=drop)
    expected_msg = "`drop` should have length equal to the number"
    with pytest.raises(ValueError, match=expected_msg):
        encoder.fit(three_feature_data)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("density", [True, False],
                         ids=['sparse', 'dense'])
@pytest.mark.parametrize("drop", ['first',
                                  ['a', 2, 'b']],
                         ids=['first', 'manual'])
def test_categories(density, drop):
    """``drop`` must not change ``categories_`` and must set ``drop_idx_``."""
    ohe_base = OneHotEncoder(sparse=density)
    ohe_test = OneHotEncoder(sparse=density, drop=drop)
    X = [['c', 1, 'a'],
         ['a', 2, 'b']]
    ohe_base.fit(X)
    ohe_test.fit(X)
    assert_array_equal(ohe_base.categories_, ohe_test.categories_)
    if drop == 'first':
        # the first category of every feature is the dropped one
        assert_array_equal(ohe_test.drop_idx_, 0)
    else:
        # each drop index must point at the category requested for removal
        for drop_cat, drop_idx, cat_list in zip(drop,
                                                ohe_test.drop_idx_,
                                                ohe_test.categories_):
            assert cat_list[int(drop_idx)] == drop_cat
    assert isinstance(ohe_test.drop_idx_, np.ndarray)
    # np.object_ rather than the np.object alias, which was deprecated in
    # NumPy 1.20 and removed in NumPy 1.24
    assert ohe_test.drop_idx_.dtype == np.object_
|
||||
|
||||
|
||||
@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
def test_encoders_has_categorical_tags(Encoder):
    """Both encoders list 'categorical' among their ``X_types`` tags."""
    x_types = Encoder()._get_tags()['X_types']
    assert 'categorical' in x_types
|
||||
|
||||
|
||||
@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
def test_encoders_does_not_support_none_values(Encoder):
    """Fitting on data that contains ``None`` raises a TypeError."""
    samples = [["a"], [None]]
    expected_msg = ("Encoders require their input to be "
                    "uniformly strings or numbers.")
    with pytest.raises(TypeError, match=expected_msg):
        Encoder().fit(samples)
|
||||
|
|
@ -0,0 +1,160 @@
|
|||
import pytest
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
|
||||
from sklearn.preprocessing import FunctionTransformer
|
||||
from sklearn.utils._testing import (assert_array_equal,
|
||||
assert_allclose_dense_sparse)
|
||||
from sklearn.utils._testing import assert_warns_message, assert_no_warnings
|
||||
|
||||
|
||||
def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X):
|
||||
def _func(X, *args, **kwargs):
|
||||
args_store.append(X)
|
||||
args_store.extend(args)
|
||||
kwargs_store.update(kwargs)
|
||||
return func(X)
|
||||
|
||||
return _func
|
||||
|
||||
|
||||
def test_delegate_to_func():
    """FunctionTransformer must call the wrapped function with X only."""
    # The two stores collect the positional and keyword arguments that the
    # FunctionTransformer hands to the wrapped function.
    args_store = []
    kwargs_store = {}
    X = np.arange(10).reshape((5, 2))

    recorder = _make_func(args_store, kwargs_store)
    assert_array_equal(
        FunctionTransformer(recorder).transform(X),
        X, 'transform should have returned X unchanged',
    )

    # Only X may have reached the function.
    assert args_store == [X], (
        'Incorrect positional arguments passed to '
        'func: {args}'.format(args=args_store))
    assert not kwargs_store, (
        'Unexpected keyword arguments passed to '
        'func: {args}'.format(args=kwargs_store))

    # Empty both stores and run the same check a second time.
    del args_store[:]
    kwargs_store.clear()
    transformer = FunctionTransformer(_make_func(args_store, kwargs_store))
    transformed = transformer.transform(X)

    assert_array_equal(transformed, X,
                       err_msg='transform should have returned X unchanged')

    # Again, only X may have reached the function.
    assert args_store == [X], (
        'Incorrect positional arguments passed '
        'to func: {args}'.format(args=args_store))
    assert not kwargs_store, (
        'Unexpected keyword arguments passed to '
        'func: {args}'.format(args=kwargs_store))
|
||||
|
||||
|
||||
def test_np_log():
    """Wrapping ``np.log1p`` gives the same result as calling it directly."""
    data = np.arange(10).reshape((5, 2))

    # Test that the numpy.log example still works.
    transformed = FunctionTransformer(np.log1p).transform(data)
    assert_array_equal(transformed, np.log1p(data))
|
||||
|
||||
|
||||
def test_kw_arg():
    """``kw_args`` are forwarded to the wrapped function."""
    data = np.linspace(0, 1, num=10).reshape((5, 2))
    rounder = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    # the output must be rounded to the requested three decimals
    expected = np.around(data, decimals=3)
    assert_array_equal(rounder.transform(data), expected)
|
||||
|
||||
|
||||
def test_kw_arg_update():
    """Mutating ``kw_args`` in place affects the next ``transform``."""
    data = np.linspace(0, 1, num=10).reshape((5, 2))
    rounder = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    # change the stored keyword argument after construction
    rounder.kw_args['decimals'] = 1

    expected = np.around(data, decimals=1)
    assert_array_equal(rounder.transform(data), expected)
|
||||
|
||||
|
||||
def test_kw_arg_reset():
    """Replacing ``kw_args`` entirely affects the next ``transform``."""
    data = np.linspace(0, 1, num=10).reshape((5, 2))
    rounder = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    # swap in a brand new keyword-argument dict after construction
    rounder.kw_args = dict(decimals=1)

    expected = np.around(data, decimals=1)
    assert_array_equal(rounder.transform(data), expected)
|
||||
|
||||
|
||||
def test_inverse_transform():
    """``inverse_transform`` applies ``inverse_func`` with ``inv_kw_args``."""
    data = np.array([1, 4, 9, 16]).reshape((2, 2))

    # sqrt forwards, rounding to three decimals backwards
    transformer = FunctionTransformer(
        func=np.sqrt,
        inverse_func=np.around,
        inv_kw_args=dict(decimals=3),
    )
    round_trip = transformer.inverse_transform(transformer.transform(data))
    expected = np.around(np.sqrt(data), decimals=3)
    assert_array_equal(round_trip, expected)
|
||||
|
||||
|
||||
def test_check_inverse():
    """``check_inverse=True`` warns only when func/inverse_func mismatch."""
    X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
    not_inverse_msg = ("The provided functions are not strictly"
                       " inverse of each other. If you are sure you"
                       " want to proceed regardless, set"
                       " 'check_inverse=False'.")

    for X in (X_dense,
              sparse.csr_matrix(X_dense),
              sparse.csc_matrix(X_dense)):
        accept_sparse = True if sparse.issparse(X) else False

        # sqrt / around are not inverses of each other: fit must warn
        trans = FunctionTransformer(func=np.sqrt,
                                    inverse_func=np.around,
                                    accept_sparse=accept_sparse,
                                    check_inverse=True,
                                    validate=True)
        assert_warns_message(UserWarning, not_inverse_msg, trans.fit, X)

        # expm1 / log1p are inverses: no warning and a clean round trip
        trans = FunctionTransformer(func=np.expm1,
                                    inverse_func=np.log1p,
                                    accept_sparse=accept_sparse,
                                    check_inverse=True,
                                    validate=True)
        Xt = assert_no_warnings(trans.fit_transform, X)
        assert_allclose_dense_sparse(X, trans.inverse_transform(Xt))

    # check that we don't check inverse when one of the func or inverse is not
    # provided.
    for func, inverse_func in ((np.expm1, None), (None, np.expm1)):
        trans = FunctionTransformer(func=func, inverse_func=inverse_func,
                                    check_inverse=True, validate=True)
        assert_no_warnings(trans.fit, X_dense)
|
||||
|
||||
|
||||
def test_function_transformer_frame():
    """The default FunctionTransformer output still exposes ``.loc``."""
    pd = pytest.importorskip('pandas')
    frame = pd.DataFrame(np.random.randn(100, 10))
    transformed = FunctionTransformer().fit_transform(frame)
    # having a ``loc`` attribute is what this test checks on the output
    assert hasattr(transformed, 'loc')
|
||||
656
venv/Lib/site-packages/sklearn/preprocessing/tests/test_label.py
Normal file
656
venv/Lib/site-packages/sklearn/preprocessing/tests/test_label.py
Normal file
|
|
@ -0,0 +1,656 @@
|
|||
import numpy as np
|
||||
|
||||
import pytest
|
||||
|
||||
from scipy.sparse import issparse
|
||||
from scipy.sparse import coo_matrix
|
||||
from scipy.sparse import csc_matrix
|
||||
from scipy.sparse import csr_matrix
|
||||
from scipy.sparse import dok_matrix
|
||||
from scipy.sparse import lil_matrix
|
||||
|
||||
from sklearn.utils.multiclass import type_of_target
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_warns_message
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
from sklearn.utils import _to_object_array
|
||||
|
||||
from sklearn.preprocessing._label import LabelBinarizer
|
||||
from sklearn.preprocessing._label import MultiLabelBinarizer
|
||||
from sklearn.preprocessing._label import LabelEncoder
|
||||
from sklearn.preprocessing._label import label_binarize
|
||||
|
||||
from sklearn.preprocessing._label import _inverse_binarize_thresholding
|
||||
from sklearn.preprocessing._label import _inverse_binarize_multiclass
|
||||
from sklearn.preprocessing._label import _encode
|
||||
|
||||
from sklearn import datasets
|
||||
|
||||
# Iris dataset, loaded once when this module is imported.
iris = datasets.load_iris()
|
||||
|
||||
|
||||
def toarray(a):
    """Return ``a.toarray()`` if ``a`` has that method, otherwise ``a``."""
    return a.toarray() if hasattr(a, "toarray") else a
|
||||
|
||||
|
||||
def test_label_binarizer():
    """LabelBinarizer on one-class, two-class and multi-class labels."""
    # one-class case defaults to negative label
    labels = ["pos", "pos", "pos", "pos"]
    want = np.array([[0, 0, 0, 0]]).T

    # dense output
    binarizer = LabelBinarizer(sparse_output=False)
    out = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ["pos"])
    assert_array_equal(want, out)
    assert_array_equal(binarizer.inverse_transform(out), labels)

    # sparse output
    binarizer = LabelBinarizer(sparse_output=True)
    out = binarizer.fit_transform(labels)
    assert issparse(out)
    assert_array_equal(binarizer.classes_, ["pos"])
    assert_array_equal(want, out.toarray())
    assert_array_equal(binarizer.inverse_transform(out.toarray()), labels)

    binarizer = LabelBinarizer(sparse_output=False)

    # two-class case
    labels = ["neg", "pos", "pos", "neg"]
    want = np.array([[0, 1, 1, 0]]).T
    out = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ["neg", "pos"])
    assert_array_equal(want, out)

    # a two-column indicator matrix can also be inverted
    two_columns = np.array([[1, 0],
                            [0, 1],
                            [0, 1],
                            [1, 0]])
    assert_array_equal(binarizer.inverse_transform(two_columns), labels)

    # multi-class case
    labels = ["spam", "ham", "eggs", "ham", "0"]
    want = np.array([[0, 0, 0, 1],
                     [0, 0, 1, 0],
                     [0, 1, 0, 0],
                     [0, 0, 1, 0],
                     [1, 0, 0, 0]])
    out = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_array_equal(want, out)
    assert_array_equal(binarizer.inverse_transform(out), labels)
|
||||
|
||||
|
||||
def test_label_binarizer_unseen_labels():
    """Labels not seen during fit are encoded as all-zero rows."""
    binarizer = LabelBinarizer()

    fitted = binarizer.fit_transform(['b', 'd', 'e'])
    assert_array_equal(np.array([[1, 0, 0],
                                 [0, 1, 0],
                                 [0, 0, 1]]), fitted)

    # 'a', 'c' and 'f' were not part of the fitted labels
    transformed = binarizer.transform(['a', 'b', 'c', 'd', 'e', 'f'])
    assert_array_equal(np.array([[0, 0, 0],
                                 [1, 0, 0],
                                 [0, 0, 0],
                                 [0, 1, 0],
                                 [0, 0, 1],
                                 [0, 0, 0]]), transformed)
|
||||
|
||||
|
||||
def test_label_binarizer_set_label_encoding():
    """Custom ``neg_label``/``pos_label`` are used and can be inverted."""
    # two-class case with pos_label=0
    binarizer = LabelBinarizer(neg_label=-2, pos_label=0)
    labels = np.array([0, 1, 1, 0])
    want = np.array([[-2, 0, 0, -2]]).T
    out = binarizer.fit_transform(labels)
    assert_array_equal(want, out)
    assert_array_equal(binarizer.inverse_transform(out), labels)

    # multi-class case
    binarizer = LabelBinarizer(neg_label=-2, pos_label=2)
    labels = np.array([3, 2, 1, 2, 0])
    want = np.array([[-2, -2, -2, +2],
                     [-2, -2, +2, -2],
                     [-2, +2, -2, -2],
                     [-2, -2, +2, -2],
                     [+2, -2, -2, -2]])
    out = binarizer.fit_transform(labels)
    assert_array_equal(want, out)
    assert_array_equal(binarizer.inverse_transform(out), labels)
|
||||
|
||||
|
||||
@ignore_warnings
def test_label_binarizer_errors():
    """Invalid arguments to the label binarizing tools raise ValueError."""
    # Check that invalid arguments yield ValueError
    single_class = np.array([0, 0, 0, 0])
    binarizer = LabelBinarizer().fit(single_class)

    multilabel_input = [(2, 3), (0,), (0, 2)]
    with pytest.raises(ValueError):
        binarizer.transform(multilabel_input)

    # an unfitted binarizer, called with empty input
    binarizer = LabelBinarizer()
    with pytest.raises(ValueError):
        binarizer.transform([])
    with pytest.raises(ValueError):
        binarizer.inverse_transform([])

    # invalid neg_label / pos_label combinations
    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=2, pos_label=1)
    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=2, pos_label=2)

    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)

    # Fail on y_type
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo", classes=[1, 2], threshold=0)

    # Sequence of seq type should raise ValueError
    seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    with pytest.raises(ValueError):
        LabelBinarizer().fit_transform(seq_of_seqs)

    # Fail on the number of classes
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo", classes=[1, 2, 3], threshold=0)

    # Fail on the dimension of 'binary'
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]),
            output_type="binary", classes=[1, 2, 3], threshold=0)

    # Fail on multioutput data
    multioutput = [[1, 3], [2, 1]]
    with pytest.raises(ValueError):
        LabelBinarizer().fit(np.array(multioutput))
    with pytest.raises(ValueError):
        label_binarize(np.array(multioutput), classes=[1, 2, 3])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values, classes, unknown",
    [(np.array([2, 1, 3, 1, 3], dtype='int64'),
      np.array([1, 2, 3], dtype='int64'), np.array([4], dtype='int64')),
     (np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
      np.array(['a', 'b', 'c'], dtype=object),
      np.array(['d'], dtype=object)),
     (np.array(['b', 'a', 'c', 'a', 'c']),
      np.array(['a', 'b', 'c']), np.array(['d']))],
    ids=['int64', 'object', 'str'])
def test_label_encoder(values, classes, unknown):
    """Exercise fit, transform, fit_transform and inverse_transform."""
    codes = [1, 0, 2, 0, 2]

    encoder = LabelEncoder()
    encoder.fit(values)
    assert_array_equal(encoder.classes_, classes)
    assert_array_equal(encoder.transform(values), codes)
    assert_array_equal(encoder.inverse_transform(codes), values)

    # fit_transform on a fresh encoder gives the same codes
    encoder = LabelEncoder()
    assert_array_equal(encoder.fit_transform(values), codes)

    # labels that were not fitted are rejected
    with pytest.raises(ValueError, match="unseen labels"):
        encoder.transform(unknown)
|
||||
|
||||
|
||||
def test_label_encoder_negative_ints():
    """Negative integer labels are encoded like any other label."""
    encoder = LabelEncoder()
    encoder.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(encoder.classes_, [-1, 0, 1, 4, 5])

    labels = [0, 1, 4, 4, 5, -1, -1]
    codes = [1, 2, 3, 3, 4, 0, 0]
    assert_array_equal(encoder.transform(labels), codes)
    assert_array_equal(encoder.inverse_transform(codes), labels)

    # 6 was never fitted
    with pytest.raises(ValueError):
        encoder.transform([0, 6])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ['str', 'object'])
def test_label_encoder_str_bad_shape(dtype):
    """Transforming a bare string (not a 1d array) raises ValueError."""
    fruits = np.array(["apple", "orange"], dtype=dtype)
    encoder = LabelEncoder()
    encoder.fit(fruits)
    expected_msg = "should be a 1d array"
    with pytest.raises(ValueError, match=expected_msg):
        encoder.transform("apple")
|
||||
|
||||
|
||||
def test_label_encoder_errors():
    """Invalid arguments to LabelEncoder methods raise ValueError."""
    # Check that invalid arguments yield ValueError
    unfitted = LabelEncoder()
    with pytest.raises(ValueError):
        unfitted.transform([])
    with pytest.raises(ValueError):
        unfitted.inverse_transform([])

    # Fail on unseen labels
    encoder = LabelEncoder()
    encoder.fit([1, 2, 3, -1, 1])
    unseen_msg = "contains previously unseen labels"
    for unseen_codes in ([-2], [-2, -3, -4]):
        with pytest.raises(ValueError, match=unseen_msg):
            encoder.inverse_transform(unseen_codes)

    # Fail on inverse_transform("")
    shape_msg = r"should be a 1d array.+shape \(\)"
    with pytest.raises(ValueError, match=shape_msg):
        encoder.inverse_transform("")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values",
    [np.array([2, 1, 3, 1, 3], dtype='int64'),
     np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
     np.array(['b', 'a', 'c', 'a', 'c'])],
    ids=['int64', 'object', 'str'])
def test_label_encoder_empty_array(values):
    """A fitted encoder maps empty input to empty output in both directions."""
    encoder = LabelEncoder()
    encoder.fit(values)
    empty = np.array([])

    # test empty transform
    assert_array_equal(empty, encoder.transform([]))

    # test empty inverse transform
    assert_array_equal(empty, encoder.inverse_transform([]))
|
||||
|
||||
|
||||
def test_sparse_output_multilabel_binarizer():
    """MultiLabelBinarizer with sparse and dense output on iterables."""
    # test input as iterable of iterables
    input_factories = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    indicator_mat = np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 1, 0]])

    inverse = input_factories[0]()
    for sparse_output in (True, False):
        for make_input in input_factories:
            # With fit_transform
            mlb = MultiLabelBinarizer(sparse_output=sparse_output)
            out = mlb.fit_transform(make_input())
            assert issparse(out) == sparse_output
            if sparse_output:
                # verify CSR assumption that indices and indptr have same dtype
                assert out.indices.dtype == out.indptr.dtype
                out = out.toarray()
            assert_array_equal(indicator_mat, out)
            assert_array_equal([1, 2, 3], mlb.classes_)
            assert mlb.inverse_transform(out) == inverse

            # With fit
            mlb = MultiLabelBinarizer(sparse_output=sparse_output)
            out = mlb.fit(make_input()).transform(make_input())
            assert issparse(out) == sparse_output
            if sparse_output:
                # verify CSR assumption that indices and indptr have same dtype
                assert out.indices.dtype == out.indptr.dtype
                out = out.toarray()
            assert_array_equal(indicator_mat, out)
            assert_array_equal([1, 2, 3], mlb.classes_)
            assert mlb.inverse_transform(out) == inverse

    # a matrix holding a value other than 0 or 1 cannot be inverted
    not_binary = csr_matrix(np.array([[0, 1, 1],
                                      [2, 0, 0],
                                      [1, 1, 0]]))
    with pytest.raises(ValueError):
        mlb.inverse_transform(not_binary)
|
||||
|
||||
|
||||
def test_multilabel_binarizer():
    """MultiLabelBinarizer with default settings on several iterables."""
    # test input as iterable of iterables
    input_factories = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    indicator_mat = np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 1, 0]])
    inverse = input_factories[0]()
    for make_input in input_factories:
        # With fit_transform
        mlb = MultiLabelBinarizer()
        out = mlb.fit_transform(make_input())
        assert_array_equal(indicator_mat, out)
        assert_array_equal([1, 2, 3], mlb.classes_)
        assert mlb.inverse_transform(out) == inverse

        # With fit
        mlb = MultiLabelBinarizer()
        out = mlb.fit(make_input()).transform(make_input())
        assert_array_equal(indicator_mat, out)
        assert_array_equal([1, 2, 3], mlb.classes_)
        assert mlb.inverse_transform(out) == inverse
|
||||
|
||||
|
||||
def test_multilabel_binarizer_empty_sample():
    """A sample without any label becomes an all-zero row."""
    label_sets = [[1, 2], [1], []]
    expected = np.array([[1, 1],
                         [1, 0],
                         [0, 0]])
    binarizer = MultiLabelBinarizer()
    assert_array_equal(binarizer.fit_transform(label_sets), expected)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_unknown_class():
    """Unknown classes in ``transform`` trigger a warning and are ignored."""
    y = [[1, 2]]
    to_transform = [[4, 1], [2, 0]]
    warning_msg = 'unknown class(es) [0, 4] will be ignored'

    # classes learned from the data
    mlb = MultiLabelBinarizer()
    expected = np.array([[1, 0], [0, 1]])
    matrix = assert_warns_message(UserWarning, warning_msg,
                                  mlb.fit(y).transform, to_transform)
    assert_array_equal(matrix, expected)

    # classes given explicitly
    expected = np.array([[1, 0, 0], [0, 1, 0]])
    mlb = MultiLabelBinarizer(classes=[1, 2, 3])
    matrix = assert_warns_message(UserWarning, warning_msg,
                                  mlb.fit(y).transform, to_transform)
    assert_array_equal(matrix, expected)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_given_classes():
    samples = [(2, 3), (1,), (1, 2)]
    # Columns follow the user-supplied class order [1, 3, 2].
    expected = np.array([[0, 1, 1],
                         [1, 0, 0],
                         [1, 0, 1]])

    # fit_transform()
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2])
    binarized = binarizer.fit_transform(samples)
    assert_array_equal(binarized, expected)
    assert_array_equal(binarizer.classes_, [1, 3, 2])

    # fit().transform()
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2])
    binarized = binarizer.fit(samples).transform(samples)
    assert_array_equal(binarized, expected)
    assert_array_equal(binarizer.classes_, [1, 3, 2])

    # ensure works with extra class: it contributes a leading zero column
    binarizer = MultiLabelBinarizer(classes=[4, 1, 3, 2])
    zero_column = [[0], [0], [0]]
    assert_array_equal(binarizer.fit_transform(samples),
                       np.hstack((zero_column, expected)))
    assert_array_equal(binarizer.classes_, [4, 1, 3, 2])

    # ensure fit is no-op as iterable is not consumed
    samples = iter(samples)
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(binarizer.fit(samples).transform(samples), expected)

    # ensure a ValueError is thrown if given duplicate classes
    err_msg = ("The classes argument contains duplicate classes. Remove "
               "these duplicates before passing them to MultiLabelBinarizer.")
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2, 3])
    with pytest.raises(ValueError, match=err_msg):
        binarizer.fit(samples)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_multiple_calls():
    samples = [(2, 3), (1,), (1, 2)]
    # Expected output for the class order [1, 3, 2]
    expected_first = np.array([[0, 1, 1],
                               [1, 0, 0],
                               [1, 0, 1]])
    # Expected output for the class order [1, 2, 3]
    expected_second = np.array([[0, 1, 1],
                                [1, 0, 0],
                                [1, 1, 0]])

    # first call
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2])
    first = binarizer.fit_transform(samples)
    assert_array_equal(first, expected_first)

    # second call, after changing the ``classes`` parameter in place
    binarizer.classes = [1, 2, 3]
    second = binarizer.fit_transform(samples)
    assert_array_equal(second, expected_second)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_same_length_sequence():
    # Ensure sequences of the same length are not interpreted as a 2-d array
    samples = [[1], [0], [2]]
    expected = np.array([[0, 1, 0],
                         [1, 0, 0],
                         [0, 0, 1]])

    # Exercise both code paths: fit_transform() and fit().transform()
    strategies = (
        lambda est, data: est.fit_transform(data),
        lambda est, data: est.fit(data).transform(data),
    )
    for binarize in strategies:
        binarizer = MultiLabelBinarizer()
        assert_array_equal(binarize(binarizer, samples), expected)
        assert_array_equal(binarizer.inverse_transform(expected), samples)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_non_integer_labels():
    tuple_classes = _to_object_array([(1,), (2,), (3,)])
    # Each case pairs the samples with the classes expected after fitting.
    cases = [
        ([('2', '3'), ('1',), ('1', '2')], ['1', '2', '3']),
        ([('b', 'c'), ('a',), ('a', 'b')], ['a', 'b', 'c']),
        ([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes),
    ]
    expected = np.array([[0, 1, 1],
                         [1, 0, 0],
                         [1, 1, 0]])

    # Exercise both code paths: fit_transform() and fit().transform()
    strategies = (
        lambda est, data: est.fit_transform(data),
        lambda est, data: est.fit(data).transform(data),
    )
    for samples, expected_classes in cases:
        for binarize in strategies:
            binarizer = MultiLabelBinarizer()
            assert_array_equal(binarize(binarizer, samples), expected)
            assert_array_equal(binarizer.classes_, expected_classes)
            assert_array_equal(binarizer.inverse_transform(expected),
                               samples)

    # dict labels are expected to be rejected with a TypeError
    binarizer = MultiLabelBinarizer()
    with pytest.raises(TypeError):
        binarizer.fit_transform([({}), ({}, {'a': 'b'})])
|
||||
|
||||
|
||||
def test_multilabel_binarizer_non_unique():
    # A label repeated within one sample still yields a single 1 in its column.
    samples = [(1, 1, 1, 0)]
    expected = np.array([[1, 1]])
    binarized = MultiLabelBinarizer().fit_transform(samples)
    assert_array_equal(binarized, expected)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_inverse_validation():
    binarizer = MultiLabelBinarizer()
    binarizer.fit_transform([(1, 1, 1, 0)])

    # Not binary
    with pytest.raises(ValueError):
        binarizer.inverse_transform(np.array([[1, 3]]))

    # The following binary cases are fine, however
    for row in ([0, 0], [1, 1], [1, 0]):
        binarizer.inverse_transform(np.array([row]))

    # Wrong shape: too few, then too many, columns for the two fitted classes
    for row in ([1], [1, 1, 1]):
        with pytest.raises(ValueError):
            binarizer.inverse_transform(np.array([row]))
|
||||
|
||||
|
||||
def test_label_binarize_with_class_order():
    # Each case is (y, classes, expected): the hot column of a row sits at
    # the position of the label inside ``classes``.
    cases = [
        ([1, 6], [1, 2, 4, 6],
         [[1, 0, 0, 0], [0, 0, 0, 1]]),
        # Modified class order
        ([1, 6], [1, 6, 4, 2],
         [[1, 0, 0, 0], [0, 1, 0, 0]]),
        ([0, 1, 2, 3], [3, 2, 0, 1],
         [[0, 0, 1, 0],
          [0, 0, 0, 1],
          [0, 1, 0, 0],
          [1, 0, 0, 0]]),
    ]
    for y, classes, expected in cases:
        binarized = label_binarize(y, classes=classes)
        assert_array_equal(binarized, np.array(expected))
|
||||
|
||||
|
||||
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Check label_binarize and LabelBinarizer against ``expected``.

    Both sparse and dense outputs are exercised. For each one the binarized
    result is compared with ``expected`` and then inverted and compared
    with ``y``. Sparse output is expected to raise a ValueError when
    ``pos_label`` is 0 or ``neg_label`` is non-zero.
    """
    sparse_is_rejected = pos_label == 0 or neg_label != 0

    for sparse_output in (True, False):
        if sparse_output and sparse_is_rejected:
            with pytest.raises(ValueError):
                label_binarize(y, classes=classes, neg_label=neg_label,
                               pos_label=pos_label,
                               sparse_output=sparse_output)
            continue

        # check label_binarize
        result = label_binarize(y, classes=classes, neg_label=neg_label,
                                pos_label=pos_label,
                                sparse_output=sparse_output)
        assert_array_equal(toarray(result), expected)
        assert issparse(result) == sparse_output

        # check inverse
        target_type = type_of_target(y)
        if target_type == "multiclass":
            recovered = _inverse_binarize_multiclass(result, classes=classes)
        else:
            # Threshold halfway between the negative and positive labels
            midpoint = (neg_label + pos_label) / 2.
            recovered = _inverse_binarize_thresholding(
                result, output_type=target_type, classes=classes,
                threshold=midpoint)
        assert_array_equal(toarray(recovered), toarray(y))

        # Check label binarizer
        binarizer = LabelBinarizer(neg_label=neg_label, pos_label=pos_label,
                                   sparse_output=sparse_output)
        result = binarizer.fit_transform(y)
        assert_array_equal(toarray(result), expected)
        assert issparse(result) == sparse_output

        round_trip = binarizer.inverse_transform(result)
        assert_array_equal(toarray(round_trip), toarray(y))
        assert issparse(round_trip) == issparse(y)
|
||||
|
||||
|
||||
def test_label_binarize_binary():
    # Binary targets give a single column: pos_label where y is the second
    # class, neg_label elsewhere.
    label_pairs = [
        # neg_label is non-zero here
        (2, -1),
        # Binary case where sparse_output = True will not result in a
        # ValueError
        (3, 0),
    ]
    for pos_label, neg_label in label_pairs:
        y = [0, 1, 0]
        classes = [0, 1]
        expected = np.array([[neg_label], [pos_label], [neg_label]])
        check_binarized_results(y, classes, pos_label, neg_label, expected)
|
||||
|
||||
|
||||
def test_label_binarize_multiclass():
    classes = [0, 1, 2]
    y = [0, 1, 2]
    neg_label, pos_label = 0, 2
    # One pos_label per row, on the diagonal
    expected = pos_label * np.eye(len(classes))

    check_binarized_results(y, classes, pos_label, neg_label, expected)

    # Sparse output with a non-zero neg_label must be rejected
    with pytest.raises(ValueError):
        label_binarize(y, classes=classes, neg_label=-1,
                       pos_label=pos_label, sparse_output=True)
|
||||
|
||||
|
||||
def test_label_binarize_multilabel():
    y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
    classes = [0, 1, 2]
    neg_label, pos_label = 0, 2
    expected = pos_label * y_ind

    # The dense indicator matrix followed by each sparse representation
    sparse_formats = [coo_matrix, csc_matrix, csr_matrix,
                      dok_matrix, lil_matrix]
    targets = [y_ind]
    targets.extend(to_sparse(y_ind) for to_sparse in sparse_formats)

    for target in targets:
        check_binarized_results(target, classes, pos_label, neg_label,
                                expected)

    # Sparse output with a non-zero neg_label must be rejected; the check
    # runs on the last target of the list (the lil_matrix one).
    with pytest.raises(ValueError):
        label_binarize(targets[-1], classes=classes, neg_label=-1,
                       pos_label=pos_label, sparse_output=True)
|
||||
|
||||
|
||||
def test_invalid_input_label_binarize():
    # neg_label greater than pos_label
    with pytest.raises(ValueError):
        label_binarize([0, 2], classes=[0, 2], neg_label=1, pos_label=0)

    # continuous targets
    continuous_msg = "continuous target data is not "
    with pytest.raises(ValueError, match=continuous_msg):
        label_binarize([1.2, 2.7], classes=[0, 1])

    # 2-d target whose width differs from the number of classes
    mismatch_msg = "mismatch with the labels"
    with pytest.raises(ValueError, match=mismatch_msg):
        label_binarize([[1, 3]], classes=[1, 2, 3])
|
||||
|
||||
|
||||
def test_inverse_binarize_multiclass():
    # Sparse score matrix containing a row of negatives and zeros, and an
    # all-zero row.
    scores = csr_matrix([[0, 1, 0],
                         [-1, 0, -1],
                         [0, 0, 0]])
    classes = np.arange(3)
    recovered = _inverse_binarize_multiclass(scores, classes)
    assert_array_equal(recovered, np.array([1, 1, 0]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values, expected",
    [
        pytest.param(np.array([2, 1, 3, 1, 3], dtype='int64'),
                     np.array([1, 2, 3], dtype='int64'),
                     id='int64'),
        pytest.param(np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
                     np.array(['a', 'b', 'c'], dtype=object),
                     id='object'),
        pytest.param(np.array(['b', 'a', 'c', 'a', 'c']),
                     np.array(['a', 'b', 'c']),
                     id='str'),
    ])
def test_encode_util(values, expected):
    # Position of every value within the sorted uniques
    expected_codes = np.array([1, 0, 2, 0, 2])

    # Uniques only
    uniques = _encode(values)
    assert_array_equal(uniques, expected)

    # Uniques together with the integer codes
    uniques, codes = _encode(values, encode=True)
    assert_array_equal(uniques, expected)
    assert_array_equal(codes, expected_codes)

    # Codes computed against uniques supplied by the caller
    _, codes = _encode(values, uniques, encode=True)
    assert_array_equal(codes, expected_codes)
|
||||
|
||||
|
||||
def test_encode_check_unknown():
    # test for the check_unknown parameter of _encode()
    unseen_msg = 'y contains previously unseen labels'

    known = np.array([1, 2, 3])
    observed = np.array([1, 2, 3, 4])

    # With check_unknown=True an unseen value raises an error
    with pytest.raises(ValueError, match=unseen_msg):
        _encode(observed, known, encode=True, check_unknown=True)

    # Don't raise an error if False
    _encode(observed, known, encode=True, check_unknown=False)

    # parameter is ignored for object dtype
    known = np.array(['a', 'b', 'c'], dtype=object)
    observed = np.array(['a', 'b', 'c', 'd'], dtype=object)
    with pytest.raises(ValueError, match=unseen_msg):
        _encode(observed, known, encode=True, check_unknown=False)
|
||||
Loading…
Add table
Add a link
Reference in a new issue