Uploaded Test files

This commit is contained in:
Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions

View file

@ -0,0 +1,158 @@
import warnings
import pytest
import numpy as np
from scipy import sparse
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.base import clone
from sklearn.preprocessing import maxabs_scale
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import scale
from sklearn.preprocessing import power_transform
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import robust_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose
iris = load_iris()
def _get_valid_samples_by_column(X, col):
"""Get non NaN samples in column of X"""
return X[:, [col]][~np.isnan(X[:, col])]
@pytest.mark.parametrize(
"est, func, support_sparse, strictly_positive",
[(MaxAbsScaler(), maxabs_scale, True, False),
(MinMaxScaler(), minmax_scale, False, False),
(StandardScaler(), scale, False, False),
(StandardScaler(with_mean=False), scale, True, False),
(PowerTransformer('yeo-johnson'), power_transform, False, False),
(PowerTransformer('box-cox'), power_transform, False, True),
(QuantileTransformer(n_quantiles=10), quantile_transform, True, False),
(RobustScaler(), robust_scale, False, False),
(RobustScaler(with_centering=False), robust_scale, True, False)]
)
def test_missing_value_handling(est, func, support_sparse, strictly_positive):
# check that the preprocessing method let pass nan
rng = np.random.RandomState(42)
X = iris.data.copy()
n_missing = 50
X[rng.randint(X.shape[0], size=n_missing),
rng.randint(X.shape[1], size=n_missing)] = np.nan
if strictly_positive:
X += np.nanmin(X) + 0.1
X_train, X_test = train_test_split(X, random_state=1)
# sanity check
assert not np.all(np.isnan(X_train), axis=0).any()
assert np.any(np.isnan(X_train), axis=0).all()
assert np.any(np.isnan(X_test), axis=0).all()
X_test[:, 0] = np.nan # make sure this boundary case is tested
with pytest.warns(None) as records:
Xt = est.fit(X_train).transform(X_test)
# ensure no warnings are raised
assert len(records) == 0
# missing values should still be missing, and only them
assert_array_equal(np.isnan(Xt), np.isnan(X_test))
# check that the function leads to the same results as the class
with pytest.warns(None) as records:
Xt_class = est.transform(X_train)
assert len(records) == 0
Xt_func = func(X_train, **est.get_params())
assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)])
# check that the inverse transform keep NaN
Xt_inv = est.inverse_transform(Xt)
assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
# FIXME: we can introduce equal_nan=True in recent version of numpy.
# For the moment which just check that non-NaN values are almost equal.
assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])
for i in range(X.shape[1]):
# train only on non-NaN
est.fit(_get_valid_samples_by_column(X_train, i))
# check transforming with NaN works even when training without NaN
with pytest.warns(None) as records:
Xt_col = est.transform(X_test[:, [i]])
assert len(records) == 0
assert_allclose(Xt_col, Xt[:, [i]])
# check non-NaN is handled as before - the 1st column is all nan
if not np.isnan(X_test[:, i]).all():
Xt_col_nonan = est.transform(
_get_valid_samples_by_column(X_test, i))
assert_array_equal(Xt_col_nonan,
Xt_col[~np.isnan(Xt_col.squeeze())])
if support_sparse:
est_dense = clone(est)
est_sparse = clone(est)
with pytest.warns(None) as records:
Xt_dense = est_dense.fit(X_train).transform(X_test)
Xt_inv_dense = est_dense.inverse_transform(Xt_dense)
assert len(records) == 0
for sparse_constructor in (sparse.csr_matrix, sparse.csc_matrix,
sparse.bsr_matrix, sparse.coo_matrix,
sparse.dia_matrix, sparse.dok_matrix,
sparse.lil_matrix):
# check that the dense and sparse inputs lead to the same results
# precompute the matrix to avoid catching side warnings
X_train_sp = sparse_constructor(X_train)
X_test_sp = sparse_constructor(X_test)
with pytest.warns(None) as records:
warnings.simplefilter('ignore', PendingDeprecationWarning)
Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp)
assert len(records) == 0
assert_allclose(Xt_sp.A, Xt_dense)
with pytest.warns(None) as records:
warnings.simplefilter('ignore', PendingDeprecationWarning)
Xt_inv_sp = est_sparse.inverse_transform(Xt_sp)
assert len(records) == 0
assert_allclose(Xt_inv_sp.A, Xt_inv_dense)
@pytest.mark.parametrize(
"est, func",
[(MaxAbsScaler(), maxabs_scale),
(MinMaxScaler(), minmax_scale),
(StandardScaler(), scale),
(StandardScaler(with_mean=False), scale),
(PowerTransformer('yeo-johnson'), power_transform),
(PowerTransformer('box-cox'), power_transform,),
(QuantileTransformer(n_quantiles=3), quantile_transform),
(RobustScaler(), robust_scale),
(RobustScaler(with_centering=False), robust_scale)]
)
def test_missing_value_pandas_na_support(est, func):
# Test pandas IntegerArray with pd.NA
pd = pytest.importorskip('pandas', minversion="1.0")
X = np.array([[1, 2, 3, np.nan, np.nan, 4, 5, 1],
[np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8],
[1, 2, 3, 4, 5, 6, 7, 8]]).T
# Creates dataframe with IntegerArrays with pd.NA
X_df = pd.DataFrame(X, dtype="Int16", columns=['a', 'b', 'c'])
X_df['c'] = X_df['c'].astype('int')
X_trans = est.fit_transform(X)
X_df_trans = est.fit_transform(X_df)
assert_allclose(X_trans, X_df_trans)

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,283 @@
import pytest
import numpy as np
import scipy.sparse as sp
import warnings
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils._testing import (
assert_array_almost_equal,
assert_array_equal,
assert_warns_message
)
X = [[-2, 1.5, -4, -1],
[-1, 2.5, -3, -0.5],
[0, 3.5, -2, 0.5],
[1, 4.5, -1, 2]]
@pytest.mark.parametrize(
'strategy, expected',
[('uniform', [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]]),
('kmeans', [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]),
('quantile', [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]])])
def test_fit_transform(strategy, expected):
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy=strategy)
est.fit(X)
assert_array_equal(expected, est.transform(X))
def test_valid_n_bins():
KBinsDiscretizer(n_bins=2).fit_transform(X)
KBinsDiscretizer(n_bins=np.array([2])[0]).fit_transform(X)
assert KBinsDiscretizer(n_bins=2).fit(X).n_bins_.dtype == np.dtype(np.int)
def test_invalid_n_bins():
est = KBinsDiscretizer(n_bins=1)
err_msg = ("KBinsDiscretizer received an invalid "
"number of bins. Received 1, expected at least 2.")
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
est = KBinsDiscretizer(n_bins=1.1)
err_msg = ("KBinsDiscretizer received an invalid "
"n_bins type. Received float, expected int.")
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
def test_invalid_n_bins_array():
# Bad shape
n_bins = np.full((2, 4), 2.)
est = KBinsDiscretizer(n_bins=n_bins)
err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
# Incorrect number of features
n_bins = [1, 2, 2]
est = KBinsDiscretizer(n_bins=n_bins)
err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
# Bad bin values
n_bins = [1, 2, 2, 1]
est = KBinsDiscretizer(n_bins=n_bins)
err_msg = ("KBinsDiscretizer received an invalid number of bins "
"at indices 0, 3. Number of bins must be at least 2, "
"and must be an int.")
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
# Float bin values
n_bins = [2.1, 2, 2.1, 2]
est = KBinsDiscretizer(n_bins=n_bins)
err_msg = ("KBinsDiscretizer received an invalid number of bins "
"at indices 0, 2. Number of bins must be at least 2, "
"and must be an int.")
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
@pytest.mark.parametrize(
'strategy, expected',
[('uniform', [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]]),
('kmeans', [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]]),
('quantile', [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]])])
def test_fit_transform_n_bins_array(strategy, expected):
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode='ordinal',
strategy=strategy).fit(X)
assert_array_equal(expected, est.transform(X))
# test the shape of bin_edges_
n_features = np.array(X).shape[1]
assert est.bin_edges_.shape == (n_features, )
for bin_edges, n_bins in zip(est.bin_edges_, est.n_bins_):
assert bin_edges.shape == (n_bins + 1, )
def test_invalid_n_features():
est = KBinsDiscretizer(n_bins=3).fit(X)
bad_X = np.arange(25).reshape(5, -1)
err_msg = "Incorrect number of features. Expecting 4, received 5"
with pytest.raises(ValueError, match=err_msg):
est.transform(bad_X)
@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
def test_same_min_max(strategy):
warnings.simplefilter("always")
X = np.array([[1, -2],
[1, -1],
[1, 0],
[1, 1]])
est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode='ordinal')
assert_warns_message(UserWarning,
"Feature 0 is constant and will be replaced "
"with 0.", est.fit, X)
assert est.n_bins_[0] == 1
# replace the feature with zeros
Xt = est.transform(X)
assert_array_equal(Xt[:, 0], np.zeros(X.shape[0]))
def test_transform_1d_behavior():
X = np.arange(4)
est = KBinsDiscretizer(n_bins=2)
with pytest.raises(ValueError):
est.fit(X)
est = KBinsDiscretizer(n_bins=2)
est.fit(X.reshape(-1, 1))
with pytest.raises(ValueError):
est.transform(X)
@pytest.mark.parametrize('i', range(1, 9))
def test_numeric_stability(i):
X_init = np.array([2., 4., 6., 8., 10.]).reshape(-1, 1)
Xt_expected = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)
# Test up to discretizing nano units
X = X_init / 10**i
Xt = KBinsDiscretizer(n_bins=2, encode='ordinal').fit_transform(X)
assert_array_equal(Xt_expected, Xt)
def test_invalid_encode_option():
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode='invalid-encode')
err_msg = (r"Valid options for 'encode' are "
r"\('onehot', 'onehot-dense', 'ordinal'\). "
r"Got encode='invalid-encode' instead.")
with pytest.raises(ValueError, match=err_msg):
est.fit(X)
def test_encode_options():
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
encode='ordinal').fit(X)
Xt_1 = est.transform(X)
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
encode='onehot-dense').fit(X)
Xt_2 = est.transform(X)
assert not sp.issparse(Xt_2)
assert_array_equal(OneHotEncoder(
categories=[np.arange(i) for i in [2, 3, 3, 3]],
sparse=False)
.fit_transform(Xt_1), Xt_2)
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
encode='onehot').fit(X)
Xt_3 = est.transform(X)
assert sp.issparse(Xt_3)
assert_array_equal(OneHotEncoder(
categories=[np.arange(i) for i in [2, 3, 3, 3]],
sparse=True)
.fit_transform(Xt_1).toarray(),
Xt_3.toarray())
def test_invalid_strategy_option():
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], strategy='invalid-strategy')
err_msg = (r"Valid options for 'strategy' are "
r"\('uniform', 'quantile', 'kmeans'\). "
r"Got strategy='invalid-strategy' instead.")
with pytest.raises(ValueError, match=err_msg):
est.fit(X)
@pytest.mark.parametrize(
'strategy, expected_2bins, expected_3bins, expected_5bins',
[('uniform', [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]),
('kmeans', [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]),
('quantile', [0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2], [0, 1, 2, 3, 4, 4])])
def test_nonuniform_strategies(
strategy, expected_2bins, expected_3bins, expected_5bins):
X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)
# with 2 bins
est = KBinsDiscretizer(n_bins=2, strategy=strategy, encode='ordinal')
Xt = est.fit_transform(X)
assert_array_equal(expected_2bins, Xt.ravel())
# with 3 bins
est = KBinsDiscretizer(n_bins=3, strategy=strategy, encode='ordinal')
Xt = est.fit_transform(X)
assert_array_equal(expected_3bins, Xt.ravel())
# with 5 bins
est = KBinsDiscretizer(n_bins=5, strategy=strategy, encode='ordinal')
Xt = est.fit_transform(X)
assert_array_equal(expected_5bins, Xt.ravel())
@pytest.mark.parametrize(
'strategy, expected_inv',
[('uniform', [[-1.5, 2., -3.5, -0.5], [-0.5, 3., -2.5, -0.5],
[0.5, 4., -1.5, 0.5], [0.5, 4., -1.5, 1.5]]),
('kmeans', [[-1.375, 2.125, -3.375, -0.5625],
[-1.375, 2.125, -3.375, -0.5625],
[-0.125, 3.375, -2.125, 0.5625],
[0.75, 4.25, -1.25, 1.625]]),
('quantile', [[-1.5, 2., -3.5, -0.75], [-0.5, 3., -2.5, 0.],
[0.5, 4., -1.5, 1.25], [0.5, 4., -1.5, 1.25]])])
@pytest.mark.parametrize('encode', ['ordinal', 'onehot', 'onehot-dense'])
def test_inverse_transform(strategy, encode, expected_inv):
kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
Xt = kbd.fit_transform(X)
Xinv = kbd.inverse_transform(Xt)
assert_array_almost_equal(expected_inv, Xinv)
@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
def test_transform_outside_fit_range(strategy):
X = np.array([0, 1, 2, 3])[:, None]
kbd = KBinsDiscretizer(n_bins=4, strategy=strategy, encode='ordinal')
kbd.fit(X)
X2 = np.array([-2, 5])[:, None]
X2t = kbd.transform(X2)
assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
assert_array_equal(X2t.min(axis=0), [0])
def test_overwrite():
X = np.array([0, 1, 2, 3])[:, None]
X_before = X.copy()
est = KBinsDiscretizer(n_bins=3, encode="ordinal")
Xt = est.fit_transform(X)
assert_array_equal(X, X_before)
Xt_before = Xt.copy()
Xinv = est.inverse_transform(Xt)
assert_array_equal(Xt, Xt_before)
assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))
@pytest.mark.parametrize(
'strategy, expected_bin_edges',
[('quantile', [0, 1, 3]), ('kmeans', [0, 1.5, 3])])
def test_redundant_bins(strategy, expected_bin_edges):
X = [[0], [0], [0], [0], [3], [3]]
kbd = KBinsDiscretizer(n_bins=3, strategy=strategy)
msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 "
"are removed. Consider decreasing the number of bins.")
assert_warns_message(UserWarning, msg, kbd.fit, X)
assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges)
def test_percentile_numeric_stability():
X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
bin_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
Xt = np.array([0, 0, 4]).reshape(-1, 1)
kbd = KBinsDiscretizer(n_bins=10, encode='ordinal',
strategy='quantile')
msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 "
"are removed. Consider decreasing the number of bins.")
assert_warns_message(UserWarning, msg, kbd.fit, X)
assert_array_almost_equal(kbd.bin_edges_[0], bin_edges)
assert_array_almost_equal(kbd.transform(X), Xt)

View file

@ -0,0 +1,698 @@
# -*- coding: utf-8 -*-
import re
import numpy as np
from scipy import sparse
import pytest
from sklearn.exceptions import NotFittedError
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
def test_one_hot_encoder_sparse_dense():
# check that sparse and dense will give the same results
X = np.array([[3, 2, 1], [0, 1, 1]])
enc_sparse = OneHotEncoder()
enc_dense = OneHotEncoder(sparse=False)
X_trans_sparse = enc_sparse.fit_transform(X)
X_trans_dense = enc_dense.fit_transform(X)
assert X_trans_sparse.shape == (2, 5)
assert X_trans_dense.shape == (2, 5)
assert sparse.issparse(X_trans_sparse)
assert not sparse.issparse(X_trans_dense)
# check outcome
assert_array_equal(X_trans_sparse.toarray(), [[0., 1., 0., 1., 1.],
[1., 0., 1., 0., 1.]])
assert_array_equal(X_trans_sparse.toarray(), X_trans_dense)
def test_one_hot_encoder_diff_n_features():
X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
X2 = np.array([[1, 0]])
enc = OneHotEncoder()
enc.fit(X)
err_msg = ("The number of features in X is different to the number of "
"features of the fitted data.")
with pytest.raises(ValueError, match=err_msg):
enc.transform(X2)
def test_one_hot_encoder_handle_unknown():
X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
X2 = np.array([[4, 1, 1]])
# Test that one hot encoder raises error for unknown features
# present during transform.
oh = OneHotEncoder(handle_unknown='error')
oh.fit(X)
with pytest.raises(ValueError, match='Found unknown categories'):
oh.transform(X2)
# Test the ignore option, ignores unknown features (giving all 0's)
oh = OneHotEncoder(handle_unknown='ignore')
oh.fit(X)
X2_passed = X2.copy()
assert_array_equal(
oh.transform(X2_passed).toarray(),
np.array([[0., 0., 0., 0., 1., 0., 0.]]))
# ensure transformed data was not modified in place
assert_allclose(X2, X2_passed)
# Raise error if handle_unknown is neither ignore or error.
oh = OneHotEncoder(handle_unknown='42')
with pytest.raises(ValueError, match='handle_unknown should be either'):
oh.fit(X)
def test_one_hot_encoder_not_fitted():
X = np.array([['a'], ['b']])
enc = OneHotEncoder(categories=['a', 'b'])
msg = ("This OneHotEncoder instance is not fitted yet. "
"Call 'fit' with appropriate arguments before using this "
"estimator.")
with pytest.raises(NotFittedError, match=msg):
enc.transform(X)
def test_one_hot_encoder_handle_unknown_strings():
X = np.array(['11111111', '22', '333', '4444']).reshape((-1, 1))
X2 = np.array(['55555', '22']).reshape((-1, 1))
# Non Regression test for the issue #12470
# Test the ignore option, when categories are numpy string dtype
# particularly when the known category strings are larger
# than the unknown category strings
oh = OneHotEncoder(handle_unknown='ignore')
oh.fit(X)
X2_passed = X2.copy()
assert_array_equal(
oh.transform(X2_passed).toarray(),
np.array([[0., 0., 0., 0.], [0., 1., 0., 0.]]))
# ensure transformed data was not modified in place
assert_array_equal(X2, X2_passed)
@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
@pytest.mark.parametrize("input_dtype", [np.int32, np.float32, np.float64])
def test_one_hot_encoder_dtype(input_dtype, output_dtype):
X = np.asarray([[0, 1]], dtype=input_dtype).T
X_expected = np.asarray([[1, 0], [0, 1]], dtype=output_dtype)
oh = OneHotEncoder(categories='auto', dtype=output_dtype)
assert_array_equal(oh.fit_transform(X).toarray(), X_expected)
assert_array_equal(oh.fit(X).transform(X).toarray(), X_expected)
oh = OneHotEncoder(categories='auto', dtype=output_dtype, sparse=False)
assert_array_equal(oh.fit_transform(X), X_expected)
assert_array_equal(oh.fit(X).transform(X), X_expected)
@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
def test_one_hot_encoder_dtype_pandas(output_dtype):
pd = pytest.importorskip('pandas')
X_df = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})
X_expected = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=output_dtype)
oh = OneHotEncoder(dtype=output_dtype)
assert_array_equal(oh.fit_transform(X_df).toarray(), X_expected)
assert_array_equal(oh.fit(X_df).transform(X_df).toarray(), X_expected)
oh = OneHotEncoder(dtype=output_dtype, sparse=False)
assert_array_equal(oh.fit_transform(X_df), X_expected)
assert_array_equal(oh.fit(X_df).transform(X_df), X_expected)
def test_one_hot_encoder_feature_names():
enc = OneHotEncoder()
X = [['Male', 1, 'girl', 2, 3],
['Female', 41, 'girl', 1, 10],
['Male', 51, 'boy', 12, 3],
['Male', 91, 'girl', 21, 30]]
enc.fit(X)
feature_names = enc.get_feature_names()
assert isinstance(feature_names, np.ndarray)
assert_array_equal(['x0_Female', 'x0_Male',
'x1_1', 'x1_41', 'x1_51', 'x1_91',
'x2_boy', 'x2_girl',
'x3_1', 'x3_2', 'x3_12', 'x3_21',
'x4_3',
'x4_10', 'x4_30'], feature_names)
feature_names2 = enc.get_feature_names(['one', 'two',
'three', 'four', 'five'])
assert_array_equal(['one_Female', 'one_Male',
'two_1', 'two_41', 'two_51', 'two_91',
'three_boy', 'three_girl',
'four_1', 'four_2', 'four_12', 'four_21',
'five_3', 'five_10', 'five_30'], feature_names2)
with pytest.raises(ValueError, match="input_features should have length"):
enc.get_feature_names(['one', 'two'])
def test_one_hot_encoder_feature_names_unicode():
enc = OneHotEncoder()
X = np.array([['c❤t1', 'dat2']], dtype=object).T
enc.fit(X)
feature_names = enc.get_feature_names()
assert_array_equal(['x0_c❤t1', 'x0_dat2'], feature_names)
feature_names = enc.get_feature_names(input_features=['n👍me'])
assert_array_equal(['n👍me_c❤t1', 'n👍me_dat2'], feature_names)
def test_one_hot_encoder_set_params():
X = np.array([[1, 2]]).T
oh = OneHotEncoder()
# set params on not yet fitted object
oh.set_params(categories=[[0, 1, 2, 3]])
assert oh.get_params()['categories'] == [[0, 1, 2, 3]]
assert oh.fit_transform(X).toarray().shape == (2, 4)
# set params on already fitted object
oh.set_params(categories=[[0, 1, 2, 3, 4]])
assert oh.fit_transform(X).toarray().shape == (2, 5)
def check_categorical_onehot(X):
enc = OneHotEncoder(categories='auto')
Xtr1 = enc.fit_transform(X)
enc = OneHotEncoder(categories='auto', sparse=False)
Xtr2 = enc.fit_transform(X)
assert_allclose(Xtr1.toarray(), Xtr2)
assert sparse.isspmatrix_csr(Xtr1)
return Xtr1.toarray()
@pytest.mark.parametrize("X", [
[['def', 1, 55], ['abc', 2, 55]],
np.array([[10, 1, 55], [5, 2, 55]]),
np.array([['b', 'A', 'cat'], ['a', 'B', 'cat']], dtype=object)
], ids=['mixed', 'numeric', 'object'])
def test_one_hot_encoder(X):
Xtr = check_categorical_onehot(np.array(X)[:, [0]])
assert_allclose(Xtr, [[0, 1], [1, 0]])
Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]])
assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]])
Xtr = OneHotEncoder(categories='auto').fit_transform(X)
assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]])
@pytest.mark.parametrize('sparse_', [False, True])
@pytest.mark.parametrize('drop', [None, 'first'])
def test_one_hot_encoder_inverse(sparse_, drop):
X = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]]
enc = OneHotEncoder(sparse=sparse_, drop=drop)
X_tr = enc.fit_transform(X)
exp = np.array(X, dtype=object)
assert_array_equal(enc.inverse_transform(X_tr), exp)
X = [[2, 55], [1, 55], [3, 55]]
enc = OneHotEncoder(sparse=sparse_, categories='auto',
drop=drop)
X_tr = enc.fit_transform(X)
exp = np.array(X)
assert_array_equal(enc.inverse_transform(X_tr), exp)
if drop is None:
# with unknown categories
# drop is incompatible with handle_unknown=ignore
X = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]]
enc = OneHotEncoder(sparse=sparse_, handle_unknown='ignore',
categories=[['abc', 'def'], [1, 2],
[54, 55, 56]])
X_tr = enc.fit_transform(X)
exp = np.array(X, dtype=object)
exp[2, 1] = None
assert_array_equal(enc.inverse_transform(X_tr), exp)
# with an otherwise numerical output, still object if unknown
X = [[2, 55], [1, 55], [3, 55]]
enc = OneHotEncoder(sparse=sparse_, categories=[[1, 2], [54, 56]],
handle_unknown='ignore')
X_tr = enc.fit_transform(X)
exp = np.array(X, dtype=object)
exp[2, 0] = None
exp[:, 1] = None
assert_array_equal(enc.inverse_transform(X_tr), exp)
# incorrect shape raises
X_tr = np.array([[0, 1, 1], [1, 0, 1]])
msg = re.escape('Shape of the passed X data is not correct')
with pytest.raises(ValueError, match=msg):
enc.inverse_transform(X_tr)
def test_one_hot_encoder_inverse_if_binary():
X = np.array([['Male', 1],
['Female', 3],
['Female', 2]], dtype=object)
ohe = OneHotEncoder(drop='if_binary', sparse=False)
X_tr = ohe.fit_transform(X)
assert_array_equal(ohe.inverse_transform(X_tr), X)
# check that resetting drop option without refitting does not throw an error
@pytest.mark.parametrize('drop', ['if_binary', 'first', None])
@pytest.mark.parametrize('reset_drop', ['if_binary', 'first', None])
def test_one_hot_encoder_drop_reset(drop, reset_drop):
X = np.array([['Male', 1],
['Female', 3],
['Female', 2]], dtype=object)
ohe = OneHotEncoder(drop=drop, sparse=False)
ohe.fit(X)
X_tr = ohe.transform(X)
feature_names = ohe.get_feature_names()
ohe.set_params(drop=reset_drop)
assert_array_equal(ohe.inverse_transform(X_tr), X)
assert_allclose(ohe.transform(X), X_tr)
assert_array_equal(ohe.get_feature_names(), feature_names)
@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
@pytest.mark.parametrize("X", [
[1, 2],
np.array([3., 4.])
])
def test_X_is_not_1D(X, method):
oh = OneHotEncoder()
msg = ("Expected 2D array, got 1D array instead")
with pytest.raises(ValueError, match=msg):
getattr(oh, method)(X)
@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
def test_X_is_not_1D_pandas(method):
pd = pytest.importorskip('pandas')
X = pd.Series([6, 3, 4, 6])
oh = OneHotEncoder()
msg = ("Expected 2D array, got 1D array instead")
with pytest.raises(ValueError, match=msg):
getattr(oh, method)(X)
@pytest.mark.parametrize("X, cat_exp, cat_dtype", [
([['abc', 55], ['def', 55]], [['abc', 'def'], [55]], np.object_),
(np.array([[1, 2], [3, 2]]), [[1, 3], [2]], np.integer),
(np.array([['A', 'cat'], ['B', 'cat']], dtype=object),
[['A', 'B'], ['cat']], np.object_),
(np.array([['A', 'cat'], ['B', 'cat']]),
[['A', 'B'], ['cat']], np.str_)
], ids=['mixed', 'numeric', 'object', 'string'])
def test_one_hot_encoder_categories(X, cat_exp, cat_dtype):
# order of categories should not depend on order of samples
for Xi in [X, X[::-1]]:
enc = OneHotEncoder(categories='auto')
enc.fit(Xi)
# assert enc.categories == 'auto'
assert isinstance(enc.categories_, list)
for res, exp in zip(enc.categories_, cat_exp):
assert res.tolist() == exp
assert np.issubdtype(res.dtype, cat_dtype)
@pytest.mark.parametrize("X, X2, cats, cat_dtype", [
(np.array([['a', 'b']], dtype=object).T,
np.array([['a', 'd']], dtype=object).T,
[['a', 'b', 'c']], np.object_),
(np.array([[1, 2]], dtype='int64').T,
np.array([[1, 4]], dtype='int64').T,
[[1, 2, 3]], np.int64),
(np.array([['a', 'b']], dtype=object).T,
np.array([['a', 'd']], dtype=object).T,
[np.array(['a', 'b', 'c'])], np.object_),
], ids=['object', 'numeric', 'object-string-cat'])
def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
enc = OneHotEncoder(categories=cats)
exp = np.array([[1., 0., 0.],
[0., 1., 0.]])
assert_array_equal(enc.fit_transform(X).toarray(), exp)
assert list(enc.categories[0]) == list(cats[0])
assert enc.categories_[0].tolist() == list(cats[0])
# manually specified categories should have same dtype as
# the data when coerced from lists
assert enc.categories_[0].dtype == cat_dtype
# when specifying categories manually, unknown categories should already
# raise when fitting
enc = OneHotEncoder(categories=cats)
with pytest.raises(ValueError, match="Found unknown categories"):
enc.fit(X2)
enc = OneHotEncoder(categories=cats, handle_unknown='ignore')
exp = np.array([[1., 0., 0.], [0., 0., 0.]])
assert_array_equal(enc.fit(X2).transform(X2).toarray(), exp)
def test_one_hot_encoder_unsorted_categories():
X = np.array([['a', 'b']], dtype=object).T
enc = OneHotEncoder(categories=[['b', 'a', 'c']])
exp = np.array([[0., 1., 0.],
[1., 0., 0.]])
assert_array_equal(enc.fit(X).transform(X).toarray(), exp)
assert_array_equal(enc.fit_transform(X).toarray(), exp)
assert enc.categories_[0].tolist() == ['b', 'a', 'c']
assert np.issubdtype(enc.categories_[0].dtype, np.object_)
# unsorted passed categories still raise for numerical values
X = np.array([[1, 2]]).T
enc = OneHotEncoder(categories=[[2, 1, 3]])
msg = 'Unsorted categories are not supported'
with pytest.raises(ValueError, match=msg):
enc.fit_transform(X)
def test_one_hot_encoder_specified_categories_mixed_columns():
# multiple columns
X = np.array([['a', 'b'], [0, 2]], dtype=object).T
enc = OneHotEncoder(categories=[['a', 'b', 'c'], [0, 1, 2]])
exp = np.array([[1., 0., 0., 1., 0., 0.],
[0., 1., 0., 0., 0., 1.]])
assert_array_equal(enc.fit_transform(X).toarray(), exp)
assert enc.categories_[0].tolist() == ['a', 'b', 'c']
assert np.issubdtype(enc.categories_[0].dtype, np.object_)
assert enc.categories_[1].tolist() == [0, 1, 2]
# integer categories but from object dtype data
assert np.issubdtype(enc.categories_[1].dtype, np.object_)
def test_one_hot_encoder_pandas():
pd = pytest.importorskip('pandas')
X_df = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})
Xtr = check_categorical_onehot(X_df)
assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])
@pytest.mark.parametrize("drop, expected_names",
[('first', ['x0_c', 'x2_b']),
('if_binary', ['x0_c', 'x1_2', 'x2_b']),
(['c', 2, 'b'], ['x0_b', 'x2_a'])],
ids=['first', 'binary', 'manual'])
def test_one_hot_encoder_feature_names_drop(drop, expected_names):
X = [['c', 2, 'a'],
['b', 2, 'b']]
ohe = OneHotEncoder(drop=drop)
ohe.fit(X)
feature_names = ohe.get_feature_names()
assert isinstance(feature_names, np.ndarray)
assert_array_equal(expected_names, feature_names)
def test_one_hot_encoder_drop_equals_if_binary():
# Canonical case
X = [[10, 'yes'],
[20, 'no'],
[30, 'yes']]
expected = np.array([[1., 0., 0., 1.],
[0., 1., 0., 0.],
[0., 0., 1., 1.]])
expected_drop_idx = np.array([None, 0])
ohe = OneHotEncoder(drop='if_binary', sparse=False)
result = ohe.fit_transform(X)
assert_array_equal(ohe.drop_idx_, expected_drop_idx)
assert_allclose(result, expected)
# with only one cat, the behaviour is equivalent to drop=None
X = [['true', 'a'],
['false', 'a'],
['false', 'a']]
expected = np.array([[1., 1.],
[0., 1.],
[0., 1.]])
expected_drop_idx = np.array([0, None])
ohe = OneHotEncoder(drop='if_binary', sparse=False)
result = ohe.fit_transform(X)
assert_array_equal(ohe.drop_idx_, expected_drop_idx)
assert_allclose(result, expected)
@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
np.array([['a', np.nan]], dtype=object).T],
ids=['numeric', 'object'])
@pytest.mark.parametrize("as_data_frame", [False, True],
ids=['array', 'dataframe'])
@pytest.mark.parametrize("handle_unknown", ['error', 'ignore'])
def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
if as_data_frame:
pd = pytest.importorskip('pandas')
X = pd.DataFrame(X)
ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown)
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.fit(X)
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.fit_transform(X)
if as_data_frame:
X_partial = X.iloc[:1, :]
else:
X_partial = X[:1, :]
ohe.fit(X_partial)
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.transform(X)
@pytest.mark.parametrize("X", [
[['abc', 2, 55], ['def', 1, 55]],
np.array([[10, 2, 55], [20, 1, 55]]),
np.array([['a', 'B', 'cat'], ['b', 'A', 'cat']], dtype=object)
], ids=['mixed', 'numeric', 'object'])
def test_ordinal_encoder(X):
enc = OrdinalEncoder()
exp = np.array([[0, 1, 0],
[1, 0, 0]], dtype='int64')
assert_array_equal(enc.fit_transform(X), exp.astype('float64'))
enc = OrdinalEncoder(dtype='int64')
assert_array_equal(enc.fit_transform(X), exp)
@pytest.mark.parametrize("X, X2, cats, cat_dtype", [
(np.array([['a', 'b']], dtype=object).T,
np.array([['a', 'd']], dtype=object).T,
[['a', 'b', 'c']], np.object_),
(np.array([[1, 2]], dtype='int64').T,
np.array([[1, 4]], dtype='int64').T,
[[1, 2, 3]], np.int64),
(np.array([['a', 'b']], dtype=object).T,
np.array([['a', 'd']], dtype=object).T,
[np.array(['a', 'b', 'c'])], np.object_),
], ids=['object', 'numeric', 'object-string-cat'])
def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
enc = OrdinalEncoder(categories=cats)
exp = np.array([[0.], [1.]])
assert_array_equal(enc.fit_transform(X), exp)
assert list(enc.categories[0]) == list(cats[0])
assert enc.categories_[0].tolist() == list(cats[0])
# manually specified categories should have same dtype as
# the data when coerced from lists
assert enc.categories_[0].dtype == cat_dtype
# when specifying categories manually, unknown categories should already
# raise when fitting
enc = OrdinalEncoder(categories=cats)
with pytest.raises(ValueError, match="Found unknown categories"):
enc.fit(X2)
def test_ordinal_encoder_inverse():
X = [['abc', 2, 55], ['def', 1, 55]]
enc = OrdinalEncoder()
X_tr = enc.fit_transform(X)
exp = np.array(X, dtype=object)
assert_array_equal(enc.inverse_transform(X_tr), exp)
# incorrect shape raises
X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
msg = re.escape('Shape of the passed X data is not correct')
with pytest.raises(ValueError, match=msg):
enc.inverse_transform(X_tr)
@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
np.array([['a', np.nan]], dtype=object).T],
ids=['numeric', 'object'])
def test_ordinal_encoder_raise_missing(X):
ohe = OrdinalEncoder()
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.fit(X)
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.fit_transform(X)
ohe.fit(X[:1, :])
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.transform(X)
def test_ordinal_encoder_raise_categories_shape():
X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T
cats = ['Low', 'Medium', 'High']
enc = OrdinalEncoder(categories=cats)
msg = ("Shape mismatch: if categories is an array,")
with pytest.raises(ValueError, match=msg):
enc.fit(X)
def test_encoder_dtypes():
# check that dtypes are preserved when determining categories
enc = OneHotEncoder(categories='auto')
exp = np.array([[1., 0., 1., 0.], [0., 1., 0., 1.]], dtype='float64')
for X in [np.array([[1, 2], [3, 4]], dtype='int64'),
np.array([[1, 2], [3, 4]], dtype='float64'),
np.array([['a', 'b'], ['c', 'd']]), # string dtype
np.array([[1, 'a'], [3, 'b']], dtype='object')]:
enc.fit(X)
assert all([enc.categories_[i].dtype == X.dtype for i in range(2)])
assert_array_equal(enc.transform(X).toarray(), exp)
X = [[1, 2], [3, 4]]
enc.fit(X)
assert all([np.issubdtype(enc.categories_[i].dtype, np.integer)
for i in range(2)])
assert_array_equal(enc.transform(X).toarray(), exp)
X = [[1, 'a'], [3, 'b']]
enc.fit(X)
assert all([enc.categories_[i].dtype == 'object' for i in range(2)])
assert_array_equal(enc.transform(X).toarray(), exp)
def test_encoder_dtypes_pandas():
# check dtype (similar to test_categorical_encoder_dtypes for dataframes)
pd = pytest.importorskip('pandas')
enc = OneHotEncoder(categories='auto')
exp = np.array([[1., 0., 1., 0., 1., 0.],
[0., 1., 0., 1., 0., 1.]], dtype='float64')
X = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}, dtype='int64')
enc.fit(X)
assert all([enc.categories_[i].dtype == 'int64' for i in range(2)])
assert_array_equal(enc.transform(X).toarray(), exp)
X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]})
X_type = [X['A'].dtype, X['B'].dtype, X['C'].dtype]
enc.fit(X)
assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)])
assert_array_equal(enc.transform(X).toarray(), exp)
def test_one_hot_encoder_warning():
enc = OneHotEncoder()
X = [['Male', 1], ['Female', 3]]
np.testing.assert_no_warnings(enc.fit_transform, X)
def test_one_hot_encoder_drop_manual():
cats_to_drop = ['def', 12, 3, 56]
enc = OneHotEncoder(drop=cats_to_drop)
X = [['abc', 12, 2, 55],
['def', 12, 1, 55],
['def', 12, 3, 56]]
trans = enc.fit_transform(X).toarray()
exp = [[1, 0, 1, 1],
[0, 1, 0, 1],
[0, 0, 0, 0]]
assert_array_equal(trans, exp)
dropped_cats = [cat[feature]
for cat, feature in zip(enc.categories_,
enc.drop_idx_)]
assert_array_equal(dropped_cats, cats_to_drop)
assert_array_equal(np.array(X, dtype=object),
enc.inverse_transform(trans))
@pytest.mark.parametrize(
"X_fit, params, err_msg",
[([["Male"], ["Female"]], {'drop': 'second'},
"Wrong input for parameter `drop`"),
([["Male"], ["Female"]], {'drop': 'first', 'handle_unknown': 'ignore'},
"`handle_unknown` must be 'error'"),
([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
{'drop': np.asarray('b', dtype=object)},
"Wrong input for parameter `drop`"),
([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
{'drop': ['ghi', 3, 59]},
"The following categories were supposed")]
)
def test_one_hot_encoder_invalid_params(X_fit, params, err_msg):
enc = OneHotEncoder(**params)
with pytest.raises(ValueError, match=err_msg):
enc.fit(X_fit)
@pytest.mark.parametrize('drop', [['abc', 3], ['abc', 3, 41, 'a']])
def test_invalid_drop_length(drop):
enc = OneHotEncoder(drop=drop)
err_msg = "`drop` should have length equal to the number"
with pytest.raises(ValueError, match=err_msg):
enc.fit([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]])
@pytest.mark.parametrize("density", [True, False],
ids=['sparse', 'dense'])
@pytest.mark.parametrize("drop", ['first',
['a', 2, 'b']],
ids=['first', 'manual'])
def test_categories(density, drop):
ohe_base = OneHotEncoder(sparse=density)
ohe_test = OneHotEncoder(sparse=density, drop=drop)
X = [['c', 1, 'a'],
['a', 2, 'b']]
ohe_base.fit(X)
ohe_test.fit(X)
assert_array_equal(ohe_base.categories_, ohe_test.categories_)
if drop == 'first':
assert_array_equal(ohe_test.drop_idx_, 0)
else:
for drop_cat, drop_idx, cat_list in zip(drop,
ohe_test.drop_idx_,
ohe_test.categories_):
assert cat_list[int(drop_idx)] == drop_cat
assert isinstance(ohe_test.drop_idx_, np.ndarray)
assert ohe_test.drop_idx_.dtype == np.object
@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
def test_encoders_has_categorical_tags(Encoder):
assert 'categorical' in Encoder()._get_tags()['X_types']
@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
def test_encoders_does_not_support_none_values(Encoder):
values = [["a"], [None]]
with pytest.raises(TypeError, match="Encoders require their input to be "
"uniformly strings or numbers."):
Encoder().fit(values)

View file

@ -0,0 +1,160 @@
import pytest
import numpy as np
from scipy import sparse
from sklearn.preprocessing import FunctionTransformer
from sklearn.utils._testing import (assert_array_equal,
assert_allclose_dense_sparse)
from sklearn.utils._testing import assert_warns_message, assert_no_warnings
def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X):
def _func(X, *args, **kwargs):
args_store.append(X)
args_store.extend(args)
kwargs_store.update(kwargs)
return func(X)
return _func
def test_delegate_to_func():
# (args|kwargs)_store will hold the positional and keyword arguments
# passed to the function inside the FunctionTransformer.
args_store = []
kwargs_store = {}
X = np.arange(10).reshape((5, 2))
assert_array_equal(
FunctionTransformer(_make_func(args_store, kwargs_store)).transform(X),
X, 'transform should have returned X unchanged',
)
# The function should only have received X.
assert args_store == [X], ('Incorrect positional arguments passed to '
'func: {args}'.format(args=args_store))
assert not kwargs_store, ('Unexpected keyword arguments passed to '
'func: {args}'.format(args=kwargs_store))
# reset the argument stores.
args_store[:] = []
kwargs_store.clear()
transformed = FunctionTransformer(
_make_func(args_store, kwargs_store),
).transform(X)
assert_array_equal(transformed, X,
err_msg='transform should have returned X unchanged')
# The function should have received X
assert args_store == [X], ('Incorrect positional arguments passed '
'to func: {args}'.format(args=args_store))
assert not kwargs_store, ('Unexpected keyword arguments passed to '
'func: {args}'.format(args=kwargs_store))
def test_np_log():
X = np.arange(10).reshape((5, 2))
# Test that the numpy.log example still works.
assert_array_equal(
FunctionTransformer(np.log1p).transform(X),
np.log1p(X),
)
def test_kw_arg():
X = np.linspace(0, 1, num=10).reshape((5, 2))
F = FunctionTransformer(np.around, kw_args=dict(decimals=3))
# Test that rounding is correct
assert_array_equal(F.transform(X),
np.around(X, decimals=3))
def test_kw_arg_update():
X = np.linspace(0, 1, num=10).reshape((5, 2))
F = FunctionTransformer(np.around, kw_args=dict(decimals=3))
F.kw_args['decimals'] = 1
# Test that rounding is correct
assert_array_equal(F.transform(X), np.around(X, decimals=1))
def test_kw_arg_reset():
X = np.linspace(0, 1, num=10).reshape((5, 2))
F = FunctionTransformer(np.around, kw_args=dict(decimals=3))
F.kw_args = dict(decimals=1)
# Test that rounding is correct
assert_array_equal(F.transform(X), np.around(X, decimals=1))
def test_inverse_transform():
X = np.array([1, 4, 9, 16]).reshape((2, 2))
# Test that inverse_transform works correctly
F = FunctionTransformer(
func=np.sqrt,
inverse_func=np.around, inv_kw_args=dict(decimals=3),
)
assert_array_equal(
F.inverse_transform(F.transform(X)),
np.around(np.sqrt(X), decimals=3),
)
def test_check_inverse():
X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
X_list = [X_dense,
sparse.csr_matrix(X_dense),
sparse.csc_matrix(X_dense)]
for X in X_list:
if sparse.issparse(X):
accept_sparse = True
else:
accept_sparse = False
trans = FunctionTransformer(func=np.sqrt,
inverse_func=np.around,
accept_sparse=accept_sparse,
check_inverse=True,
validate=True)
assert_warns_message(UserWarning,
"The provided functions are not strictly"
" inverse of each other. If you are sure you"
" want to proceed regardless, set"
" 'check_inverse=False'.",
trans.fit, X)
trans = FunctionTransformer(func=np.expm1,
inverse_func=np.log1p,
accept_sparse=accept_sparse,
check_inverse=True,
validate=True)
Xt = assert_no_warnings(trans.fit_transform, X)
assert_allclose_dense_sparse(X, trans.inverse_transform(Xt))
# check that we don't check inverse when one of the func or inverse is not
# provided.
trans = FunctionTransformer(func=np.expm1, inverse_func=None,
check_inverse=True, validate=True)
assert_no_warnings(trans.fit, X_dense)
trans = FunctionTransformer(func=None, inverse_func=np.expm1,
check_inverse=True, validate=True)
assert_no_warnings(trans.fit, X_dense)
def test_function_transformer_frame():
pd = pytest.importorskip('pandas')
X_df = pd.DataFrame(np.random.randn(100, 10))
transformer = FunctionTransformer()
X_df_trans = transformer.fit_transform(X_df)
assert hasattr(X_df_trans, 'loc')

View file

@ -0,0 +1,656 @@
import numpy as np
import pytest
from scipy.sparse import issparse
from scipy.sparse import coo_matrix
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import dok_matrix
from scipy.sparse import lil_matrix
from sklearn.utils.multiclass import type_of_target
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_warns_message
from sklearn.utils._testing import ignore_warnings
from sklearn.utils import _to_object_array
from sklearn.preprocessing._label import LabelBinarizer
from sklearn.preprocessing._label import MultiLabelBinarizer
from sklearn.preprocessing._label import LabelEncoder
from sklearn.preprocessing._label import label_binarize
from sklearn.preprocessing._label import _inverse_binarize_thresholding
from sklearn.preprocessing._label import _inverse_binarize_multiclass
from sklearn.preprocessing._label import _encode
from sklearn import datasets
iris = datasets.load_iris()
def toarray(a):
if hasattr(a, "toarray"):
a = a.toarray()
return a
def test_label_binarizer():
# one-class case defaults to negative label
# For dense case:
inp = ["pos", "pos", "pos", "pos"]
lb = LabelBinarizer(sparse_output=False)
expected = np.array([[0, 0, 0, 0]]).T
got = lb.fit_transform(inp)
assert_array_equal(lb.classes_, ["pos"])
assert_array_equal(expected, got)
assert_array_equal(lb.inverse_transform(got), inp)
# For sparse case:
lb = LabelBinarizer(sparse_output=True)
got = lb.fit_transform(inp)
assert issparse(got)
assert_array_equal(lb.classes_, ["pos"])
assert_array_equal(expected, got.toarray())
assert_array_equal(lb.inverse_transform(got.toarray()), inp)
lb = LabelBinarizer(sparse_output=False)
# two-class case
inp = ["neg", "pos", "pos", "neg"]
expected = np.array([[0, 1, 1, 0]]).T
got = lb.fit_transform(inp)
assert_array_equal(lb.classes_, ["neg", "pos"])
assert_array_equal(expected, got)
to_invert = np.array([[1, 0],
[0, 1],
[0, 1],
[1, 0]])
assert_array_equal(lb.inverse_transform(to_invert), inp)
# multi-class case
inp = ["spam", "ham", "eggs", "ham", "0"]
expected = np.array([[0, 0, 0, 1],
[0, 0, 1, 0],
[0, 1, 0, 0],
[0, 0, 1, 0],
[1, 0, 0, 0]])
got = lb.fit_transform(inp)
assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam'])
assert_array_equal(expected, got)
assert_array_equal(lb.inverse_transform(got), inp)
def test_label_binarizer_unseen_labels():
lb = LabelBinarizer()
expected = np.array([[1, 0, 0],
[0, 1, 0],
[0, 0, 1]])
got = lb.fit_transform(['b', 'd', 'e'])
assert_array_equal(expected, got)
expected = np.array([[0, 0, 0],
[1, 0, 0],
[0, 0, 0],
[0, 1, 0],
[0, 0, 1],
[0, 0, 0]])
got = lb.transform(['a', 'b', 'c', 'd', 'e', 'f'])
assert_array_equal(expected, got)
def test_label_binarizer_set_label_encoding():
lb = LabelBinarizer(neg_label=-2, pos_label=0)
# two-class case with pos_label=0
inp = np.array([0, 1, 1, 0])
expected = np.array([[-2, 0, 0, -2]]).T
got = lb.fit_transform(inp)
assert_array_equal(expected, got)
assert_array_equal(lb.inverse_transform(got), inp)
lb = LabelBinarizer(neg_label=-2, pos_label=2)
# multi-class case
inp = np.array([3, 2, 1, 2, 0])
expected = np.array([[-2, -2, -2, +2],
[-2, -2, +2, -2],
[-2, +2, -2, -2],
[-2, -2, +2, -2],
[+2, -2, -2, -2]])
got = lb.fit_transform(inp)
assert_array_equal(expected, got)
assert_array_equal(lb.inverse_transform(got), inp)
@ignore_warnings
def test_label_binarizer_errors():
# Check that invalid arguments yield ValueError
one_class = np.array([0, 0, 0, 0])
lb = LabelBinarizer().fit(one_class)
multi_label = [(2, 3), (0,), (0, 2)]
with pytest.raises(ValueError):
lb.transform(multi_label)
lb = LabelBinarizer()
with pytest.raises(ValueError):
lb.transform([])
with pytest.raises(ValueError):
lb.inverse_transform([])
with pytest.raises(ValueError):
LabelBinarizer(neg_label=2, pos_label=1)
with pytest.raises(ValueError):
LabelBinarizer(neg_label=2, pos_label=2)
with pytest.raises(ValueError):
LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)
# Fail on y_type
with pytest.raises(ValueError):
_inverse_binarize_thresholding(y=csr_matrix([[1, 2], [2, 1]]),
output_type="foo", classes=[1, 2],
threshold=0)
# Sequence of seq type should raise ValueError
y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
with pytest.raises(ValueError):
LabelBinarizer().fit_transform(y_seq_of_seqs)
# Fail on the number of classes
with pytest.raises(ValueError):
_inverse_binarize_thresholding(y=csr_matrix([[1, 2], [2, 1]]),
output_type="foo",
classes=[1, 2, 3],
threshold=0)
# Fail on the dimension of 'binary'
with pytest.raises(ValueError):
_inverse_binarize_thresholding(y=np.array([[1, 2, 3], [2, 1, 3]]),
output_type="binary",
classes=[1, 2, 3],
threshold=0)
# Fail on multioutput data
with pytest.raises(ValueError):
LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
with pytest.raises(ValueError):
label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
@pytest.mark.parametrize(
"values, classes, unknown",
[(np.array([2, 1, 3, 1, 3], dtype='int64'),
np.array([1, 2, 3], dtype='int64'), np.array([4], dtype='int64')),
(np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
np.array(['a', 'b', 'c'], dtype=object),
np.array(['d'], dtype=object)),
(np.array(['b', 'a', 'c', 'a', 'c']),
np.array(['a', 'b', 'c']), np.array(['d']))],
ids=['int64', 'object', 'str'])
def test_label_encoder(values, classes, unknown):
# Test LabelEncoder's transform, fit_transform and
# inverse_transform methods
le = LabelEncoder()
le.fit(values)
assert_array_equal(le.classes_, classes)
assert_array_equal(le.transform(values), [1, 0, 2, 0, 2])
assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values)
le = LabelEncoder()
ret = le.fit_transform(values)
assert_array_equal(ret, [1, 0, 2, 0, 2])
with pytest.raises(ValueError, match="unseen labels"):
le.transform(unknown)
def test_label_encoder_negative_ints():
le = LabelEncoder()
le.fit([1, 1, 4, 5, -1, 0])
assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]),
[1, 2, 3, 3, 4, 0, 0])
assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]),
[0, 1, 4, 4, 5, -1, -1])
with pytest.raises(ValueError):
le.transform([0, 6])
@pytest.mark.parametrize("dtype", ['str', 'object'])
def test_label_encoder_str_bad_shape(dtype):
le = LabelEncoder()
le.fit(np.array(["apple", "orange"], dtype=dtype))
msg = "should be a 1d array"
with pytest.raises(ValueError, match=msg):
le.transform("apple")
def test_label_encoder_errors():
# Check that invalid arguments yield ValueError
le = LabelEncoder()
with pytest.raises(ValueError):
le.transform([])
with pytest.raises(ValueError):
le.inverse_transform([])
# Fail on unseen labels
le = LabelEncoder()
le.fit([1, 2, 3, -1, 1])
msg = "contains previously unseen labels"
with pytest.raises(ValueError, match=msg):
le.inverse_transform([-2])
with pytest.raises(ValueError, match=msg):
le.inverse_transform([-2, -3, -4])
# Fail on inverse_transform("")
msg = r"should be a 1d array.+shape \(\)"
with pytest.raises(ValueError, match=msg):
le.inverse_transform("")
@pytest.mark.parametrize(
"values",
[np.array([2, 1, 3, 1, 3], dtype='int64'),
np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
np.array(['b', 'a', 'c', 'a', 'c'])],
ids=['int64', 'object', 'str'])
def test_label_encoder_empty_array(values):
le = LabelEncoder()
le.fit(values)
# test empty transform
transformed = le.transform([])
assert_array_equal(np.array([]), transformed)
# test empty inverse transform
inverse_transformed = le.inverse_transform([])
assert_array_equal(np.array([]), inverse_transformed)
def test_sparse_output_multilabel_binarizer():
# test input as iterable of iterables
inputs = [
lambda: [(2, 3), (1,), (1, 2)],
lambda: ({2, 3}, {1}, {1, 2}),
lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
]
indicator_mat = np.array([[0, 1, 1],
[1, 0, 0],
[1, 1, 0]])
inverse = inputs[0]()
for sparse_output in [True, False]:
for inp in inputs:
# With fit_transform
mlb = MultiLabelBinarizer(sparse_output=sparse_output)
got = mlb.fit_transform(inp())
assert issparse(got) == sparse_output
if sparse_output:
# verify CSR assumption that indices and indptr have same dtype
assert got.indices.dtype == got.indptr.dtype
got = got.toarray()
assert_array_equal(indicator_mat, got)
assert_array_equal([1, 2, 3], mlb.classes_)
assert mlb.inverse_transform(got) == inverse
# With fit
mlb = MultiLabelBinarizer(sparse_output=sparse_output)
got = mlb.fit(inp()).transform(inp())
assert issparse(got) == sparse_output
if sparse_output:
# verify CSR assumption that indices and indptr have same dtype
assert got.indices.dtype == got.indptr.dtype
got = got.toarray()
assert_array_equal(indicator_mat, got)
assert_array_equal([1, 2, 3], mlb.classes_)
assert mlb.inverse_transform(got) == inverse
with pytest.raises(ValueError):
mlb.inverse_transform(csr_matrix(np.array([[0, 1, 1],
[2, 0, 0],
[1, 1, 0]])))
def test_multilabel_binarizer():
# test input as iterable of iterables
inputs = [
lambda: [(2, 3), (1,), (1, 2)],
lambda: ({2, 3}, {1}, {1, 2}),
lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
]
indicator_mat = np.array([[0, 1, 1],
[1, 0, 0],
[1, 1, 0]])
inverse = inputs[0]()
for inp in inputs:
# With fit_transform
mlb = MultiLabelBinarizer()
got = mlb.fit_transform(inp())
assert_array_equal(indicator_mat, got)
assert_array_equal([1, 2, 3], mlb.classes_)
assert mlb.inverse_transform(got) == inverse
# With fit
mlb = MultiLabelBinarizer()
got = mlb.fit(inp()).transform(inp())
assert_array_equal(indicator_mat, got)
assert_array_equal([1, 2, 3], mlb.classes_)
assert mlb.inverse_transform(got) == inverse
def test_multilabel_binarizer_empty_sample():
mlb = MultiLabelBinarizer()
y = [[1, 2], [1], []]
Y = np.array([[1, 1],
[1, 0],
[0, 0]])
assert_array_equal(mlb.fit_transform(y), Y)
def test_multilabel_binarizer_unknown_class():
mlb = MultiLabelBinarizer()
y = [[1, 2]]
Y = np.array([[1, 0], [0, 1]])
w = 'unknown class(es) [0, 4] will be ignored'
matrix = assert_warns_message(UserWarning, w,
mlb.fit(y).transform, [[4, 1], [2, 0]])
assert_array_equal(matrix, Y)
Y = np.array([[1, 0, 0], [0, 1, 0]])
mlb = MultiLabelBinarizer(classes=[1, 2, 3])
matrix = assert_warns_message(UserWarning, w,
mlb.fit(y).transform, [[4, 1], [2, 0]])
assert_array_equal(matrix, Y)
def test_multilabel_binarizer_given_classes():
inp = [(2, 3), (1,), (1, 2)]
indicator_mat = np.array([[0, 1, 1],
[1, 0, 0],
[1, 0, 1]])
# fit_transform()
mlb = MultiLabelBinarizer(classes=[1, 3, 2])
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
assert_array_equal(mlb.classes_, [1, 3, 2])
# fit().transform()
mlb = MultiLabelBinarizer(classes=[1, 3, 2])
assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
assert_array_equal(mlb.classes_, [1, 3, 2])
# ensure works with extra class
mlb = MultiLabelBinarizer(classes=[4, 1, 3, 2])
assert_array_equal(mlb.fit_transform(inp),
np.hstack(([[0], [0], [0]], indicator_mat)))
assert_array_equal(mlb.classes_, [4, 1, 3, 2])
# ensure fit is no-op as iterable is not consumed
inp = iter(inp)
mlb = MultiLabelBinarizer(classes=[1, 3, 2])
assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
# ensure a ValueError is thrown if given duplicate classes
err_msg = "The classes argument contains duplicate classes. Remove " \
"these duplicates before passing them to MultiLabelBinarizer."
mlb = MultiLabelBinarizer(classes=[1, 3, 2, 3])
with pytest.raises(ValueError, match=err_msg):
mlb.fit(inp)
def test_multilabel_binarizer_multiple_calls():
inp = [(2, 3), (1,), (1, 2)]
indicator_mat = np.array([[0, 1, 1],
[1, 0, 0],
[1, 0, 1]])
indicator_mat2 = np.array([[0, 1, 1],
[1, 0, 0],
[1, 1, 0]])
# first call
mlb = MultiLabelBinarizer(classes=[1, 3, 2])
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
# second call change class
mlb.classes = [1, 2, 3]
assert_array_equal(mlb.fit_transform(inp), indicator_mat2)
def test_multilabel_binarizer_same_length_sequence():
# Ensure sequences of the same length are not interpreted as a 2-d array
inp = [[1], [0], [2]]
indicator_mat = np.array([[0, 1, 0],
[1, 0, 0],
[0, 0, 1]])
# fit_transform()
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
# fit().transform()
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
def test_multilabel_binarizer_non_integer_labels():
tuple_classes = _to_object_array([(1,), (2,), (3,)])
inputs = [
([('2', '3'), ('1',), ('1', '2')], ['1', '2', '3']),
([('b', 'c'), ('a',), ('a', 'b')], ['a', 'b', 'c']),
([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes),
]
indicator_mat = np.array([[0, 1, 1],
[1, 0, 0],
[1, 1, 0]])
for inp, classes in inputs:
# fit_transform()
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
assert_array_equal(mlb.classes_, classes)
assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
# fit().transform()
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
assert_array_equal(mlb.classes_, classes)
assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
mlb = MultiLabelBinarizer()
with pytest.raises(TypeError):
mlb.fit_transform([({}), ({}, {'a': 'b'})])
def test_multilabel_binarizer_non_unique():
inp = [(1, 1, 1, 0)]
indicator_mat = np.array([[1, 1]])
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
def test_multilabel_binarizer_inverse_validation():
inp = [(1, 1, 1, 0)]
mlb = MultiLabelBinarizer()
mlb.fit_transform(inp)
# Not binary
with pytest.raises(ValueError):
mlb.inverse_transform(np.array([[1, 3]]))
# The following binary cases are fine, however
mlb.inverse_transform(np.array([[0, 0]]))
mlb.inverse_transform(np.array([[1, 1]]))
mlb.inverse_transform(np.array([[1, 0]]))
# Wrong shape
with pytest.raises(ValueError):
mlb.inverse_transform(np.array([[1]]))
with pytest.raises(ValueError):
mlb.inverse_transform(np.array([[1, 1, 1]]))
def test_label_binarize_with_class_order():
out = label_binarize([1, 6], classes=[1, 2, 4, 6])
expected = np.array([[1, 0, 0, 0], [0, 0, 0, 1]])
assert_array_equal(out, expected)
# Modified class order
out = label_binarize([1, 6], classes=[1, 6, 4, 2])
expected = np.array([[1, 0, 0, 0], [0, 1, 0, 0]])
assert_array_equal(out, expected)
out = label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1])
expected = np.array([[0, 0, 1, 0],
[0, 0, 0, 1],
[0, 1, 0, 0],
[1, 0, 0, 0]])
assert_array_equal(out, expected)
def check_binarized_results(y, classes, pos_label, neg_label, expected):
for sparse_output in [True, False]:
if ((pos_label == 0 or neg_label != 0) and sparse_output):
with pytest.raises(ValueError):
label_binarize(y, classes=classes, neg_label=neg_label,
pos_label=pos_label,
sparse_output=sparse_output)
continue
# check label_binarize
binarized = label_binarize(y, classes=classes, neg_label=neg_label,
pos_label=pos_label,
sparse_output=sparse_output)
assert_array_equal(toarray(binarized), expected)
assert issparse(binarized) == sparse_output
# check inverse
y_type = type_of_target(y)
if y_type == "multiclass":
inversed = _inverse_binarize_multiclass(binarized, classes=classes)
else:
inversed = _inverse_binarize_thresholding(binarized,
output_type=y_type,
classes=classes,
threshold=((neg_label +
pos_label) /
2.))
assert_array_equal(toarray(inversed), toarray(y))
# Check label binarizer
lb = LabelBinarizer(neg_label=neg_label, pos_label=pos_label,
sparse_output=sparse_output)
binarized = lb.fit_transform(y)
assert_array_equal(toarray(binarized), expected)
assert issparse(binarized) == sparse_output
inverse_output = lb.inverse_transform(binarized)
assert_array_equal(toarray(inverse_output), toarray(y))
assert issparse(inverse_output) == issparse(y)
def test_label_binarize_binary():
y = [0, 1, 0]
classes = [0, 1]
pos_label = 2
neg_label = -1
expected = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1))
check_binarized_results(y, classes, pos_label, neg_label, expected)
# Binary case where sparse_output = True will not result in a ValueError
y = [0, 1, 0]
classes = [0, 1]
pos_label = 3
neg_label = 0
expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1))
check_binarized_results(y, classes, pos_label, neg_label, expected)
def test_label_binarize_multiclass():
y = [0, 1, 2]
classes = [0, 1, 2]
pos_label = 2
neg_label = 0
expected = 2 * np.eye(3)
check_binarized_results(y, classes, pos_label, neg_label, expected)
with pytest.raises(ValueError):
label_binarize(y, classes=classes, neg_label=-1, pos_label=pos_label,
sparse_output=True)
def test_label_binarize_multilabel():
y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
classes = [0, 1, 2]
pos_label = 2
neg_label = 0
expected = pos_label * y_ind
y_sparse = [sparse_matrix(y_ind)
for sparse_matrix in [coo_matrix, csc_matrix, csr_matrix,
dok_matrix, lil_matrix]]
for y in [y_ind] + y_sparse:
check_binarized_results(y, classes, pos_label, neg_label,
expected)
with pytest.raises(ValueError):
label_binarize(y, classes=classes, neg_label=-1, pos_label=pos_label,
sparse_output=True)
def test_invalid_input_label_binarize():
with pytest.raises(ValueError):
label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1)
with pytest.raises(ValueError, match="continuous target data is not "):
label_binarize([1.2, 2.7], classes=[0, 1])
with pytest.raises(ValueError, match="mismatch with the labels"):
label_binarize([[1, 3]], classes=[1, 2, 3])
def test_inverse_binarize_multiclass():
got = _inverse_binarize_multiclass(csr_matrix([[0, 1, 0],
[-1, 0, -1],
[0, 0, 0]]),
np.arange(3))
assert_array_equal(got, np.array([1, 1, 0]))
@pytest.mark.parametrize(
"values, expected",
[(np.array([2, 1, 3, 1, 3], dtype='int64'),
np.array([1, 2, 3], dtype='int64')),
(np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
np.array(['a', 'b', 'c'], dtype=object)),
(np.array(['b', 'a', 'c', 'a', 'c']),
np.array(['a', 'b', 'c']))],
ids=['int64', 'object', 'str'])
def test_encode_util(values, expected):
uniques = _encode(values)
assert_array_equal(uniques, expected)
uniques, encoded = _encode(values, encode=True)
assert_array_equal(uniques, expected)
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
_, encoded = _encode(values, uniques, encode=True)
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
def test_encode_check_unknown():
# test for the check_unknown parameter of _encode()
uniques = np.array([1, 2, 3])
values = np.array([1, 2, 3, 4])
# Default is True, raise error
with pytest.raises(ValueError,
match='y contains previously unseen labels'):
_encode(values, uniques, encode=True, check_unknown=True)
# dont raise error if False
_encode(values, uniques, encode=True, check_unknown=False)
# parameter is ignored for object dtype
uniques = np.array(['a', 'b', 'c'], dtype=object)
values = np.array(['a', 'b', 'c', 'd'], dtype=object)
with pytest.raises(ValueError,
match='y contains previously unseen labels'):
_encode(values, uniques, encode=True, check_unknown=False)